如何在 R 中根据另一个变量更新某个变量的信息?
How to update a variable's information depending on another variable in R?
df_input
是我拥有的数据框,我想将其转换为 df_output
.
例如,2001-2003 年的 assembly 均为 1,而 2001 年出现了一个赢家。这意味着只要 assembly 不变,这个赢家就一直保持有效。
# Sample input: one row per year. `winner` and `party` are only filled in on
# the year a winner appears; every other row holds the placeholder 0 (stored
# as the string "0" in the character column `party`).
df_input <- data.frame(
  winner = c(1, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0),
  party = c("A", "0", "0", "0", "B", "0", "0", "0", "C", "0", "0", "0", "0"),
  assembly = rep(c(1, 2, 3, 4), times = c(3, 3, 4, 3)),
  year = seq(2001, 2013, by = 1)
)

# Desired result: within each assembly, the winner (and its party) is carried
# forward from the year it first appears; earlier rows keep the placeholder.
df_output <- data.frame(
  winner = c(1, 1, 1, 0, 2, 2, 0, 0, 1, 1, 0, 0, 0),
  party = c("A", "A", "A", "0", "B", "B", "0", "0", "C", "C", "0", "0", "0"),
  assembly = rep(c(1, 2, 3, 4), times = c(3, 3, 4, 3)),
  year = seq(2001, 2013, by = 1)
)
如何根据 assembly 列更新 winner 列中的信息,使同一 assembly 内的取值保持一致?
编辑:如果还有一个额外的字符串变量 “party” 该怎么办?见下文:
执行此代码后出现以下错误:
# Asker's failing attempt, reproduced as posted; it does not parse.
# `df_input$party = ...` is used as an argument name in mutate() and list():
# R only accepts a bare symbol or a string to the left of `=` inside a call,
# which is what the "unexpected '='" error quoted after this snippet reports.
# NOTE(review): presumably the column should be referenced by its bare name
# (`party`), the way `winner` is in the tidyr::fill answer - confirm.
# NOTE(review): `party` is character and uses "0" (not NA) as its placeholder
# in df_input, so is.na() / NA_real_ look mismatched here as well - confirm.
df_output <- df_input %>%
mutate(df_input$party = if_else(is.na(df_input$party)==FALSE, df_input$party, NA_real_)) %>%
group_by(assembly) %>%
fill(df_input$party) %>%
ungroup() %>%
replace_na(list(df_input$party = 0))
错误:
Error: unexpected '=' in:
" ungroup() %>%
replace_na(list(df_input$party ="
一种选择是像这样使用 tidyr::fill
:
library(dplyr)
library(tidyr)
# Carry the winner forward inside each assembly:
#   1. recode "no winner" (anything not > 0) as NA so fill() treats it as missing,
#   2. fill() copies the last seen winner downwards within the group,
#   3. rows before the first winner are still NA and are set back to 0.
df_input %>%
  group_by(assembly) %>%
  mutate(winner = if_else(winner > 0, winner, NA_real_)) %>%
  fill(winner, .direction = "down") %>%
  ungroup() %>%
  mutate(winner = replace_na(winner, 0))
#> # A tibble: 13 × 3
#> winner assembly year
#> <dbl> <dbl> <dbl>
#> 1 1 1 2001
#> 2 1 1 2002
#> 3 1 1 2003
#> 4 0 2 2004
#> 5 2 2 2005
#> 6 2 2 2006
#> 7 0 3 2007
#> 8 0 3 2008
#> 9 1 3 2009
#> 10 1 3 2010
#> 11 0 4 2011
#> 12 0 4 2012
#> 13 0 4 2013
这是使用 cumsum 和 ave 的 base R 方法。
请注意 R 4.1.0 中引入的新 lambda 函数 \(x)
的使用。如果出现错误,请使用较旧的 function(x)
.
# Base R: ave() applies the function to `winner` separately for each assembly.
# `seen_winner` is FALSE until the first non-zero winner and TRUE from that row
# on; all TRUE rows are overwritten with that first non-zero value. Groups
# without any winner are returned unchanged.
with(df_input, ave(winner, assembly, FUN = \(x) {
  seen_winner <- cumsum(x != 0) > 0
  if (any(seen_winner)) {
    x[seen_winner] <- x[which(seen_winner)[1]]
  }
  x
}))
# [1] 1 1 1 0 2 2 0 0 1 1 0 0 0
只需将结果赋值回 winner 列。
# Work on a copy so df_input itself stays unchanged, then overwrite `winner`
# with the per-assembly carried-forward values computed by ave().
df_output <- df_input
df_output[["winner"]] <- ave(
  df_output[["winner"]],
  df_output[["assembly"]],
  FUN = \(x) {
    # TRUE from the first non-zero winner of the group onwards.
    seen_winner <- cumsum(x != 0) > 0
    if (any(seen_winner)) {
      x[seen_winner] <- x[which(seen_winner)[1]]
    }
    x
  }
)
编辑
按照评论中的建议,这是更简单的 cummax 解决方案。
# Running maximum of `winner` within each assembly. This reproduces the
# carry-forward result only while winner codes are non-negative and no larger
# code appears later in the same assembly, which holds for the sample data.
ave(df_input$winner, df_input$assembly, FUN = cummax)
更新:见评论:
library(dplyr)
# Within each assembly, every row from the first non-zero winner onwards gets
# that winner; rows before it (and groups without any winner) keep their value.
#
# The earlier lag()-based rule only carried a winner forward by a single row,
# so a group such as 0, 1, 0, 0 ended up as 0, 1, 1, 0. Locating the first
# non-zero winner once per group handles any number of trailing rows.
df_input %>%
  group_by(assembly) %>%
  mutate(
    winner = if_else(
      # TRUE from the first non-zero winner of the group onwards.
      cumsum(winner != 0) > 0,
      # First non-zero winner; NA when the group has none, in which case the
      # condition is FALSE everywhere and this value is never used.
      winner[match(TRUE, winner != 0)],
      winner
    )
  ) %>%
  # Drop the grouping so later verbs do not silently operate per assembly.
  ungroup()
winner assembly year
<dbl> <dbl> <dbl>
1 1 1 2001
2 1 1 2002
3 1 1 2003
4 0 2 2004
5 2 2 2005
6 2 2 2006
7 0 3 2007
8 0 3 2008
9 1 3 2009
10 1 3 2010
11 0 4 2011
12 0 4 2012
13 0 4 2013
第一个答案(不考虑第3行)
我们可以在按 assembly
分组后使用 lag
函数
library(dplyr)
# First attempt: within each assembly, a row takes the previous row's winner
# when that previous value is larger than its own. lag() looks at the original
# column, so a winner is carried forward by one row only (row 3 stays 0 in the
# output shown for this snippet).
df_input %>%
  group_by(assembly) %>%
  mutate(
    winner = ifelse(
      winner < lag(winner, default = 0),
      lag(winner),
      winner
    )
  )
Groups: assembly [4]
winner assembly year
<dbl> <dbl> <dbl>
1 1 1 2001
2 1 1 2002
3 0 1 2003
4 0 2 2004
5 2 2 2005
6 2 2 2006
7 0 3 2007
8 0 3 2008
9 1 3 2009
10 1 3 2010
11 0 4 2011
12 0 4 2012
13 0 4 2013
df_input
是我拥有的数据框,我想将其转换为 df_output
.
例如,2001-2003 年的 assembly 均为 1,而 2001 年出现了一个赢家。这意味着只要 assembly 不变,这个赢家就一直保持有效。
# Sample input: one row per year. `winner` and `party` are only filled in on
# the year a winner appears; every other row holds the placeholder 0 (stored
# as the string "0" in the character column `party`).
df_input <- data.frame(
  winner = c(1, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0),
  party = c("A", "0", "0", "0", "B", "0", "0", "0", "C", "0", "0", "0", "0"),
  assembly = rep(c(1, 2, 3, 4), times = c(3, 3, 4, 3)),
  year = seq(2001, 2013, by = 1)
)

# Desired result: within each assembly, the winner (and its party) is carried
# forward from the year it first appears; earlier rows keep the placeholder.
df_output <- data.frame(
  winner = c(1, 1, 1, 0, 2, 2, 0, 0, 1, 1, 0, 0, 0),
  party = c("A", "A", "A", "0", "B", "B", "0", "0", "C", "C", "0", "0", "0"),
  assembly = rep(c(1, 2, 3, 4), times = c(3, 3, 4, 3)),
  year = seq(2001, 2013, by = 1)
)
如何根据 assembly 列更新 winner 列中的信息,使同一 assembly 内的取值保持一致?
编辑:如果还有一个额外的字符串变量 “party” 该怎么办?见下文:
执行此代码后出现以下错误:
# Asker's failing attempt, reproduced as posted; it does not parse.
# `df_input$party = ...` is used as an argument name in mutate() and list():
# R only accepts a bare symbol or a string to the left of `=` inside a call,
# which is what the "unexpected '='" error quoted after this snippet reports.
# NOTE(review): presumably the column should be referenced by its bare name
# (`party`), the way `winner` is in the tidyr::fill answer - confirm.
# NOTE(review): `party` is character and uses "0" (not NA) as its placeholder
# in df_input, so is.na() / NA_real_ look mismatched here as well - confirm.
df_output <- df_input %>%
mutate(df_input$party = if_else(is.na(df_input$party)==FALSE, df_input$party, NA_real_)) %>%
group_by(assembly) %>%
fill(df_input$party) %>%
ungroup() %>%
replace_na(list(df_input$party = 0))
错误:
Error: unexpected '=' in:
" ungroup() %>%
replace_na(list(df_input$party ="
一种选择是像这样使用 tidyr::fill
:
library(dplyr)
library(tidyr)
# Carry the winner forward inside each assembly:
#   1. recode "no winner" (anything not > 0) as NA so fill() treats it as missing,
#   2. fill() copies the last seen winner downwards within the group,
#   3. rows before the first winner are still NA and are set back to 0.
df_input %>%
  group_by(assembly) %>%
  mutate(winner = if_else(winner > 0, winner, NA_real_)) %>%
  fill(winner, .direction = "down") %>%
  ungroup() %>%
  mutate(winner = replace_na(winner, 0))
#> # A tibble: 13 × 3
#> winner assembly year
#> <dbl> <dbl> <dbl>
#> 1 1 1 2001
#> 2 1 1 2002
#> 3 1 1 2003
#> 4 0 2 2004
#> 5 2 2 2005
#> 6 2 2 2006
#> 7 0 3 2007
#> 8 0 3 2008
#> 9 1 3 2009
#> 10 1 3 2010
#> 11 0 4 2011
#> 12 0 4 2012
#> 13 0 4 2013
这是使用 cumsum 和 ave 的 base R 方法。
请注意 R 4.1.0 中引入的新 lambda 函数 \(x)
的使用。如果出现错误,请使用较旧的 function(x)
.
# Base R: ave() applies the function to `winner` separately for each assembly.
# `seen_winner` is FALSE until the first non-zero winner and TRUE from that row
# on; all TRUE rows are overwritten with that first non-zero value. Groups
# without any winner are returned unchanged.
with(df_input, ave(winner, assembly, FUN = \(x) {
  seen_winner <- cumsum(x != 0) > 0
  if (any(seen_winner)) {
    x[seen_winner] <- x[which(seen_winner)[1]]
  }
  x
}))
# [1] 1 1 1 0 2 2 0 0 1 1 0 0 0
只需将结果赋值回 winner 列。
# Work on a copy so df_input itself stays unchanged, then overwrite `winner`
# with the per-assembly carried-forward values computed by ave().
df_output <- df_input
df_output[["winner"]] <- ave(
  df_output[["winner"]],
  df_output[["assembly"]],
  FUN = \(x) {
    # TRUE from the first non-zero winner of the group onwards.
    seen_winner <- cumsum(x != 0) > 0
    if (any(seen_winner)) {
      x[seen_winner] <- x[which(seen_winner)[1]]
    }
    x
  }
)
编辑
按照评论中的建议,这是更简单的 cummax 解决方案。
# Running maximum of `winner` within each assembly. This reproduces the
# carry-forward result only while winner codes are non-negative and no larger
# code appears later in the same assembly, which holds for the sample data.
ave(df_input$winner, df_input$assembly, FUN = cummax)
更新:见评论:
library(dplyr)
# Within each assembly, every row from the first non-zero winner onwards gets
# that winner; rows before it (and groups without any winner) keep their value.
#
# The earlier lag()-based rule only carried a winner forward by a single row,
# so a group such as 0, 1, 0, 0 ended up as 0, 1, 1, 0. Locating the first
# non-zero winner once per group handles any number of trailing rows.
df_input %>%
  group_by(assembly) %>%
  mutate(
    winner = if_else(
      # TRUE from the first non-zero winner of the group onwards.
      cumsum(winner != 0) > 0,
      # First non-zero winner; NA when the group has none, in which case the
      # condition is FALSE everywhere and this value is never used.
      winner[match(TRUE, winner != 0)],
      winner
    )
  ) %>%
  # Drop the grouping so later verbs do not silently operate per assembly.
  ungroup()
winner assembly year
<dbl> <dbl> <dbl>
1 1 1 2001
2 1 1 2002
3 1 1 2003
4 0 2 2004
5 2 2 2005
6 2 2 2006
7 0 3 2007
8 0 3 2008
9 1 3 2009
10 1 3 2010
11 0 4 2011
12 0 4 2012
13 0 4 2013
第一个答案(不考虑第3行)
我们可以在按 assembly 分组后使用 lag 函数
library(dplyr)
# First attempt: within each assembly, a row takes the previous row's winner
# when that previous value is larger than its own. lag() looks at the original
# column, so a winner is carried forward by one row only (row 3 stays 0 in the
# output shown for this snippet).
df_input %>%
  group_by(assembly) %>%
  mutate(
    winner = ifelse(
      winner < lag(winner, default = 0),
      lag(winner),
      winner
    )
  )
Groups: assembly [4]
winner assembly year
<dbl> <dbl> <dbl>
1 1 1 2001
2 1 1 2002
3 0 1 2003
4 0 2 2004
5 2 2 2005
6 2 2 2006
7 0 3 2007
8 0 3 2008
9 1 3 2009
10 1 3 2010
11 0 4 2011
12 0 4 2012
13 0 4 2013