从不同变量匹配的行的另一列获取数据
Getting data from another column from a row where a different variable matches
最小工作示例:
library(dplyr)
df = data.frame(group_id = c("G1","G1", rep("G2",8)),
prod_id = c(1,2,5,6,7,8,9,10,11,12),
prod_type = rep(c("a","a", "b", "c","d"),2),
start = lubridate::dmy(c("01/01/2001", "02/02/2002",
"05/05/2005","06/06/2006", "07/07/2007", "08/08/2008", "09/09/2009","01/01/2010", "02/02/2011", "03/03/2012" )))%>%
group_by(group_id) %>%
mutate(next_acd = if_else(lead(prod_type) %in% c("a","c","d"), lead(prod_id), prod_id[NA])) %>%
tidyr::fill(next_acd, .direction = "up")
df:
group_id prod_id prod_type start next_acd
<chr> <dbl> <chr> <date> <dbl>
1 G1 1 a 2001-01-01 2
2 G1 2 a 2002-02-02 NA
3 G2 5 b 2005-05-05 6
4 G2 6 c 2006-06-06 7
5 G2 7 d 2007-07-07 8
6 G2 8 a 2008-08-08 9
7 G2 9 a 2009-09-09 11
8 G2 10 b 2010-01-01 11
9 G2 11 c 2011-02-02 12
10 G2 12 d 2012-03-03 NA
其中 next_acd
是该组中的下一个 prod_id
,即 prod_type
“a”、“c”或“d”。
我现在想创建一个新列 fin
,它是下一个 prod_type
“a”、“c”或“d”的 start
日期。 IE。我想匹配 next_acd
到 prod_id
并得到相应的 start
.
我试过另一个问题给出的答案
df$fin <- df$start[df$prod_id[df$next_acd]]
df:
group_id prod_id prod_type start next_acd fin
<chr> <dbl> <chr> <date> <dbl> <date>
1 G1 1 a 2001-01-01 2 2002-02-02
2 G1 2 a 2002-02-02 NA NA
3 G2 5 b 2005-05-05 6 2010-01-01
4 G2 6 c 2006-06-06 7 2011-02-02
5 G2 7 d 2007-07-07 8 2012-03-03
6 G2 8 a 2008-08-08 9 NA
7 G2 9 a 2009-09-09 11 NA
8 G2 10 b 2010-01-01 11 NA
9 G2 11 c 2011-02-02 12 NA
10 G2 12 d 2012-03-03 NA NA
我不确定这里发生了什么。 fin
仅在第 1 行正确输入。
输出应该是这样的:
group_id prod_id prod_type start fin next_acd
<chr> <dbl> <chr> <date> <date> <dbl>
1 G1 1 a 2001-01-01 2002-02-02 2
2 G1 2 a 2002-02-02 NA NA
3 G2 5 b 2005-05-05 2006-06-06 6
4 G2 6 c 2006-06-06 2007-07-07 7
5 G2 7 d 2007-07-07 2008-08-08 8
6 G2 8 a 2008-08-08 2009-09-09 9
7 G2 9 a 2009-09-09 2011-02-02 11
8 G2 10 b 2010-01-01 2011-02-02 11
9 G2 11 c 2011-02-02 2012-03-03 12
10 G2 12 d 2012-03-03 NA NA
根据使用 merge
的建议:
df1= df[,c("prod_id","start")]%>%rename(fin = start)
df2= df[,c("group_id", "prod_type","prod_id", "start","next_acd")]
left_join(df2, df1, by= c("next_acd"="prod_id"))
group_id prod_type prod_id start next_acd fin
<chr> <chr> <dbl> <date> <dbl> <date>
1 G1 a 1 2001-01-01 2 2002-02-02
2 G1 a 2 2002-02-02 NA NA
3 G2 b 5 2005-05-05 6 2006-06-06
4 G2 c 6 2006-06-06 7 2007-07-07
5 G2 d 7 2007-07-07 8 2008-08-08
6 G2 a 8 2008-08-08 9 2009-09-09
7 G2 a 9 2009-09-09 11 2011-02-02
8 G2 b 10 2010-01-01 11 2011-02-02
9 G2 c 11 2011-02-02 12 2012-03-03
10 G2 d 12 2012-03-03 NA NA
一个 tidyverse 选项:
library(tidyverse)
df <- data.frame(
group_id = c("G1", "G1", rep("G2", 8)),
prod_id = c(1, 2, 5, 6, 7, 8, 9, 10, 11, 12),
prod_type = rep(c("a", "a", "b", "c", "d"), 2),
start = lubridate::dmy(c(
"01/01/2001", "02/02/2002",
"05/05/2005", "06/06/2006", "07/07/2007", "08/08/2008", "09/09/2009", "01/01/2010", "02/02/2011", "03/03/2012"
))
) %>%
group_by(group_id) %>%
mutate(next_acd = if_else(lead(prod_type) %in% c("a", "c", "d"), lead(prod_id), prod_id[NA])) %>%
fill(next_acd, .direction = "up")
df %>%
group_by(group_id) %>%
mutate(fin = start[match(next_acd, prod_id, nomatch = NA_real_)])
#> # A tibble: 10 × 6
#> # Groups: group_id [2]
#> group_id prod_id prod_type start next_acd fin
#> <chr> <dbl> <chr> <date> <dbl> <date>
#> 1 G1 1 a 2001-01-01 2 2002-02-02
#> 2 G1 2 a 2002-02-02 NA NA
#> 3 G2 5 b 2005-05-05 6 2006-06-06
#> 4 G2 6 c 2006-06-06 7 2007-07-07
#> 5 G2 7 d 2007-07-07 8 2008-08-08
#> 6 G2 8 a 2008-08-08 9 2009-09-09
#> 7 G2 9 a 2009-09-09 11 2011-02-02
#> 8 G2 10 b 2010-01-01 11 2011-02-02
#> 9 G2 11 c 2011-02-02 12 2012-03-03
#> 10 G2 12 d 2012-03-03 NA NA
由 reprex package (v2.0.1)
于 2022-05-06 创建
最小工作示例:
library(dplyr)
df = data.frame(group_id = c("G1","G1", rep("G2",8)),
prod_id = c(1,2,5,6,7,8,9,10,11,12),
prod_type = rep(c("a","a", "b", "c","d"),2),
start = lubridate::dmy(c("01/01/2001", "02/02/2002",
"05/05/2005","06/06/2006", "07/07/2007", "08/08/2008", "09/09/2009","01/01/2010", "02/02/2011", "03/03/2012" )))%>%
group_by(group_id) %>%
mutate(next_acd = if_else(lead(prod_type) %in% c("a","c","d"), lead(prod_id), prod_id[NA])) %>%
tidyr::fill(next_acd, .direction = "up")
df:
group_id prod_id prod_type start next_acd
<chr> <dbl> <chr> <date> <dbl>
1 G1 1 a 2001-01-01 2
2 G1 2 a 2002-02-02 NA
3 G2 5 b 2005-05-05 6
4 G2 6 c 2006-06-06 7
5 G2 7 d 2007-07-07 8
6 G2 8 a 2008-08-08 9
7 G2 9 a 2009-09-09 11
8 G2 10 b 2010-01-01 11
9 G2 11 c 2011-02-02 12
10 G2 12 d 2012-03-03 NA
其中 next_acd
是该组中的下一个 prod_id
,即 prod_type
“a”、“c”或“d”。
我现在想创建一个新列 fin
,它是下一个 prod_type
“a”、“c”或“d”的 start
日期。 IE。我想匹配 next_acd
到 prod_id
并得到相应的 start
.
我试过另一个问题给出的答案
df$fin <- df$start[df$prod_id[df$next_acd]]
df:
group_id prod_id prod_type start next_acd fin
<chr> <dbl> <chr> <date> <dbl> <date>
1 G1 1 a 2001-01-01 2 2002-02-02
2 G1 2 a 2002-02-02 NA NA
3 G2 5 b 2005-05-05 6 2010-01-01
4 G2 6 c 2006-06-06 7 2011-02-02
5 G2 7 d 2007-07-07 8 2012-03-03
6 G2 8 a 2008-08-08 9 NA
7 G2 9 a 2009-09-09 11 NA
8 G2 10 b 2010-01-01 11 NA
9 G2 11 c 2011-02-02 12 NA
10 G2 12 d 2012-03-03 NA NA
我不确定这里发生了什么。 fin
仅在第 1 行正确输入。
输出应该是这样的:
group_id prod_id prod_type start fin next_acd
<chr> <dbl> <chr> <date> <date> <dbl>
1 G1 1 a 2001-01-01 2002-02-02 2
2 G1 2 a 2002-02-02 NA NA
3 G2 5 b 2005-05-05 2006-06-06 6
4 G2 6 c 2006-06-06 2007-07-07 7
5 G2 7 d 2007-07-07 2008-08-08 8
6 G2 8 a 2008-08-08 2009-09-09 9
7 G2 9 a 2009-09-09 2011-02-02 11
8 G2 10 b 2010-01-01 2011-02-02 11
9 G2 11 c 2011-02-02 2012-03-03 12
10 G2 12 d 2012-03-03 NA NA
根据使用 merge
的建议:
df1= df[,c("prod_id","start")]%>%rename(fin = start)
df2= df[,c("group_id", "prod_type","prod_id", "start","next_acd")]
left_join(df2, df1, by= c("next_acd"="prod_id"))
group_id prod_type prod_id start next_acd fin
<chr> <chr> <dbl> <date> <dbl> <date>
1 G1 a 1 2001-01-01 2 2002-02-02
2 G1 a 2 2002-02-02 NA NA
3 G2 b 5 2005-05-05 6 2006-06-06
4 G2 c 6 2006-06-06 7 2007-07-07
5 G2 d 7 2007-07-07 8 2008-08-08
6 G2 a 8 2008-08-08 9 2009-09-09
7 G2 a 9 2009-09-09 11 2011-02-02
8 G2 b 10 2010-01-01 11 2011-02-02
9 G2 c 11 2011-02-02 12 2012-03-03
10 G2 d 12 2012-03-03 NA NA
一个 tidyverse 选项:
library(tidyverse)
df <- data.frame(
group_id = c("G1", "G1", rep("G2", 8)),
prod_id = c(1, 2, 5, 6, 7, 8, 9, 10, 11, 12),
prod_type = rep(c("a", "a", "b", "c", "d"), 2),
start = lubridate::dmy(c(
"01/01/2001", "02/02/2002",
"05/05/2005", "06/06/2006", "07/07/2007", "08/08/2008", "09/09/2009", "01/01/2010", "02/02/2011", "03/03/2012"
))
) %>%
group_by(group_id) %>%
mutate(next_acd = if_else(lead(prod_type) %in% c("a", "c", "d"), lead(prod_id), prod_id[NA])) %>%
fill(next_acd, .direction = "up")
df %>%
group_by(group_id) %>%
mutate(fin = start[match(next_acd, prod_id, nomatch = NA_real_)])
#> # A tibble: 10 × 6
#> # Groups: group_id [2]
#> group_id prod_id prod_type start next_acd fin
#> <chr> <dbl> <chr> <date> <dbl> <date>
#> 1 G1 1 a 2001-01-01 2 2002-02-02
#> 2 G1 2 a 2002-02-02 NA NA
#> 3 G2 5 b 2005-05-05 6 2006-06-06
#> 4 G2 6 c 2006-06-06 7 2007-07-07
#> 5 G2 7 d 2007-07-07 8 2008-08-08
#> 6 G2 8 a 2008-08-08 9 2009-09-09
#> 7 G2 9 a 2009-09-09 11 2011-02-02
#> 8 G2 10 b 2010-01-01 11 2011-02-02
#> 9 G2 11 c 2011-02-02 12 2012-03-03
#> 10 G2 12 d 2012-03-03 NA NA
由 reprex package (v2.0.1)
于 2022-05-06 创建