在每个 ID 有多个观察值的 df 中,如何根据另一个变量有条件地查找日期?
In a df with multiple observations for each ID, how to conditionally find date according to another variable?
这是我第一次在这里提问,希望我的提问方式没有问题!
我有一个包含数百万个观察值的数据集。每一行都是不同人在不同日期拿到的药物处方,每个人在数据框中出现多次。
library(dplyr)

# Reproducible toy data: 40 prescriptions spread over up to 10 patient IDs.
set.seed(42)
ID <- sample(
  c("ID1", "ID2", "ID3", "ID4", "ID5", "ID6", "ID7", "ID8", "ID9", "ID10"),
  40,
  replace = TRUE
)
# 40 distinct first-of-month dates drawn from 1999-01-01 .. 2010-01-01.
prescription_date <- sample(
  seq(as.Date("1999/01/01"), as.Date("2010/01/01"), by = "month"),
  40
)
# 1 = drug changed relative to the previous prescription, 0 = unchanged.
switch <- sample(c(0, 1), 40, replace = TRUE, prob = c(0.4, 0.6))

# One row per prescription, grouped by patient.
df <- data.frame(ID, prescription_date, switch) %>%
  group_by(ID)

# Rows are ordered by ID only; dates inside each ID stay unsorted here.
df %>%
  arrange(ID) %>%
  print(n = 40)
#> # A tibble: 40 x 3
#> # Groups: ID [10]
#> ID prescription_date switch
#> <fct> <date> <dbl>
#> 1 ID1 2007-03-01 1
#> 2 ID1 1999-06-01 0
#> 3 ID1 1999-02-01 1
#> 4 ID1 2006-09-01 0
#> 5 ID10 2008-08-01 0
#> 6 ID10 2000-09-01 1
#> 7 ID10 2001-09-01 1
#> 8 ID10 2001-11-01 1
#> 9 ID10 2000-04-01 1
#> 10 ID10 2004-09-01 1
#> 11 ID2 2008-10-01 1
#> 12 ID2 2003-01-01 0
#> 13 ID2 2005-12-01 0
#> 14 ID2 2000-06-01 0
#> 15 ID3 2007-07-01 1
#> 16 ID3 2007-11-01 0
#> 17 ID4 1999-03-01 1
#> 18 ID4 2003-10-01 0
#> 19 ID4 1999-05-01 1
#> 20 ID4 2007-10-01 1
#> 21 ID4 2005-04-01 0
#> 22 ID4 2009-05-01 1
#> 23 ID4 2005-10-01 0
#> 24 ID4 2003-07-01 0
#> 25 ID5 2008-06-01 1
#> 26 ID5 2002-04-01 1
#> 27 ID5 2005-01-01 0
#> 28 ID5 2001-05-01 0
#> 29 ID5 2009-09-01 1
#> 30 ID6 2006-08-01 0
#> 31 ID6 2000-12-01 0
#> 32 ID7 2007-06-01 0
#> 33 ID8 2008-11-01 1
#> 34 ID8 1999-09-01 0
#> 35 ID8 2007-05-01 0
#> 36 ID8 2009-03-01 1
#> 37 ID9 2009-10-01 0
#> 38 ID9 1999-10-01 1
#> 39 ID9 2007-04-01 0
#> 40 ID9 2008-01-01 0
由 reprex package (v0.3.0)
于 2021-06-19 创建
变量“switch”表示个体在该次处方中是否相对于上一次处方更换了药物。我需要知道每个人第三次更换药物的日期。但是,我遇到了困难,因为我似乎无法针对每条观察值计算变量“switch”的累计和。如果能创建出类似下面这样的结果就足够了:
#> # A tibble: 40 x 3
#> # Groups: ID [10]
#> ID prescription_date switch date3switch
#> <fct> <date> <dbl> <dbl>
#> 1 ID1 1999-02-01 1 1
#> 2 ID1 1999-06-01 0 NA
#> 3 ID1 2006-09-01 0 NA
#> 4 ID1 2007-03-01 1 2
#> 5 ID10 2000-04-01 1 1
#> 6 ID10 2000-09-01 1 2
#> 7 ID10 2001-09-01 1 3
#> 8 ID10 2001-11-01 1 4
#> 9 ID10 2004-09-01 1 5
#> 10 ID10 2008-08-01 0 NA
#> 11 ID2 2000-06-01 0 NA
#> 12 ID2 2003-01-01 0 NA
#> 13 ID2 2005-12-01 0 NA
#> 14 ID2 2008-10-01 1 1
#> 15 ID3 2007-07-01 1 1
#> 16 ID3 2007-11-01 0 NA
#> 17 ID4 1999-03-01 1 1
#> 18 ID4 1999-05-01 1 2
#> 19 ID4 2003-07-01 0 NA
#> 20 ID4 2003-10-01 0 NA
#> 21 ID4 2005-04-01 0 NA
#> 22 ID4 2005-10-01 0 NA
#> 23 ID4 2007-10-01 1 3
#> 24 ID4 2009-05-01 1 4
我尝试写了一个 for 循环,但这对我这个初学者来说可能太高级了,因为我最终只得到了一个 NULL 数据框……
# Asker's failed attempt, kept exactly as posted. Why it cannot work:
# - A `for` loop evaluates to NULL, so `df <- for (...)` overwrites `df` with NULL.
# - The loop index `i` is never used in the body, so every iteration is identical.
# - `data` is not defined in this snippet (the data frame is called `df`), and
#   sum() totals the whole column instead of keeping a running count per ID.
# - mutate() is given no data frame, uses `==` (comparison) where `=` is needed,
#   and refers to `prescribed_date` although the column is `prescription_date`.
df <- for (i in 1:dim(df)[1]) {
if(sum(data$switch) == 3)
{ mutate(date3switch == prescribed_date)}
else NA
}
由 reprex package (v0.3.0)
于 2021-06-19 创建
感谢您的帮助!
使用 cumsum 会有所帮助,然后将 switch == 0 的行对应的值替换为 NA。
library(dplyr)

# Number each patient's drug switches in chronological order.
df %>%
  # Sort by date within ID: cumsum() counts rows in the order they appear,
  # so sorting by ID alone would number the switches out of date order.
  arrange(ID, prescription_date) %>%
  group_by(ID) %>%
  mutate(
    # Running count of switches so far for this patient.
    date3switch = cumsum(switch),
    # Keep the count only on rows that are themselves a switch.
    date3switch = replace(date3switch, switch == 0, NA)
  ) %>%
  ungroup()
# ID prescription_date switch date3switch
# <chr> <date> <dbl> <dbl>
# 1 ID1 1999-02-01 1 1
# 2 ID1 1999-06-01 0 NA
# 3 ID1 2006-09-01 0 NA
# 4 ID1 2007-03-01 1 2
# 5 ID10 2000-04-01 1 1
# 6 ID10 2000-09-01 1 2
# 7 ID10 2001-09-01 1 3
# 8 ID10 2001-11-01 1 4
# 9 ID10 2004-09-01 1 5
#10 ID10 2008-08-01 0 NA
# … with 30 more rows
library(dplyr)

# Rebuild the question's toy data so this example runs on its own.
set.seed(42)
ID <- sample(
  c("ID1", "ID2", "ID3", "ID4", "ID5", "ID6", "ID7", "ID8", "ID9", "ID10"),
  40,
  replace = TRUE
)
prescription_date <- sample(
  seq(as.Date("1999/01/01"), as.Date("2010/01/01"), by = "month"),
  40
)
switch <- sample(c(0, 1), 40, replace = TRUE, prob = c(0.4, 0.6))
df <- data.frame(ID, prescription_date, switch) %>%
  group_by(ID)

df %>%
  group_by(ID) %>%
  # .by_group = TRUE sorts by ID first, then by date inside each ID.
  arrange(prescription_date, .by_group = TRUE) %>%
  # Running switch count per ID, shown only on rows where a switch happened.
  mutate(switch2 = ifelse(switch == 0, NA, cumsum(switch))) %>%
  print(n = 40)
#> # A tibble: 40 x 4
#> # Groups: ID [10]
#> ID prescription_date switch switch2
#> <chr> <date> <dbl> <dbl>
#> 1 ID1 1999-02-01 1 1
#> 2 ID1 1999-06-01 0 NA
#> 3 ID1 2006-09-01 0 NA
#> 4 ID1 2007-03-01 1 2
#> 5 ID10 2000-04-01 1 1
#> 6 ID10 2000-09-01 1 2
#> 7 ID10 2001-09-01 1 3
#> 8 ID10 2001-11-01 1 4
#> 9 ID10 2004-09-01 1 5
#> 10 ID10 2008-08-01 0 NA
#> 11 ID2 2000-06-01 0 NA
#> 12 ID2 2003-01-01 0 NA
#> 13 ID2 2005-12-01 0 NA
#> 14 ID2 2008-10-01 1 1
#> 15 ID3 2007-07-01 1 1
#> 16 ID3 2007-11-01 0 NA
#> 17 ID4 1999-03-01 1 1
#> 18 ID4 1999-05-01 1 2
#> 19 ID4 2003-07-01 0 NA
#> 20 ID4 2003-10-01 0 NA
#> 21 ID4 2005-04-01 0 NA
#> 22 ID4 2005-10-01 0 NA
#> 23 ID4 2007-10-01 1 3
#> 24 ID4 2009-05-01 1 4
#> 25 ID5 2001-05-01 0 NA
#> 26 ID5 2002-04-01 1 1
#> 27 ID5 2005-01-01 0 NA
#> 28 ID5 2008-06-01 1 2
#> 29 ID5 2009-09-01 1 3
#> 30 ID6 2000-12-01 0 NA
#> 31 ID6 2006-08-01 0 NA
#> 32 ID7 2007-06-01 0 NA
#> 33 ID8 1999-09-01 0 NA
#> 34 ID8 2007-05-01 0 NA
#> 35 ID8 2008-11-01 1 1
#> 36 ID8 2009-03-01 1 2
#> 37 ID9 1999-10-01 1 1
#> 38 ID9 2007-04-01 0 NA
#> 39 ID9 2008-01-01 0 NA
#> 40 ID9 2009-10-01 0 NA
由 reprex package (v2.0.0)
创建于 2021-06-19
我们也可以使用 na_if:
library(dplyr)

df %>%
  # Sort by date within ID so the running count follows chronological order.
  arrange(ID, prescription_date) %>%
  group_by(ID) %>%
  # cumsum(switch) * switch is 0 on non-switch rows and the running switch
  # count on switch rows, so na_if(., 0) blanks exactly the non-switch rows.
  # (na_if(cumsum(switch), 0) alone would only blank rows before the first
  # switch and leave the count on later non-switch rows.)
  mutate(date3switch = na_if(cumsum(switch) * switch, 0))
这是我第一次在这里提问,希望我的提问方式没有问题!
我有一个包含数百万个观察值的数据集。每一行都是不同人在不同日期拿到的药物处方,每个人在数据框中出现多次。
library(dplyr)

# Reproducible toy data: 40 prescriptions spread over up to 10 patient IDs.
set.seed(42)
ID <- sample(
  c("ID1", "ID2", "ID3", "ID4", "ID5", "ID6", "ID7", "ID8", "ID9", "ID10"),
  40,
  replace = TRUE
)
# 40 distinct first-of-month dates drawn from 1999-01-01 .. 2010-01-01.
prescription_date <- sample(
  seq(as.Date("1999/01/01"), as.Date("2010/01/01"), by = "month"),
  40
)
# 1 = drug changed relative to the previous prescription, 0 = unchanged.
switch <- sample(c(0, 1), 40, replace = TRUE, prob = c(0.4, 0.6))

# One row per prescription, grouped by patient.
df <- data.frame(ID, prescription_date, switch) %>%
  group_by(ID)

# Rows are ordered by ID only; dates inside each ID stay unsorted here.
df %>%
  arrange(ID) %>%
  print(n = 40)
#> # A tibble: 40 x 3
#> # Groups: ID [10]
#> ID prescription_date switch
#> <fct> <date> <dbl>
#> 1 ID1 2007-03-01 1
#> 2 ID1 1999-06-01 0
#> 3 ID1 1999-02-01 1
#> 4 ID1 2006-09-01 0
#> 5 ID10 2008-08-01 0
#> 6 ID10 2000-09-01 1
#> 7 ID10 2001-09-01 1
#> 8 ID10 2001-11-01 1
#> 9 ID10 2000-04-01 1
#> 10 ID10 2004-09-01 1
#> 11 ID2 2008-10-01 1
#> 12 ID2 2003-01-01 0
#> 13 ID2 2005-12-01 0
#> 14 ID2 2000-06-01 0
#> 15 ID3 2007-07-01 1
#> 16 ID3 2007-11-01 0
#> 17 ID4 1999-03-01 1
#> 18 ID4 2003-10-01 0
#> 19 ID4 1999-05-01 1
#> 20 ID4 2007-10-01 1
#> 21 ID4 2005-04-01 0
#> 22 ID4 2009-05-01 1
#> 23 ID4 2005-10-01 0
#> 24 ID4 2003-07-01 0
#> 25 ID5 2008-06-01 1
#> 26 ID5 2002-04-01 1
#> 27 ID5 2005-01-01 0
#> 28 ID5 2001-05-01 0
#> 29 ID5 2009-09-01 1
#> 30 ID6 2006-08-01 0
#> 31 ID6 2000-12-01 0
#> 32 ID7 2007-06-01 0
#> 33 ID8 2008-11-01 1
#> 34 ID8 1999-09-01 0
#> 35 ID8 2007-05-01 0
#> 36 ID8 2009-03-01 1
#> 37 ID9 2009-10-01 0
#> 38 ID9 1999-10-01 1
#> 39 ID9 2007-04-01 0
#> 40 ID9 2008-01-01 0
由 reprex package (v0.3.0)
于 2021-06-19 创建
变量“switch”表示个体在该次处方中是否相对于上一次处方更换了药物。我需要知道每个人第三次更换药物的日期。但是,我遇到了困难,因为我似乎无法针对每条观察值计算变量“switch”的累计和。如果能创建出类似下面这样的结果就足够了:
#> # A tibble: 40 x 3
#> # Groups: ID [10]
#> ID prescription_date switch date3switch
#> <fct> <date> <dbl> <dbl>
#> 1 ID1 1999-02-01 1 1
#> 2 ID1 1999-06-01 0 NA
#> 3 ID1 2006-09-01 0 NA
#> 4 ID1 2007-03-01 1 2
#> 5 ID10 2000-04-01 1 1
#> 6 ID10 2000-09-01 1 2
#> 7 ID10 2001-09-01 1 3
#> 8 ID10 2001-11-01 1 4
#> 9 ID10 2004-09-01 1 5
#> 10 ID10 2008-08-01 0 NA
#> 11 ID2 2000-06-01 0 NA
#> 12 ID2 2003-01-01 0 NA
#> 13 ID2 2005-12-01 0 NA
#> 14 ID2 2008-10-01 1 1
#> 15 ID3 2007-07-01 1 1
#> 16 ID3 2007-11-01 0 NA
#> 17 ID4 1999-03-01 1 1
#> 18 ID4 1999-05-01 1 2
#> 19 ID4 2003-07-01 0 NA
#> 20 ID4 2003-10-01 0 NA
#> 21 ID4 2005-04-01 0 NA
#> 22 ID4 2005-10-01 0 NA
#> 23 ID4 2007-10-01 1 3
#> 24 ID4 2009-05-01 1 4
我尝试写了一个 for 循环,但这对我这个初学者来说可能太高级了,因为我最终只得到了一个 NULL 数据框……
# Asker's failed attempt, kept exactly as posted. Why it cannot work:
# - A `for` loop evaluates to NULL, so `df <- for (...)` overwrites `df` with NULL.
# - The loop index `i` is never used in the body, so every iteration is identical.
# - `data` is not defined in this snippet (the data frame is called `df`), and
#   sum() totals the whole column instead of keeping a running count per ID.
# - mutate() is given no data frame, uses `==` (comparison) where `=` is needed,
#   and refers to `prescribed_date` although the column is `prescription_date`.
df <- for (i in 1:dim(df)[1]) {
if(sum(data$switch) == 3)
{ mutate(date3switch == prescribed_date)}
else NA
}
由 reprex package (v0.3.0)
于 2021-06-19 创建
感谢您的帮助!
使用 cumsum 会有所帮助,然后将 switch == 0 的行对应的值替换为 NA。
library(dplyr)

# Number each patient's drug switches in chronological order.
df %>%
  # Sort by date within ID: cumsum() counts rows in the order they appear,
  # so sorting by ID alone would number the switches out of date order.
  arrange(ID, prescription_date) %>%
  group_by(ID) %>%
  mutate(
    # Running count of switches so far for this patient.
    date3switch = cumsum(switch),
    # Keep the count only on rows that are themselves a switch.
    date3switch = replace(date3switch, switch == 0, NA)
  ) %>%
  ungroup()
# ID prescription_date switch date3switch
# <chr> <date> <dbl> <dbl>
# 1 ID1 1999-02-01 1 1
# 2 ID1 1999-06-01 0 NA
# 3 ID1 2006-09-01 0 NA
# 4 ID1 2007-03-01 1 2
# 5 ID10 2000-04-01 1 1
# 6 ID10 2000-09-01 1 2
# 7 ID10 2001-09-01 1 3
# 8 ID10 2001-11-01 1 4
# 9 ID10 2004-09-01 1 5
#10 ID10 2008-08-01 0 NA
# … with 30 more rows
library(dplyr)

# Rebuild the question's toy data so this example runs on its own.
set.seed(42)
ID <- sample(
  c("ID1", "ID2", "ID3", "ID4", "ID5", "ID6", "ID7", "ID8", "ID9", "ID10"),
  40,
  replace = TRUE
)
prescription_date <- sample(
  seq(as.Date("1999/01/01"), as.Date("2010/01/01"), by = "month"),
  40
)
switch <- sample(c(0, 1), 40, replace = TRUE, prob = c(0.4, 0.6))
df <- data.frame(ID, prescription_date, switch) %>%
  group_by(ID)

df %>%
  group_by(ID) %>%
  # .by_group = TRUE sorts by ID first, then by date inside each ID.
  arrange(prescription_date, .by_group = TRUE) %>%
  # Running switch count per ID, shown only on rows where a switch happened.
  mutate(switch2 = ifelse(switch == 0, NA, cumsum(switch))) %>%
  print(n = 40)
#> # A tibble: 40 x 4
#> # Groups: ID [10]
#> ID prescription_date switch switch2
#> <chr> <date> <dbl> <dbl>
#> 1 ID1 1999-02-01 1 1
#> 2 ID1 1999-06-01 0 NA
#> 3 ID1 2006-09-01 0 NA
#> 4 ID1 2007-03-01 1 2
#> 5 ID10 2000-04-01 1 1
#> 6 ID10 2000-09-01 1 2
#> 7 ID10 2001-09-01 1 3
#> 8 ID10 2001-11-01 1 4
#> 9 ID10 2004-09-01 1 5
#> 10 ID10 2008-08-01 0 NA
#> 11 ID2 2000-06-01 0 NA
#> 12 ID2 2003-01-01 0 NA
#> 13 ID2 2005-12-01 0 NA
#> 14 ID2 2008-10-01 1 1
#> 15 ID3 2007-07-01 1 1
#> 16 ID3 2007-11-01 0 NA
#> 17 ID4 1999-03-01 1 1
#> 18 ID4 1999-05-01 1 2
#> 19 ID4 2003-07-01 0 NA
#> 20 ID4 2003-10-01 0 NA
#> 21 ID4 2005-04-01 0 NA
#> 22 ID4 2005-10-01 0 NA
#> 23 ID4 2007-10-01 1 3
#> 24 ID4 2009-05-01 1 4
#> 25 ID5 2001-05-01 0 NA
#> 26 ID5 2002-04-01 1 1
#> 27 ID5 2005-01-01 0 NA
#> 28 ID5 2008-06-01 1 2
#> 29 ID5 2009-09-01 1 3
#> 30 ID6 2000-12-01 0 NA
#> 31 ID6 2006-08-01 0 NA
#> 32 ID7 2007-06-01 0 NA
#> 33 ID8 1999-09-01 0 NA
#> 34 ID8 2007-05-01 0 NA
#> 35 ID8 2008-11-01 1 1
#> 36 ID8 2009-03-01 1 2
#> 37 ID9 1999-10-01 1 1
#> 38 ID9 2007-04-01 0 NA
#> 39 ID9 2008-01-01 0 NA
#> 40 ID9 2009-10-01 0 NA
由 reprex package (v2.0.0)
创建于 2021-06-19
我们也可以使用 na_if:
library(dplyr)

df %>%
  # Sort by date within ID so the running count follows chronological order.
  arrange(ID, prescription_date) %>%
  group_by(ID) %>%
  # cumsum(switch) * switch is 0 on non-switch rows and the running switch
  # count on switch rows, so na_if(., 0) blanks exactly the non-switch rows.
  # (na_if(cumsum(switch), 0) alone would only blank rows before the first
  # switch and leave the count on later non-switch rows.)
  mutate(date3switch = na_if(cumsum(switch) * switch, 0))