使用 dplyr,在长格式数据中按组根据另一列的条件添加新列
Add column based on condition in other column per group in dplyr with data in a long format, using dplyr
我有 3 位患者重复测量(4 或 5 次)的长格式数据:
library(dplyr)
library(magrittr)
# Example data in long format: three patients (ID 2, 3, 4) with 4, 5 and 5
# repeated measurements. drug_use is a factor with levels "no"/"yes" that is
# only filled in on the time2 row of each patient; all other rows are NA.
questiondata <- structure(
  list(
    ID = rep(c(2, 3, 4), times = c(4, 5, 5)),
    time = paste0("time", c(1:4, 1:5, 1:5)),
    drug_use = structure(
      c(NA, 1L, NA, NA, NA, 2L, NA, NA, NA, NA, 1L, NA, NA, NA),
      levels = c("no", "yes"),
      class = "factor"
    )
  ),
  row.names = c(NA, -14L),
  class = c("tbl_df", "tbl", "data.frame")
)
# Corresponding to the following tibble:
# A tibble: 14 x 3
ID time drug_use
<dbl> <chr> <fct>
1 2 time1 NA
2 2 time2 no
3 2 time3 NA
4 2 time4 NA
5 3 time1 NA
6 3 time2 yes
7 3 time3 NA
8 3 time4 NA
9 3 time5 NA
10 4 time1 NA
11 4 time2 no
12 4 time3 NA
13 4 time4 NA
14 4 time5 NA
我想新建一列关于药物使用的变量(名为 `drug_use2`),为每位患者标注 'yes' 或 'no',取决于该患者是否在任一时间点(time)使用过药物。所以期望的结果看起来像这样:
A tibble: 14 x 4
ID time drug_use drug_use2
<dbl> <chr> <fct> <fct>
1 2 time1 NA no
2 2 time2 no no
3 2 time3 NA no
4 2 time4 NA no
5 3 time1 NA yes
6 3 time2 yes yes
7 3 time3 NA yes
8 3 time4 NA yes
9 3 time5 NA yes
10 4 time1 NA no
11 4 time2 no no
12 4 time3 NA no
13 4 time4 NA no
14 4 time5 NA no
我曾尝试按 PXE 编号和/或 drug_use 进行分组,然后将 mutate 与 case_when 一起使用,但这会卡在缺失值上:
# Attempt from the question: derive drug_use2 from each row's own drug_use.
# The case_when() conditions are element-wise, so grouping by ID does not
# help: every row whose drug_use is NA matches neither comparison and falls
# through to the "missing" fallback.
questiondata <- questiondata %>%
  group_by(ID) %>%
  mutate(
    drug_use2 = case_when(
      drug_use == "yes" ~ "yes",
      drug_use == "no" ~ "no",
      TRUE ~ "missing"
    )
  )
# A tibble: 14 x 4
# Groups: ID [3]
ID time drug_use drug_use2
<dbl> <chr> <fct> <chr>
1 2 time1 NA missing
2 2 time2 no no
3 2 time3 NA missing
4 2 time4 NA missing
5 3 time1 NA missing
6 3 time2 yes yes
7 3 time3 NA missing
8 3 time4 NA missing
9 3 time5 NA missing
10 4 time1 NA missing
11 4 time2 no no
12 4 time3 NA missing
13 4 time4 NA missing
14 4 time5 NA missing
我是不是用错了group_by?提前致谢
使用any
# Per patient: "yes" if any measurement records drug use, otherwise "no".
# any() collapses each ID group to a single logical; na.rm = TRUE makes a
# group with no "yes" evaluate to FALSE instead of NA. The one-element result
# is recycled over all rows of the group by mutate().
questiondata %>%
  group_by(ID) %>%
  mutate(
    druguse2 = if_else(any(drug_use == "yes", na.rm = TRUE), "yes", "no")
  )
# A tibble: 14 x 4
# Groups: ID [3]
ID time drug_use druguse2
<dbl> <chr> <fct> <chr>
1 2 time1 NA no
2 2 time2 no no
3 2 time3 NA no
4 2 time4 NA no
5 3 time1 NA yes
6 3 time2 yes yes
7 3 time3 NA yes
8 3 time4 NA yes
9 3 time5 NA yes
10 4 time1 NA no
11 4 time2 no no
12 4 time3 NA no
13 4 time4 NA no
14 4 time5 NA no
我们这里也可以用fill
library(dplyr)
library(tidyr)
# Copy drug_use into drug_use2 and, within each patient, spread the recorded
# value over the NA rows (first downwards, then upwards).
# Note: this propagates whatever value was recorded, so it matches the
# "ever used" rule only when a patient has a single distinct non-NA value.
questiondata %>%
  group_by(ID) %>%
  mutate(drug_use2 = drug_use) %>%
  fill(drug_use2, .direction = "updown") %>%
  ungroup()
# A tibble: 14 x 4
ID time drug_use drug_use2
<dbl> <chr> <fct> <fct>
1 2 time1 <NA> no
2 2 time2 no no
3 2 time3 <NA> no
4 2 time4 <NA> no
5 3 time1 <NA> yes
6 3 time2 yes yes
7 3 time3 <NA> yes
8 3 time4 <NA> yes
9 3 time5 <NA> yes
10 4 time1 <NA> no
11 4 time2 no no
12 4 time3 <NA> no
13 4 time4 <NA> no
14 4 time5 <NA> no
或者使用 coalesce 的另一种写法:
# Per patient: pick the first "yes" entry of drug_use if there is one.
# match() returns NA when the group has no "yes", so the subset is a single
# NA, which coalesce() then replaces with "no".
questiondata %>%
  group_by(ID) %>%
  mutate(
    drug_use2 = coalesce(drug_use[match("yes", drug_use)], "no")
  ) %>%
  ungroup()
# A tibble: 14 x 4
ID time drug_use drug_use2
<dbl> <chr> <fct> <chr>
1 2 time1 <NA> no
2 2 time2 no no
3 2 time3 <NA> no
4 2 time4 <NA> no
5 3 time1 <NA> yes
6 3 time2 yes yes
7 3 time3 <NA> yes
8 3 time4 <NA> yes
9 3 time5 <NA> yes
10 4 time1 <NA> no
11 4 time2 no no
12 4 time3 <NA> no
13 4 time4 <NA> no
14 4 time5 <NA> no
或者不分组也可以这样做
# Ungrouped alternative: collect the IDs that have at least one "yes" row and
# flag every row that belongs to one of those IDs.
# `%in%` is used rather than `==` because `drug_use == "yes"` is NA for
# missing values, and subsetting ID with NA produces NA entries; those would
# wrongly mark any row whose ID is itself NA as "yes".
questiondata %>%
  mutate(drug_use2 = if_else(ID %in% ID[drug_use %in% "yes"], "yes", "no"))
# A tibble: 14 x 4
ID time drug_use drug_use2
<dbl> <chr> <fct> <chr>
1 2 time1 <NA> no
2 2 time2 no no
3 2 time3 <NA> no
4 2 time4 <NA> no
5 3 time1 <NA> yes
6 3 time2 yes yes
7 3 time3 <NA> yes
8 3 time4 <NA> yes
9 3 time5 <NA> yes
10 4 time1 <NA> no
11 4 time2 no no
12 4 time3 <NA> no
13 4 time4 <NA> no
14 4 time5 <NA> no
类似地,也可以用 base R 实现:
# Base R version of the ungrouped approach: flag every row whose ID has at
# least one "yes" measurement.
# `%in%` avoids the NA entries that `ID[drug_use == "yes"]` would produce for
# rows with missing drug_use (those NA entries would match rows with NA IDs).
transform(
  questiondata,
  drug_use2 = ifelse(ID %in% ID[drug_use %in% "yes"], "yes", "no")
)
我有 3 位患者重复测量(4 或 5 次)的长格式数据:
library(dplyr)
library(magrittr)
# Example data in long format: three patients (ID 2, 3, 4) with 4, 5 and 5
# repeated measurements. drug_use is a factor with levels "no"/"yes" that is
# only filled in on the time2 row of each patient; all other rows are NA.
questiondata <- structure(
  list(
    ID = rep(c(2, 3, 4), times = c(4, 5, 5)),
    time = paste0("time", c(1:4, 1:5, 1:5)),
    drug_use = structure(
      c(NA, 1L, NA, NA, NA, 2L, NA, NA, NA, NA, 1L, NA, NA, NA),
      levels = c("no", "yes"),
      class = "factor"
    )
  ),
  row.names = c(NA, -14L),
  class = c("tbl_df", "tbl", "data.frame")
)
# Corresponding to the following tibble:
# A tibble: 14 x 3
ID time drug_use
<dbl> <chr> <fct>
1 2 time1 NA
2 2 time2 no
3 2 time3 NA
4 2 time4 NA
5 3 time1 NA
6 3 time2 yes
7 3 time3 NA
8 3 time4 NA
9 3 time5 NA
10 4 time1 NA
11 4 time2 no
12 4 time3 NA
13 4 time4 NA
14 4 time5 NA
我想新建一列关于药物使用的变量(名为 `drug_use2`),为每位患者标注 'yes' 或 'no',取决于该患者是否在任一时间点(time)使用过药物。所以期望的结果看起来像这样:
A tibble: 14 x 4
ID time drug_use drug_use2
<dbl> <chr> <fct> <fct>
1 2 time1 NA no
2 2 time2 no no
3 2 time3 NA no
4 2 time4 NA no
5 3 time1 NA yes
6 3 time2 yes yes
7 3 time3 NA yes
8 3 time4 NA yes
9 3 time5 NA yes
10 4 time1 NA no
11 4 time2 no no
12 4 time3 NA no
13 4 time4 NA no
14 4 time5 NA no
我曾尝试按 PXE 编号和/或 drug_use 进行分组,然后将 mutate 与 case_when 一起使用,但这会卡在缺失值上:
# Attempt from the question: derive drug_use2 from each row's own drug_use.
# The case_when() conditions are element-wise, so grouping by ID does not
# help: every row whose drug_use is NA matches neither comparison and falls
# through to the "missing" fallback.
questiondata <- questiondata %>%
  group_by(ID) %>%
  mutate(
    drug_use2 = case_when(
      drug_use == "yes" ~ "yes",
      drug_use == "no" ~ "no",
      TRUE ~ "missing"
    )
  )
# A tibble: 14 x 4
# Groups: ID [3]
ID time drug_use drug_use2
<dbl> <chr> <fct> <chr>
1 2 time1 NA missing
2 2 time2 no no
3 2 time3 NA missing
4 2 time4 NA missing
5 3 time1 NA missing
6 3 time2 yes yes
7 3 time3 NA missing
8 3 time4 NA missing
9 3 time5 NA missing
10 4 time1 NA missing
11 4 time2 no no
12 4 time3 NA missing
13 4 time4 NA missing
14 4 time5 NA missing
我是不是用错了group_by?提前致谢
使用any
# Per patient: "yes" if any measurement records drug use, otherwise "no".
# any() collapses each ID group to a single logical; na.rm = TRUE makes a
# group with no "yes" evaluate to FALSE instead of NA. The one-element result
# is recycled over all rows of the group by mutate().
questiondata %>%
  group_by(ID) %>%
  mutate(
    druguse2 = if_else(any(drug_use == "yes", na.rm = TRUE), "yes", "no")
  )
# A tibble: 14 x 4
# Groups: ID [3]
ID time drug_use druguse2
<dbl> <chr> <fct> <chr>
1 2 time1 NA no
2 2 time2 no no
3 2 time3 NA no
4 2 time4 NA no
5 3 time1 NA yes
6 3 time2 yes yes
7 3 time3 NA yes
8 3 time4 NA yes
9 3 time5 NA yes
10 4 time1 NA no
11 4 time2 no no
12 4 time3 NA no
13 4 time4 NA no
14 4 time5 NA no
我们这里也可以用fill
library(dplyr)
library(tidyr)
# Copy drug_use into drug_use2 and, within each patient, spread the recorded
# value over the NA rows (first downwards, then upwards).
# Note: this propagates whatever value was recorded, so it matches the
# "ever used" rule only when a patient has a single distinct non-NA value.
questiondata %>%
  group_by(ID) %>%
  mutate(drug_use2 = drug_use) %>%
  fill(drug_use2, .direction = "updown") %>%
  ungroup()
# A tibble: 14 x 4
ID time drug_use drug_use2
<dbl> <chr> <fct> <fct>
1 2 time1 <NA> no
2 2 time2 no no
3 2 time3 <NA> no
4 2 time4 <NA> no
5 3 time1 <NA> yes
6 3 time2 yes yes
7 3 time3 <NA> yes
8 3 time4 <NA> yes
9 3 time5 <NA> yes
10 4 time1 <NA> no
11 4 time2 no no
12 4 time3 <NA> no
13 4 time4 <NA> no
14 4 time5 <NA> no
或者使用 coalesce:
# Per patient: pick the first "yes" entry of drug_use if there is one.
# match() returns NA when the group has no "yes", so the subset is a single
# NA, which coalesce() then replaces with "no".
questiondata %>%
  group_by(ID) %>%
  mutate(
    drug_use2 = coalesce(drug_use[match("yes", drug_use)], "no")
  ) %>%
  ungroup()
# A tibble: 14 x 4
ID time drug_use drug_use2
<dbl> <chr> <fct> <chr>
1 2 time1 <NA> no
2 2 time2 no no
3 2 time3 <NA> no
4 2 time4 <NA> no
5 3 time1 <NA> yes
6 3 time2 yes yes
7 3 time3 <NA> yes
8 3 time4 <NA> yes
9 3 time5 <NA> yes
10 4 time1 <NA> no
11 4 time2 no no
12 4 time3 <NA> no
13 4 time4 <NA> no
14 4 time5 <NA> no
或者不分组也可以这样做
# Ungrouped alternative: collect the IDs that have at least one "yes" row and
# flag every row that belongs to one of those IDs.
# `%in%` is used rather than `==` because `drug_use == "yes"` is NA for
# missing values, and subsetting ID with NA produces NA entries; those would
# wrongly mark any row whose ID is itself NA as "yes".
questiondata %>%
  mutate(drug_use2 = if_else(ID %in% ID[drug_use %in% "yes"], "yes", "no"))
# A tibble: 14 x 4
ID time drug_use drug_use2
<dbl> <chr> <fct> <chr>
1 2 time1 <NA> no
2 2 time2 no no
3 2 time3 <NA> no
4 2 time4 <NA> no
5 3 time1 <NA> yes
6 3 time2 yes yes
7 3 time3 <NA> yes
8 3 time4 <NA> yes
9 3 time5 <NA> yes
10 4 time1 <NA> no
11 4 time2 no no
12 4 time3 <NA> no
13 4 time4 <NA> no
14 4 time5 <NA> no
类似地,也可以用 base R 实现:
# Base R version of the ungrouped approach: flag every row whose ID has at
# least one "yes" measurement.
# `%in%` avoids the NA entries that `ID[drug_use == "yes"]` would produce for
# rows with missing drug_use (those NA entries would match rows with NA IDs).
transform(
  questiondata,
  drug_use2 = ifelse(ID %in% ID[drug_use %in% "yes"], "yes", "no")
)