使用 dplyr 框架改变多列
Mutate multiple columns using the dplyr framework
我有一个数据框 apcd_hud_ex。我想取一些列名(例如 x2014_03_15),并根据列的当前值、列名中的解析日期和数据框中的另一列更改列的值(SMOKEFREE_DATE).我可以在列上循环执行此操作,但我真的很想知道如何使用 dplyr 和 mutate 执行此操作。任何帮助将不胜感激!
apcd_hud_ex = structure(list(studyid = 1:5, SMOKEFREE_DATE = structure(c(16283,
16283, 16071, 16071, 16648), class = "Date"), x2014_03_15 = c(1,
1, 1, 0, 1), x2014_04_15 = c(1, 1, 1, 1, 1), x2014_05_15 = c(1,
1, 1, 1, 1), x2014_06_15 = c(1, 1, 1, 1, 1), x2014_07_15 = c(1,
1, 1, 1, 1), x2014_08_15 = c(1, 1, 1, 1, 1), x2014_09_15 = c(1,
1, 1, 1, 1), x2014_10_15 = c(1, 1, 1, 1, 1), x2014_11_15 = c(1,
1, 1, 1, 1), x2014_12_15 = c(1, 1, 1, 1, 1), x2015_01_15 = c(1,
1, 1, 1, 1)), row.names = c(NA, -5L), class = c("tbl_df", "tbl",
"data.frame"))
> apcd_hud_ex
# A tibble: 5 x 13
studyid SMOKEFREE_DATE x2014_03_15 x2014_04_15 x2014_05_15 x2014_06_15 x2014_07_15 x2014_08_15 x2014_09_15 x2014_10_15
<int> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2014-08-01 1 1 1 1 1 1 1 1
2 2 2014-08-01 1 1 1 1 1 1 1 1
3 3 2014-01-01 1 1 1 1 1 1 1 1
4 4 2014-01-01 0 1 1 1 1 1 1 1
5 5 2015-08-01 1 1 1 1 1 1 1 1
# ... with 3 more variables: x2014_11_15 <dbl>, x2014_12_15 <dbl>, x2015_01_15 <dbl>
>
#function for loop
assign_PHRes_enrollIns_fn <- function(SFdate,insValue,insDate){
val = if_else(insValue == 0,
0,
if_else(as.Date(insDate) < as.Date(SFdate,"%Y-%m-%d"),
1,
2))
return(val)
}
#vectorized function
assign_PHRes_enrollIns_fn_vec <- Vectorize(assign_PHRes_enrollIns_fn)
dateCols = names(apcd_hud_ex)[which(names(apcd_hud_ex) == "x2014_03_15"):which(names(apcd_hud_ex) == "x2015_01_15")]
这个对列名 (dateCols) 的循环有效:
for(i in 1:length(dateCols)){
dateCol = dateCols[i]
insDate = as.Date(paste0(str_sub(dateCol,2,5),"/",str_sub(dateCol,7,8),"/",str_sub(dateCol,10,11)),"%Y/%m/%d")
apcd_hud_ex[,dateCol] = assign_PHRes_enrollIns_fn_vec(apcd_hud_ex[,"SMOKEFREE_DATE"],apcd_hud_ex[,dateCol],insDate)
}
现在操作的数据框看起来像这样,这就是我想要的:
> apcd_hud_ex
# A tibble: 5 x 13
studyid SMOKEFREE_DATE x2014_03_15 x2014_04_15 x2014_05_15 x2014_06_15 x2014_07_15 x2014_08_15 x2014_09_15 x2014_10_15
<int> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2014-08-01 1 1 1 1 1 2 2 2
2 2 2014-08-01 1 1 1 1 1 2 2 2
3 3 2014-01-01 2 2 2 2 2 2 2 2
4 4 2014-01-01 0 2 2 2 2 2 2 2
5 5 2015-08-01 1 1 1 1 1 1 1 1
# ... with 3 more variables: x2014_11_15 <dbl>, x2014_12_15 <dbl>, x2015_01_15 <dbl>
但是,我想学习如何使用动态规划和 dplyr 来做到这一点。我尝试了 2 个函数:
newInsValCols_fn1 <- function(df,dateCols){
insDate = as.Date(paste0(str_sub(dateCols,2,5),"/",str_sub(dateCols,7,8),"/",str_sub(dateCols,10,11)),"%Y/%m/%d")
df1 <- df %>%
mutate({{dateCols}} := if_else({{dateCols}} == 0,
0,
if_else(as.Date(insDate) < as.Date(SMOKEFREE_DATE,"%Y-%m-%d"),
1,
2)))
return(df1)
}
newInsValCols_fn1(apcd_hud_ex,dateCols)
给出错误:
Error: The LHS of `:=` must be a string or a symbol
所以我尝试使用符号:
newInsValCols_fn2 <- function(df,dateCols){
dateCols_syms = syms(dateCols)
insDate = as.Date(paste0(str_sub(dateCols,2,5),"/",str_sub(dateCols,7,8),"/",str_sub(dateCols,10,11)),"%Y/%m/%d")
df1 <- df %>%
mutate(!!dateCols_syms := if_else({{dateCols}} == 0,
0,
if_else(as.Date(insDate) < as.Date(SMOKEFREE_DATE,"%Y-%m-%d"),
1,
2)))
return(df1)
}
newInsValCols_fn2(apcd_hud_ex,dateCols)
给出同样的错误:
Error: The LHS of `:=` must be a string or a symbol
我也试过用!!!而不是 !!,但这导致了以下错误:
Error: The LHS of `:=` can't be spliced with `!!!`
我的理解有些欠缺。
这是我使用 dplyr
的方法。
library(dplyr)
library(lubridate)
apcd_hud_ex %>%
mutate(across(
starts_with('x'),
~ case_when(. == 0 ~ 0,
ymd(gsub('x', '', cur_column())) < SMOKEFREE_DATE ~ 1,
TRUE ~ 2)
))
#> # A tibble: 5 x 13
#> studyid SMOKEFREE_DATE x2014_03_15 x2014_04_15 x2014_05_15 x2014_06_15 x2014_07_15 x2014_08_15 x2014_09_15 x2014_10_15 x2014_11_15 x2014_12_15 x2015_01_15
#> <int> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2014-08-01 1 1 1 1 1 2 2 2 2 2 2
#> 2 2 2014-08-01 1 1 1 1 1 2 2 2 2 2 2
#> 3 3 2014-01-01 2 2 2 2 2 2 2 2 2 2 2
#> 4 4 2014-01-01 0 2 2 2 2 2 2 2 2 2 2
#> 5 5 2015-08-01 1 1 1 1 1 1 1 1 1 1 1
您可以使用 pivot_longer
只修改一列,这是 mutate(across())
的替代方法。
你可以使用case_when
有多个条件,所以你不需要嵌套多个if语句。该值将是第一个 true 语句中的一个。
library(tidyverse)
apcd_hud_ex <- structure(list(studyid = 1:5, SMOKEFREE_DATE = structure(c(
16283,
16283, 16071, 16071, 16648
), class = "Date"), x2014_03_15 = c(
1,
1, 1, 0, 1
), x2014_04_15 = c(1, 1, 1, 1, 1), x2014_05_15 = c(
1,
1, 1, 1, 1
), x2014_06_15 = c(1, 1, 1, 1, 1), x2014_07_15 = c(
1,
1, 1, 1, 1
), x2014_08_15 = c(1, 1, 1, 1, 1), x2014_09_15 = c(
1,
1, 1, 1, 1
), x2014_10_15 = c(1, 1, 1, 1, 1), x2014_11_15 = c(
1,
1, 1, 1, 1
), x2014_12_15 = c(1, 1, 1, 1, 1), x2015_01_15 = c(
1,
1, 1, 1, 1
)), row.names = c(NA, -5L), class = c(
"tbl_df", "tbl",
"data.frame"
))
apcd_hud_ex %>%
pivot_longer(starts_with("x")) %>%
mutate(
insDate = name %>% str_remove("^x") %>% str_replace_all("_", "-") %>% as.Date(),
value = case_when(
value == 0 ~ 0,
insDate < SMOKEFREE_DATE ~ 1,
insDate >= SMOKEFREE_DATE ~ 2
)
) %>%
select(-insDate) %>%
pivot_wider()
#> # A tibble: 5 × 13
#> studyid SMOKEFREE_DATE x2014_03_15 x2014_04_15 x2014_05_15 x2014_06_15
#> <int> <date> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2014-08-01 1 1 1 1
#> 2 2 2014-08-01 1 1 1 1
#> 3 3 2014-01-01 2 2 2 2
#> 4 4 2014-01-01 0 2 2 2
#> 5 5 2015-08-01 1 1 1 1
#> # … with 7 more variables: x2014_07_15 <dbl>, x2014_08_15 <dbl>,
#> # x2014_09_15 <dbl>, x2014_10_15 <dbl>, x2014_11_15 <dbl>, x2014_12_15 <dbl>,
#> # x2015_01_15 <dbl>
由 reprex package (v2.0.0)
于 2022-05-05 创建
我有一个数据框 apcd_hud_ex。我想取一些列名(例如 x2014_03_15),并根据列的当前值、列名中的解析日期和数据框中的另一列更改列的值(SMOKEFREE_DATE).我可以在列上循环执行此操作,但我真的很想知道如何使用 dplyr 和 mutate 执行此操作。任何帮助将不胜感激!
apcd_hud_ex = structure(list(studyid = 1:5, SMOKEFREE_DATE = structure(c(16283,
16283, 16071, 16071, 16648), class = "Date"), x2014_03_15 = c(1,
1, 1, 0, 1), x2014_04_15 = c(1, 1, 1, 1, 1), x2014_05_15 = c(1,
1, 1, 1, 1), x2014_06_15 = c(1, 1, 1, 1, 1), x2014_07_15 = c(1,
1, 1, 1, 1), x2014_08_15 = c(1, 1, 1, 1, 1), x2014_09_15 = c(1,
1, 1, 1, 1), x2014_10_15 = c(1, 1, 1, 1, 1), x2014_11_15 = c(1,
1, 1, 1, 1), x2014_12_15 = c(1, 1, 1, 1, 1), x2015_01_15 = c(1,
1, 1, 1, 1)), row.names = c(NA, -5L), class = c("tbl_df", "tbl",
"data.frame"))
> apcd_hud_ex
# A tibble: 5 x 13
studyid SMOKEFREE_DATE x2014_03_15 x2014_04_15 x2014_05_15 x2014_06_15 x2014_07_15 x2014_08_15 x2014_09_15 x2014_10_15
<int> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2014-08-01 1 1 1 1 1 1 1 1
2 2 2014-08-01 1 1 1 1 1 1 1 1
3 3 2014-01-01 1 1 1 1 1 1 1 1
4 4 2014-01-01 0 1 1 1 1 1 1 1
5 5 2015-08-01 1 1 1 1 1 1 1 1
# ... with 3 more variables: x2014_11_15 <dbl>, x2014_12_15 <dbl>, x2015_01_15 <dbl>
>
#function for loop
assign_PHRes_enrollIns_fn <- function(SFdate,insValue,insDate){
val = if_else(insValue == 0,
0,
if_else(as.Date(insDate) < as.Date(SFdate,"%Y-%m-%d"),
1,
2))
return(val)
}
#vectorized function
assign_PHRes_enrollIns_fn_vec <- Vectorize(assign_PHRes_enrollIns_fn)
dateCols = names(apcd_hud_ex)[which(names(apcd_hud_ex) == "x2014_03_15"):which(names(apcd_hud_ex) == "x2015_01_15")]
这个对列名 (dateCols) 的循环有效:
for(i in 1:length(dateCols)){
dateCol = dateCols[i]
insDate = as.Date(paste0(str_sub(dateCol,2,5),"/",str_sub(dateCol,7,8),"/",str_sub(dateCol,10,11)),"%Y/%m/%d")
apcd_hud_ex[,dateCol] = assign_PHRes_enrollIns_fn_vec(apcd_hud_ex[,"SMOKEFREE_DATE"],apcd_hud_ex[,dateCol],insDate)
}
现在操作的数据框看起来像这样,这就是我想要的:
> apcd_hud_ex
# A tibble: 5 x 13
studyid SMOKEFREE_DATE x2014_03_15 x2014_04_15 x2014_05_15 x2014_06_15 x2014_07_15 x2014_08_15 x2014_09_15 x2014_10_15
<int> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2014-08-01 1 1 1 1 1 2 2 2
2 2 2014-08-01 1 1 1 1 1 2 2 2
3 3 2014-01-01 2 2 2 2 2 2 2 2
4 4 2014-01-01 0 2 2 2 2 2 2 2
5 5 2015-08-01 1 1 1 1 1 1 1 1
# ... with 3 more variables: x2014_11_15 <dbl>, x2014_12_15 <dbl>, x2015_01_15 <dbl>
但是,我想学习如何使用动态规划和 dplyr 来做到这一点。我尝试了 2 个函数:
newInsValCols_fn1 <- function(df,dateCols){
insDate = as.Date(paste0(str_sub(dateCols,2,5),"/",str_sub(dateCols,7,8),"/",str_sub(dateCols,10,11)),"%Y/%m/%d")
df1 <- df %>%
mutate({{dateCols}} := if_else({{dateCols}} == 0,
0,
if_else(as.Date(insDate) < as.Date(SMOKEFREE_DATE,"%Y-%m-%d"),
1,
2)))
return(df1)
}
newInsValCols_fn1(apcd_hud_ex,dateCols)
给出错误:
Error: The LHS of `:=` must be a string or a symbol
所以我尝试使用符号:
newInsValCols_fn2 <- function(df,dateCols){
dateCols_syms = syms(dateCols)
insDate = as.Date(paste0(str_sub(dateCols,2,5),"/",str_sub(dateCols,7,8),"/",str_sub(dateCols,10,11)),"%Y/%m/%d")
df1 <- df %>%
mutate(!!dateCols_syms := if_else({{dateCols}} == 0,
0,
if_else(as.Date(insDate) < as.Date(SMOKEFREE_DATE,"%Y-%m-%d"),
1,
2)))
return(df1)
}
newInsValCols_fn2(apcd_hud_ex,dateCols)
给出同样的错误:
Error: The LHS of `:=` must be a string or a symbol
我也试过用!!!而不是 !!,但这导致了以下错误:
Error: The LHS of `:=` can't be spliced with `!!!`
我的理解有些欠缺。
这是我使用 dplyr
的方法。
library(dplyr)
library(lubridate)
apcd_hud_ex %>%
mutate(across(
starts_with('x'),
~ case_when(. == 0 ~ 0,
ymd(gsub('x', '', cur_column())) < SMOKEFREE_DATE ~ 1,
TRUE ~ 2)
))
#> # A tibble: 5 x 13
#> studyid SMOKEFREE_DATE x2014_03_15 x2014_04_15 x2014_05_15 x2014_06_15 x2014_07_15 x2014_08_15 x2014_09_15 x2014_10_15 x2014_11_15 x2014_12_15 x2015_01_15
#> <int> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2014-08-01 1 1 1 1 1 2 2 2 2 2 2
#> 2 2 2014-08-01 1 1 1 1 1 2 2 2 2 2 2
#> 3 3 2014-01-01 2 2 2 2 2 2 2 2 2 2 2
#> 4 4 2014-01-01 0 2 2 2 2 2 2 2 2 2 2
#> 5 5 2015-08-01 1 1 1 1 1 1 1 1 1 1 1
您可以使用 pivot_longer
只修改一列,这是 mutate(across())
的替代方法。
你可以使用case_when
有多个条件,所以你不需要嵌套多个if语句。该值将是第一个 true 语句中的一个。
library(tidyverse)
apcd_hud_ex <- structure(list(studyid = 1:5, SMOKEFREE_DATE = structure(c(
16283,
16283, 16071, 16071, 16648
), class = "Date"), x2014_03_15 = c(
1,
1, 1, 0, 1
), x2014_04_15 = c(1, 1, 1, 1, 1), x2014_05_15 = c(
1,
1, 1, 1, 1
), x2014_06_15 = c(1, 1, 1, 1, 1), x2014_07_15 = c(
1,
1, 1, 1, 1
), x2014_08_15 = c(1, 1, 1, 1, 1), x2014_09_15 = c(
1,
1, 1, 1, 1
), x2014_10_15 = c(1, 1, 1, 1, 1), x2014_11_15 = c(
1,
1, 1, 1, 1
), x2014_12_15 = c(1, 1, 1, 1, 1), x2015_01_15 = c(
1,
1, 1, 1, 1
)), row.names = c(NA, -5L), class = c(
"tbl_df", "tbl",
"data.frame"
))
apcd_hud_ex %>%
pivot_longer(starts_with("x")) %>%
mutate(
insDate = name %>% str_remove("^x") %>% str_replace_all("_", "-") %>% as.Date(),
value = case_when(
value == 0 ~ 0,
insDate < SMOKEFREE_DATE ~ 1,
insDate >= SMOKEFREE_DATE ~ 2
)
) %>%
select(-insDate) %>%
pivot_wider()
#> # A tibble: 5 × 13
#> studyid SMOKEFREE_DATE x2014_03_15 x2014_04_15 x2014_05_15 x2014_06_15
#> <int> <date> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2014-08-01 1 1 1 1
#> 2 2 2014-08-01 1 1 1 1
#> 3 3 2014-01-01 2 2 2 2
#> 4 4 2014-01-01 0 2 2 2
#> 5 5 2015-08-01 1 1 1 1
#> # … with 7 more variables: x2014_07_15 <dbl>, x2014_08_15 <dbl>,
#> # x2014_09_15 <dbl>, x2014_10_15 <dbl>, x2014_11_15 <dbl>, x2014_12_15 <dbl>,
#> # x2015_01_15 <dbl>
由 reprex package (v2.0.0)
于 2022-05-05 创建