使用 dplyr 框架改变多列

Question

我有一个数据框 apcd_hud_ex。我想选取其中一些列（例如 x2014_03_15），并根据该列的当前值、从列名中解析出的日期以及数据框中的另一列（SMOKEFREE_DATE）来更改该列的值。我可以通过对列进行循环来实现，但我很想知道如何使用 dplyr 和 mutate 来完成。任何帮助将不胜感激！

# Example data: five rows with a studyid, a SMOKEFREE_DATE (stored as days
# since 1970-01-01) and one numeric flag column per x<YYYY>_<MM>_<DD> name.
apcd_hud_ex <- structure(
  list(
    studyid = 1:5,
    SMOKEFREE_DATE = structure(
      c(16283, 16283, 16071, 16071, 16648),
      class = "Date"
    ),
    x2014_03_15 = c(1, 1, 1, 0, 1),
    x2014_04_15 = rep(1, 5),
    x2014_05_15 = rep(1, 5),
    x2014_06_15 = rep(1, 5),
    x2014_07_15 = rep(1, 5),
    x2014_08_15 = rep(1, 5),
    x2014_09_15 = rep(1, 5),
    x2014_10_15 = rep(1, 5),
    x2014_11_15 = rep(1, 5),
    x2014_12_15 = rep(1, 5),
    x2015_01_15 = rep(1, 5)
  ),
  row.names = c(NA, -5L),
  class = c("tbl_df", "tbl", "data.frame")
)

> apcd_hud_ex
# A tibble: 5 x 13
  studyid SMOKEFREE_DATE x2014_03_15 x2014_04_15 x2014_05_15 x2014_06_15 x2014_07_15 x2014_08_15 x2014_09_15 x2014_10_15
    <int> <date>               <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
1       1 2014-08-01               1           1           1           1           1           1           1           1
2       2 2014-08-01               1           1           1           1           1           1           1           1
3       3 2014-01-01               1           1           1           1           1           1           1           1
4       4 2014-01-01               0           1           1           1           1           1           1           1
5       5 2015-08-01               1           1           1           1           1           1           1           1
# ... with 3 more variables: x2014_11_15 <dbl>, x2014_12_15 <dbl>, x2015_01_15 <dbl>
>


# Helper used by the column loop: recode a flag relative to a reference date.
#   SFdate   - reference date(s); passed through as.Date() with "%Y-%m-%d"
#   insValue - current flag value(s)
#   insDate  - date(s) the flag refers to; passed through as.Date()
# Yields 0 where insValue is 0; otherwise 1 where insDate is earlier than
# SFdate and 2 where it is not.
assign_PHRes_enrollIns_fn <- function(SFdate, insValue, insDate) {
  is_before_sf <- as.Date(insDate) < as.Date(SFdate, "%Y-%m-%d")
  nonzero_code <- if_else(is_before_sf, 1, 2)
  if_else(insValue == 0, 0, nonzero_code)
}

# Vectorize() wrapper around the recoding helper
assign_PHRes_enrollIns_fn_vec <- Vectorize(FUN = assign_PHRes_enrollIns_fn)

# Names of every column positioned from x2014_03_15 through x2015_01_15
dateCols <- local({
  col_names <- names(apcd_hud_ex)
  first_idx <- which(col_names == "x2014_03_15")
  last_idx <- which(col_names == "x2015_01_15")
  col_names[first_idx:last_idx]
})

这个对列名 (dateCols) 的循环有效：

# Recode every column listed in dateCols in place. For each column, the date
# encoded in its name (e.g. "x2014_03_15" -> 2014-03-15) is compared with
# SMOKEFREE_DATE by assign_PHRes_enrollIns_fn().
# seq_along() is used instead of 1:length() so that an empty dateCols simply
# skips the loop instead of iterating over c(1, 0).
for (i in seq_along(dateCols)) {
  dateCol <- dateCols[i]
  # Characters 2-11 of the column name hold the date as "YYYY_MM_DD".
  insDate <- as.Date(str_sub(dateCol, 2, 11), format = "%Y_%m_%d")
  # `[[` extracts plain vectors (on a tibble, `[, col]` gives back a
  # one-column tibble). The helper is built on if_else(), which already works
  # element-wise, so the Vectorize() wrapper is not needed here.
  apcd_hud_ex[[dateCol]] <- assign_PHRes_enrollIns_fn(
    apcd_hud_ex[["SMOKEFREE_DATE"]],
    apcd_hud_ex[[dateCol]],
    insDate
  )
}

现在操作的数据框看起来像这样，这就是我想要的：

> apcd_hud_ex
# A tibble: 5 x 13
  studyid SMOKEFREE_DATE x2014_03_15 x2014_04_15 x2014_05_15 x2014_06_15 x2014_07_15 x2014_08_15 x2014_09_15 x2014_10_15
    <int> <date>               <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
1       1 2014-08-01               1           1           1           1           1           2           2           2
2       2 2014-08-01               1           1           1           1           1           2           2           2
3       3 2014-01-01               2           2           2           2           2           2           2           2
4       4 2014-01-01               0           2           2           2           2           2           2           2
5       5 2015-08-01               1           1           1           1           1           1           1           1
# ... with 3 more variables: x2014_11_15 <dbl>, x2014_12_15 <dbl>, x2015_01_15 <dbl>

但是，我想学习如何结合动态编程（即以编程方式引用列名）和 dplyr 来做到这一点。我尝试了 2 个函数：

# First attempt: recode all date-named columns with a single mutate() call.
#   df       - data frame containing a SMOKEFREE_DATE column (referenced below)
#   dateCols - character vector of column names; characters 2-5, 7-8 and 10-11
#              of each name are read as year, month and day
# NOTE: this attempt does not run; the call at the bottom stops with the
# error quoted right after this block.
# NOTE(review): presumably it fails because `{{dateCols}}` on the left of `:=`
# holds several names rather than a single string or symbol - confirm.
newInsValCols_fn1 <- function(df,dateCols){
  # One parsed date per column name.
  insDate = as.Date(paste0(str_sub(dateCols,2,5),"/",str_sub(dateCols,7,8),"/",str_sub(dateCols,10,11)),"%Y/%m/%d")

  df1 <- df %>%
    mutate({{dateCols}} := if_else({{dateCols}} == 0,
                                   0,
                                   if_else(as.Date(insDate) < as.Date(SMOKEFREE_DATE,"%Y-%m-%d"),
                                           1,
                                           2)))
 return(df1)
} 
newInsValCols_fn1(apcd_hud_ex,dateCols)

给出错误：

 Error: The LHS of `:=` must be a string or a symbol

所以我尝试使用符号：

# Second attempt: same recoding as the first attempt, but the column names are
# first passed through syms() and the result is unquoted with `!!` on the
# left-hand side of `:=`.
#   df       - data frame containing a SMOKEFREE_DATE column (referenced below)
#   dateCols - character vector of column names; characters 2-5, 7-8 and 10-11
#              of each name are read as year, month and day
# NOTE: this attempt does not run either; the call at the bottom stops with
# the error quoted right after this block.
# NOTE(review): presumably `!!dateCols_syms` still injects several names where
# `:=` accepts only one string or symbol - confirm.
newInsValCols_fn2 <- function(df,dateCols){
  dateCols_syms = syms(dateCols)
  # One parsed date per column name.
  insDate = as.Date(paste0(str_sub(dateCols,2,5),"/",str_sub(dateCols,7,8),"/",str_sub(dateCols,10,11)),"%Y/%m/%d")
  df1 <- df %>%
    mutate(!!dateCols_syms := if_else({{dateCols}} == 0,
                                      0,
                                      if_else(as.Date(insDate) < as.Date(SMOKEFREE_DATE,"%Y-%m-%d"),
                                              1,
                                              2)))
  return(df1)
} 
newInsValCols_fn2(apcd_hud_ex,dateCols)

给出同样的错误：

Error: The LHS of `:=` must be a string or a symbol

我也试过用!!!而不是 !!，但这导致了以下错误：

 Error: The LHS of `:=` can't be spliced with `!!!`

我的理解有些欠缺。

Answer 1

这是我使用 dplyr 的方法。

library(dplyr)
library(lubridate)

# Recode every column whose name starts with "x" in one mutate() call.
# cur_column() supplies the name of the column being processed; with the "x"
# removed it is parsed by ymd() and compared with SMOKEFREE_DATE.
apcd_hud_ex %>%
  mutate(
    across(
      .cols = starts_with("x"),
      .fns = function(ins_value) {
        ins_date <- ymd(gsub("x", "", cur_column()))
        # The first matching condition decides the new value.
        case_when(
          ins_value == 0 ~ 0,
          ins_date < SMOKEFREE_DATE ~ 1,
          TRUE ~ 2
        )
      }
    )
  )

#> # A tibble: 5 x 13
#>   studyid SMOKEFREE_DATE x2014_03_15 x2014_04_15 x2014_05_15 x2014_06_15 x2014_07_15 x2014_08_15 x2014_09_15 x2014_10_15 x2014_11_15 x2014_12_15 x2015_01_15
#>     <int> <date>               <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
#> 1       1 2014-08-01               1           1           1           1           1           2           2           2           2           2           2
#> 2       2 2014-08-01               1           1           1           1           1           2           2           2           2           2           2
#> 3       3 2014-01-01               2           2           2           2           2           2           2           2           2           2           2
#> 4       4 2014-01-01               0           2           2           2           2           2           2           2           2           2           2
#> 5       5 2015-08-01               1           1           1           1           1           1           1           1           1           1           1

Answer 2

您可以先用 pivot_longer 将数据转换为长格式，这样只需修改一列（value），然后再用 pivot_wider 转换回宽格式；这是 mutate(across()) 的替代方法。

你可以使用 case_when 处理多个条件，这样就不需要嵌套多个 if 语句。结果取第一个为 TRUE 的条件所对应的值。

library(tidyverse)

# Example data: five rows with a studyid, a SMOKEFREE_DATE and one numeric
# flag column per x<YYYY>_<MM>_<DD> name.
apcd_hud_ex <- structure(
  list(
    studyid = 1:5,
    SMOKEFREE_DATE = as.Date(c(
      "2014-08-01", "2014-08-01", "2014-01-01", "2014-01-01", "2015-08-01"
    )),
    x2014_03_15 = c(1, 1, 1, 0, 1),
    x2014_04_15 = rep(1, 5),
    x2014_05_15 = rep(1, 5),
    x2014_06_15 = rep(1, 5),
    x2014_07_15 = rep(1, 5),
    x2014_08_15 = rep(1, 5),
    x2014_09_15 = rep(1, 5),
    x2014_10_15 = rep(1, 5),
    x2014_11_15 = rep(1, 5),
    x2014_12_15 = rep(1, 5),
    x2015_01_15 = rep(1, 5)
  ),
  row.names = c(NA, -5L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Reshape the x* columns to long form (columns `name` and `value`), recode
# `value` once, then reshape back to one column per date.
apcd_hud_ex %>%
  pivot_longer(cols = starts_with("x")) %>%
  mutate(
    # "x2014_03_15" -> "2014-03-15" -> Date
    insDate = as.Date(str_replace_all(str_remove(name, "^x"), "_", "-")),
    # The first matching condition decides the new value.
    value = case_when(
      value == 0 ~ 0,
      insDate < SMOKEFREE_DATE ~ 1,
      insDate >= SMOKEFREE_DATE ~ 2
    )
  ) %>%
  # Drop the helper column before widening again.
  select(-insDate) %>%
  pivot_wider(names_from = name, values_from = value)
#> # A tibble: 5 × 13
#>   studyid SMOKEFREE_DATE x2014_03_15 x2014_04_15 x2014_05_15 x2014_06_15
#>     <int> <date>               <dbl>       <dbl>       <dbl>       <dbl>
#> 1       1 2014-08-01               1           1           1           1
#> 2       2 2014-08-01               1           1           1           1
#> 3       3 2014-01-01               2           2           2           2
#> 4       4 2014-01-01               0           2           2           2
#> 5       5 2015-08-01               1           1           1           1
#> # … with 7 more variables: x2014_07_15 <dbl>, x2014_08_15 <dbl>,
#> #   x2014_09_15 <dbl>, x2014_10_15 <dbl>, x2014_11_15 <dbl>, x2014_12_15 <dbl>,
#> #   x2015_01_15 <dbl>

由 reprex 包 (v2.0.0) 于 2022-05-05 创建

使用 dplyr 框架改变多列

Mutate multiple columns using the dplyr framework

r

dplyr