R tidyeval 将包含多个字符向量的列表传递给 dplyr 函数

R tidyeval passing a list containing multiple character vectors to dplyr functions

在这个问题中,我想在多个数据输入上映射一个函数,以创建显示项目响应频率的输出。

首先,我使用 psych 包创建了两个数据集:

suppressMessages(library(here))
suppressMessages(library(tidyverse))
suppressMessages(library(psych))

set.seed(123)

data_input_sim <-
  as_tibble(sim.poly.ideal(nvar = 50, n = 1000, cat = 4, )[["items"]]) %>%
  mutate_all(
    ~ case_when(
      .x == 0 ~ "never",
      .x == 1 ~ "occasionally",
      .x == 2 ~ "frequently",
      .x == 3 ~ "always"
    )
  ) %>%
  rename_all( ~ str_c("i", str_pad(
    as.character(1:50), 2, side = "left", pad = "0"
  ))) %>%
  mutate(
    ID = 100001:101000,
    age = sample(c(5:12), 1000, replace = TRUE),
    age_range = case_when(
      age <=8 ~ "5 to 8 yo",
      T ~ "9 to 12 yo"
    ),
    gender = sample(
      c("female", "male"),
      1000,
      replace = TRUE,
      prob = c(0.53, 0.47)
    ),
    educ = sample(
      c("no_HS", "HS_grad", "some_college", "BA_plus"),
      1000,
      replace = TRUE,
      prob = c(0.119, 0.263, 0.306, 0.311)
    ),
    ethnic = sample(
      c("hispanic", "asian", "black", "white", "other"),
      1000,
      replace = TRUE,
      prob = c(0.239, 0.048, 0.136, 0.521, .056)
    ),
    region = sample(
      c("northeast", "south", "midwest", "west"),
      1000,
      replace = TRUE,
      prob = c(0.166, 0.383, 0.212, 0.238)
    ),
    clin_status = sample(
      c("typ", "clin"),
      1000,
      replace = TRUE,
      prob = c(0.8, 0.2)
    )
  ) %>%
  select(ID:clin_status, i01:i50)

data_input_bfi <- bfi %>%
  drop_na() %>%
  sample_n(1000) %>%
  mutate(
    ID = 200001:201000,
    age_range = case_when(
      age <= 18 ~ "18 yo or younger",
      between(age, 19, 24) ~ "19 to 24 yo",
      between(age, 25, 39) ~ "25 to 39 yo",
      T ~ "40 yo or older"
    ),
    gender = case_when(gender == 1 ~ "male",
                       gender == 2 ~ "female"),
    educ = case_when(
      education == 1 ~ "no_HS",
      education == 2 ~ "HS_grad",
      education == 3 ~ "some_college",
      T ~ "BA_plus"
    ),
    ethnic = sample(
      c("hispanic", "asian", "black", "white", "other"),
      1000,
      replace = TRUE,
      prob = c(0.239, 0.048, 0.136, 0.521, .056)
    ),
    region = sample(
      c("northeast", "south", "midwest", "west"),
      1000,
      replace = TRUE,
      prob = c(0.166, 0.383, 0.212, 0.238)
    ),
    clin_status = sample(
      c("typ", "clin"),
      1000,
      replace = TRUE,
      prob = c(0.8, 0.2)
    )
  ) %>%
  mutate_at(
    vars(A1:O5),
    ~
      case_when(
        .x == 1 ~ "very_inaccurate",
        .x == 2 ~ "moderately_inaccurate",
        .x == 3 ~ "slightly_inaccurate",
        .x == 4 ~ "slightly_accurate",
        .x == 5 ~ "moderately_accurate",
        .x == 6 ~ "very_accurate",
      )
  ) %>% 
  select(ID, age:clin_status, A1:O5)

然后我提取每个数据集特有的元素并对其进行排序:其名称的后缀、项目列的名称以及项目类别的名称:

data_name_suffix <- c("sim", "bfi")

sim_item_cols <- str_c("i", str_pad(as.character(1:50), 2, side = "left", pad = "0"))
bfi_item_cols <- cross(list(c("A", "C", "E", "N", "O"), seq(1:5))) %>%
  map_chr(str_c, collapse = "") %>% 
  sort()

sim_item_cats <- c("never", "occasionally","frequently", "always")
bfi_item_cats <- c("very_inaccurate", "moderately_inaccurate", "slightly_inaccurate",
                  "slightly_accurate", "moderately_accurate", "very_accurate")

data_name_suffix 是二元字符向量;然后我创建了两个元素的列表(使用 quos())来保存项目列和类别名称:

item_cols <- quos(sim_item_cols, bfi_item_cols)
item_cats <- quos(sim_item_cats, bfi_item_cats)

现在我尝试将输出创建函数映射到三个输入上,使用 purrr::pmap():

pmap_df(
  list(data_name_suffix,
       item_cols,
       item_cats),
  ~
    eval(as.name(str_c("data_input_", data_name_suffix))) %>%
    select(!!!item_cols) %>%
    gather(var, value) %>%
    group_by(var, value) %>%
    count(var, value) %>%
    ungroup() %>%
    spread(value, n) %>%
    arrange(match(var, !!!item_cols)) %>%
    select(var, !!!item_cats) %>% 
    assign(str_c("freq_item_val_", data_name_suffix), ., envir = .GlobalEnv)
)

它 returns 这个错误:

Error: Unknown columns `A1`, `A2`, `A3`, `A4`, `A5` and ... 

这对我来说表明 R 将列表 item_cols 视为单个长字符向量,而不是要迭代的两个单独的字符向量。

这里我们达到了我对 tidyeval 技术的理解和经验的极限。我怀疑我在 quos()!!!.

上做错了什么

在此先感谢您的帮助,我希望阅读本文的人在这个超现实的时刻安全健康。

在这里,我们可以使用mget来获取对象的值

library(stringr)
library(purrr)
library(dplyr)
library(tidyr)
list(mget(str_c('data_input_', data_name_suffix)),
     item_cols,
   item_cats) %>%
     pmap(~ ..1 %>% 
            select(!!! ..2) %>% 
            pivot_longer(everything(), names_to = 'var', values_to = 'value') %>% 
            count(var, value) %>% 
            pivot_wider(names_from = value, values_from = n) %>% 
            arrange(match(var, !!!..2)) %>%
            select(var, !!! ..3) )
#$data_input_sim
# A tibble: 50 x 5
#   var   never occasionally frequently always
#   <chr> <int>        <int>      <int>  <int>
# 1 i01     465          366        141     28
# 2 i02     489          336        147     28
# 3 i03     457          367        146     30
# 4 i04     433          385        162     20
# 5 i05     418          362        171     49
# 6 i06     420          369        169     42
# 7 i07     405          367        182     46
# 8 i08     361          401        194     44
# 9 i09     346          391        211     52
#10 i10     334          425        203     38
# … with 40 more rows

#$data_input_bfi
# A tibble: 25 x 7
#   var   very_inaccurate moderately_inaccurate slightly_inaccurate slightly_accurate moderately_accurate very_accurate
#   <chr>           <int>                 <int>               <int>             <int>               #<int>         <int>
# 1 A1                334                   278                 151               130                  75            32
# 2 A2                 18                    49                  48               197                 365           323
# 3 A3                 32                    51                  72               210                 353           282
# 4 A4                 48                    69                  60               159                 243           421
# 5 A5                 26                    66                  89               207                 340           272
# 6 C1                 17                    48                  82               213                 383           257
# 7 C2                 26                    85                  98               212                 361           218
# 8 C3                 35                    80                 102               272                 322           189
# 9 C4                296                   270                 166               163                  83            22
#10 C5                197                   212                 118               207                 167            99
# … with 15 more rows

注意:assign不建议创建多个对象。而是将输出保留在 list 中,并通过使用 map

循环遍历每个 list 元素(如果需要)进行更改