R tidyeval 将包含多个字符向量的列表传递给 dplyr 函数
R tidyeval passing a list containing multiple character vectors to dplyr functions
在这个问题中,我想在多个数据输入上映射一个函数,以创建显示项目响应频率的输出。
首先,我使用 psych
包创建了两个数据集:
suppressMessages(library(here))
suppressMessages(library(tidyverse))
suppressMessages(library(psych))
set.seed(123)
data_input_sim <-
as_tibble(sim.poly.ideal(nvar = 50, n = 1000, cat = 4, )[["items"]]) %>%
mutate_all(
~ case_when(
.x == 0 ~ "never",
.x == 1 ~ "occasionally",
.x == 2 ~ "frequently",
.x == 3 ~ "always"
)
) %>%
rename_all( ~ str_c("i", str_pad(
as.character(1:50), 2, side = "left", pad = "0"
))) %>%
mutate(
ID = 100001:101000,
age = sample(c(5:12), 1000, replace = TRUE),
age_range = case_when(
age <=8 ~ "5 to 8 yo",
T ~ "9 to 12 yo"
),
gender = sample(
c("female", "male"),
1000,
replace = TRUE,
prob = c(0.53, 0.47)
),
educ = sample(
c("no_HS", "HS_grad", "some_college", "BA_plus"),
1000,
replace = TRUE,
prob = c(0.119, 0.263, 0.306, 0.311)
),
ethnic = sample(
c("hispanic", "asian", "black", "white", "other"),
1000,
replace = TRUE,
prob = c(0.239, 0.048, 0.136, 0.521, .056)
),
region = sample(
c("northeast", "south", "midwest", "west"),
1000,
replace = TRUE,
prob = c(0.166, 0.383, 0.212, 0.238)
),
clin_status = sample(
c("typ", "clin"),
1000,
replace = TRUE,
prob = c(0.8, 0.2)
)
) %>%
select(ID:clin_status, i01:i50)
data_input_bfi <- bfi %>%
drop_na() %>%
sample_n(1000) %>%
mutate(
ID = 200001:201000,
age_range = case_when(
age <= 18 ~ "18 yo or younger",
between(age, 19, 24) ~ "19 to 24 yo",
between(age, 25, 39) ~ "25 to 39 yo",
T ~ "40 yo or older"
),
gender = case_when(gender == 1 ~ "male",
gender == 2 ~ "female"),
educ = case_when(
education == 1 ~ "no_HS",
education == 2 ~ "HS_grad",
education == 3 ~ "some_college",
T ~ "BA_plus"
),
ethnic = sample(
c("hispanic", "asian", "black", "white", "other"),
1000,
replace = TRUE,
prob = c(0.239, 0.048, 0.136, 0.521, .056)
),
region = sample(
c("northeast", "south", "midwest", "west"),
1000,
replace = TRUE,
prob = c(0.166, 0.383, 0.212, 0.238)
),
clin_status = sample(
c("typ", "clin"),
1000,
replace = TRUE,
prob = c(0.8, 0.2)
)
) %>%
mutate_at(
vars(A1:O5),
~
case_when(
.x == 1 ~ "very_inaccurate",
.x == 2 ~ "moderately_inaccurate",
.x == 3 ~ "slightly_inaccurate",
.x == 4 ~ "slightly_accurate",
.x == 5 ~ "moderately_accurate",
.x == 6 ~ "very_accurate",
)
) %>%
select(ID, age:clin_status, A1:O5)
然后我提取每个数据集特有的元素并对其进行排序:其名称的后缀、项目列的名称以及项目类别的名称:
data_name_suffix <- c("sim", "bfi")
sim_item_cols <- str_c("i", str_pad(as.character(1:50), 2, side = "left", pad = "0"))
bfi_item_cols <- cross(list(c("A", "C", "E", "N", "O"), seq(1:5))) %>%
map_chr(str_c, collapse = "") %>%
sort()
sim_item_cats <- c("never", "occasionally","frequently", "always")
bfi_item_cats <- c("very_inaccurate", "moderately_inaccurate", "slightly_inaccurate",
"slightly_accurate", "moderately_accurate", "very_accurate")
data_name_suffix
是二元字符向量;然后我创建了两个元素的列表(使用 quos()
)来保存项目列和类别名称:
item_cols <- quos(sim_item_cols, bfi_item_cols)
item_cats <- quos(sim_item_cats, bfi_item_cats)
现在我尝试将输出创建函数映射到三个输入上,使用 purrr::pmap()
:
pmap_df(
list(data_name_suffix,
item_cols,
item_cats),
~
eval(as.name(str_c("data_input_", data_name_suffix))) %>%
select(!!!item_cols) %>%
gather(var, value) %>%
group_by(var, value) %>%
count(var, value) %>%
ungroup() %>%
spread(value, n) %>%
arrange(match(var, !!!item_cols)) %>%
select(var, !!!item_cats) %>%
assign(str_c("freq_item_val_", data_name_suffix), ., envir = .GlobalEnv)
)
它 returns 这个错误:
Error: Unknown columns `A1`, `A2`, `A3`, `A4`, `A5` and ...
这对我来说表明 R 将列表 item_cols
视为单个长字符向量,而不是要迭代的两个单独的字符向量。
这里我们达到了我对 tidyeval
技术的理解和经验的极限。我怀疑我在 quos()
和 !!!
.
上做错了什么
在此先感谢您的帮助,我希望阅读本文的人在这个超现实的时刻安全健康。
在这里,我们可以使用mget
来获取对象的值
library(stringr)
library(purrr)
library(dplyr)
library(tidyr)
list(mget(str_c('data_input_', data_name_suffix)),
item_cols,
item_cats) %>%
pmap(~ ..1 %>%
select(!!! ..2) %>%
pivot_longer(everything(), names_to = 'var', values_to = 'value') %>%
count(var, value) %>%
pivot_wider(names_from = value, values_from = n) %>%
arrange(match(var, !!!..2)) %>%
select(var, !!! ..3) )
#$data_input_sim
# A tibble: 50 x 5
# var never occasionally frequently always
# <chr> <int> <int> <int> <int>
# 1 i01 465 366 141 28
# 2 i02 489 336 147 28
# 3 i03 457 367 146 30
# 4 i04 433 385 162 20
# 5 i05 418 362 171 49
# 6 i06 420 369 169 42
# 7 i07 405 367 182 46
# 8 i08 361 401 194 44
# 9 i09 346 391 211 52
#10 i10 334 425 203 38
# … with 40 more rows
#$data_input_bfi
# A tibble: 25 x 7
# var very_inaccurate moderately_inaccurate slightly_inaccurate slightly_accurate moderately_accurate very_accurate
# <chr> <int> <int> <int> <int> #<int> <int>
# 1 A1 334 278 151 130 75 32
# 2 A2 18 49 48 197 365 323
# 3 A3 32 51 72 210 353 282
# 4 A4 48 69 60 159 243 421
# 5 A5 26 66 89 207 340 272
# 6 C1 17 48 82 213 383 257
# 7 C2 26 85 98 212 361 218
# 8 C3 35 80 102 272 322 189
# 9 C4 296 270 166 163 83 22
#10 C5 197 212 118 207 167 99
# … with 15 more rows
注意:assign
不建议创建多个对象。而是将输出保留在 list
中,并通过使用 map
循环遍历每个 list
元素(如果需要)进行更改
在这个问题中,我想在多个数据输入上映射一个函数,以创建显示项目响应频率的输出。
首先,我使用 psych
包创建了两个数据集:
suppressMessages(library(here))
suppressMessages(library(tidyverse))
suppressMessages(library(psych))
set.seed(123)
data_input_sim <-
as_tibble(sim.poly.ideal(nvar = 50, n = 1000, cat = 4, )[["items"]]) %>%
mutate_all(
~ case_when(
.x == 0 ~ "never",
.x == 1 ~ "occasionally",
.x == 2 ~ "frequently",
.x == 3 ~ "always"
)
) %>%
rename_all( ~ str_c("i", str_pad(
as.character(1:50), 2, side = "left", pad = "0"
))) %>%
mutate(
ID = 100001:101000,
age = sample(c(5:12), 1000, replace = TRUE),
age_range = case_when(
age <=8 ~ "5 to 8 yo",
T ~ "9 to 12 yo"
),
gender = sample(
c("female", "male"),
1000,
replace = TRUE,
prob = c(0.53, 0.47)
),
educ = sample(
c("no_HS", "HS_grad", "some_college", "BA_plus"),
1000,
replace = TRUE,
prob = c(0.119, 0.263, 0.306, 0.311)
),
ethnic = sample(
c("hispanic", "asian", "black", "white", "other"),
1000,
replace = TRUE,
prob = c(0.239, 0.048, 0.136, 0.521, .056)
),
region = sample(
c("northeast", "south", "midwest", "west"),
1000,
replace = TRUE,
prob = c(0.166, 0.383, 0.212, 0.238)
),
clin_status = sample(
c("typ", "clin"),
1000,
replace = TRUE,
prob = c(0.8, 0.2)
)
) %>%
select(ID:clin_status, i01:i50)
data_input_bfi <- bfi %>%
drop_na() %>%
sample_n(1000) %>%
mutate(
ID = 200001:201000,
age_range = case_when(
age <= 18 ~ "18 yo or younger",
between(age, 19, 24) ~ "19 to 24 yo",
between(age, 25, 39) ~ "25 to 39 yo",
T ~ "40 yo or older"
),
gender = case_when(gender == 1 ~ "male",
gender == 2 ~ "female"),
educ = case_when(
education == 1 ~ "no_HS",
education == 2 ~ "HS_grad",
education == 3 ~ "some_college",
T ~ "BA_plus"
),
ethnic = sample(
c("hispanic", "asian", "black", "white", "other"),
1000,
replace = TRUE,
prob = c(0.239, 0.048, 0.136, 0.521, .056)
),
region = sample(
c("northeast", "south", "midwest", "west"),
1000,
replace = TRUE,
prob = c(0.166, 0.383, 0.212, 0.238)
),
clin_status = sample(
c("typ", "clin"),
1000,
replace = TRUE,
prob = c(0.8, 0.2)
)
) %>%
mutate_at(
vars(A1:O5),
~
case_when(
.x == 1 ~ "very_inaccurate",
.x == 2 ~ "moderately_inaccurate",
.x == 3 ~ "slightly_inaccurate",
.x == 4 ~ "slightly_accurate",
.x == 5 ~ "moderately_accurate",
.x == 6 ~ "very_accurate",
)
) %>%
select(ID, age:clin_status, A1:O5)
然后我提取每个数据集特有的元素并对其进行排序:其名称的后缀、项目列的名称以及项目类别的名称:
data_name_suffix <- c("sim", "bfi")
sim_item_cols <- str_c("i", str_pad(as.character(1:50), 2, side = "left", pad = "0"))
bfi_item_cols <- cross(list(c("A", "C", "E", "N", "O"), seq(1:5))) %>%
map_chr(str_c, collapse = "") %>%
sort()
sim_item_cats <- c("never", "occasionally","frequently", "always")
bfi_item_cats <- c("very_inaccurate", "moderately_inaccurate", "slightly_inaccurate",
"slightly_accurate", "moderately_accurate", "very_accurate")
data_name_suffix
是二元字符向量;然后我创建了两个元素的列表(使用 quos()
)来保存项目列和类别名称:
item_cols <- quos(sim_item_cols, bfi_item_cols)
item_cats <- quos(sim_item_cats, bfi_item_cats)
现在我尝试将输出创建函数映射到三个输入上,使用 purrr::pmap()
:
pmap_df(
list(data_name_suffix,
item_cols,
item_cats),
~
eval(as.name(str_c("data_input_", data_name_suffix))) %>%
select(!!!item_cols) %>%
gather(var, value) %>%
group_by(var, value) %>%
count(var, value) %>%
ungroup() %>%
spread(value, n) %>%
arrange(match(var, !!!item_cols)) %>%
select(var, !!!item_cats) %>%
assign(str_c("freq_item_val_", data_name_suffix), ., envir = .GlobalEnv)
)
它 returns 这个错误:
Error: Unknown columns `A1`, `A2`, `A3`, `A4`, `A5` and ...
这对我来说表明 R 将列表 item_cols
视为单个长字符向量,而不是要迭代的两个单独的字符向量。
这里我们达到了我对 tidyeval
技术的理解和经验的极限。我怀疑我在 quos()
和 !!!
.
在此先感谢您的帮助,我希望阅读本文的人在这个超现实的时刻安全健康。
在这里,我们可以使用mget
来获取对象的值
library(stringr)
library(purrr)
library(dplyr)
library(tidyr)
list(mget(str_c('data_input_', data_name_suffix)),
item_cols,
item_cats) %>%
pmap(~ ..1 %>%
select(!!! ..2) %>%
pivot_longer(everything(), names_to = 'var', values_to = 'value') %>%
count(var, value) %>%
pivot_wider(names_from = value, values_from = n) %>%
arrange(match(var, !!!..2)) %>%
select(var, !!! ..3) )
#$data_input_sim
# A tibble: 50 x 5
# var never occasionally frequently always
# <chr> <int> <int> <int> <int>
# 1 i01 465 366 141 28
# 2 i02 489 336 147 28
# 3 i03 457 367 146 30
# 4 i04 433 385 162 20
# 5 i05 418 362 171 49
# 6 i06 420 369 169 42
# 7 i07 405 367 182 46
# 8 i08 361 401 194 44
# 9 i09 346 391 211 52
#10 i10 334 425 203 38
# … with 40 more rows
#$data_input_bfi
# A tibble: 25 x 7
# var very_inaccurate moderately_inaccurate slightly_inaccurate slightly_accurate moderately_accurate very_accurate
# <chr> <int> <int> <int> <int> #<int> <int>
# 1 A1 334 278 151 130 75 32
# 2 A2 18 49 48 197 365 323
# 3 A3 32 51 72 210 353 282
# 4 A4 48 69 60 159 243 421
# 5 A5 26 66 89 207 340 272
# 6 C1 17 48 82 213 383 257
# 7 C2 26 85 98 212 361 218
# 8 C3 35 80 102 272 322 189
# 9 C4 296 270 166 163 83 22
#10 C5 197 212 118 207 167 99
# … with 15 more rows
注意:assign
不建议创建多个对象。而是将输出保留在 list
中,并通过使用 map
list
元素(如果需要)进行更改