rlang:从...获取名称,在 NSE 函数中使用冒号快捷方式
rlang: Get names from ... with colon shortcut in NSE function
我正在编写一个函数包来制作人口统计数据表。我有一个函数,缩写如下,我需要在其中包含多个列 (...
),在这些列上我将 gather
一个数据框。诀窍是我想按顺序排列这些列的名称,因为我需要在收集后按该顺序放置一个列。在这种情况下,这些列是 estimate
、moe
、share
、sharemoe
.
library(tidyverse)
library(rlang)
race <- structure(list(region = c("New Haven", "New Haven", "New Haven", "New Haven", "Outer Ring", "Outer Ring", "Outer Ring", "Outer Ring"),
variable = c("white", "black", "asian", "latino", "white", "black", "asian", "latino"),
estimate = c(40164, 42970, 6042, 37231, 164150, 3471, 9565, 8518),
moe = c(1395, 1383, 697, 1688, 1603, 677, 896, 1052),
share = c(0.308, 0.33, 0.046, 0.286, 0.87, 0.018, 0.051, 0.045),
sharemoe = c(0.011, 0.011, 0.005, 0.013, 0.008, 0.004, 0.005, 0.006)),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -8L))
race
#> # A tibble: 8 x 6
#> region variable estimate moe share sharemoe
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 New Haven white 40164 1395 0.308 0.011
#> 2 New Haven black 42970 1383 0.33 0.011
#> 3 New Haven asian 6042 697 0.046 0.005
#> 4 New Haven latino 37231 1688 0.286 0.013
#> 5 Outer Ring white 164150 1603 0.87 0.008
#> 6 Outer Ring black 3471 677 0.018 0.004
#> 7 Outer Ring asian 9565 896 0.051 0.005
#> 8 Outer Ring latino 8518 1052 0.045 0.006
在函数 gather_arrange
中,我通过映射 rlang::exprs(...)
并转换为字符来获取 ...
列的名称。很难将这些列的名称提取为字符串,因此这可能是一个需要改进或重写的地方。但这按我想要的方式工作,使列 type
成为具有级别 estimate
、moe
、share
、sharemoe
的因子。[=29] =]
gather_arrange <- function(df, ..., group = variable) {
gather_cols <- rlang::quos(...)
grp_var <- rlang::enquo(group)
gather_names <- purrr::map_chr(rlang::exprs(...), as.character)
df %>%
tidyr::gather(key = type, value = value, !!!gather_cols) %>%
dplyr::mutate(!!rlang::quo_name(grp_var) := !!grp_var %>%
forcats::fct_inorder() %>% forcats::fct_rev()) %>%
dplyr::mutate(type = as.factor(type) %>% forcats::fct_relevel(gather_names)) %>%
arrange(type)
}
race %>% gather_arrange(estimate, moe, share, sharemoe)
#> # A tibble: 32 x 4
#> region variable type value
#> <chr> <fct> <fct> <dbl>
#> 1 New Haven white estimate 40164
#> 2 New Haven black estimate 42970
#> 3 New Haven asian estimate 6042
#> 4 New Haven latino estimate 37231
#> 5 Outer Ring white estimate 164150
#> 6 Outer Ring black estimate 3471
#> 7 Outer Ring asian estimate 9565
#> 8 Outer Ring latino estimate 8518
#> 9 New Haven white moe 1395
#> 10 New Haven black moe 1383
#> # ... with 22 more rows
但我希望还可以使用冒号符号来选择列,即 estimate:sharemoe
相当于输入所有这些列名称。
race %>% gather_arrange(estimate:sharemoe)
#> Error: Result 1 is not a length 1 atomic vector
这失败了,因为它无法从 rlang::exprs(...)
中提取列名。如何使用此表示法获取列名?提前致谢!
fun <- function(df, ...){
as.character(substitute(list(...)))[-1] %>%
lapply(function(x)
if(!grepl(':', x)) x
else strsplit(x, ':')[[1]] %>%
lapply(match, names(df)) %>%
{names(df)[do.call(seq, .)]})%>%
unlist
}
names(race)
# [1] "region" "variable" "estimate" "moe" "share" "sharemoe"
fun(race, estimate:sharemoe, region)
# [1] "estimate" "moe" "share" "sharemoe" "region"
fun(race, estimate, moe, share, sharemoe, region)
# [1] "estimate" "moe" "share" "sharemoe" "region"
fun(race, moe, region:variable)
# [1] "moe" "region" "variable"
这涉及将 :
符号表达式和其他列名作为参数,例如fun(race, estimate:sharemoe, region)
.
有趣的是,这个 hacky 解决方案似乎比 tidyselect
更快(并不是变量选择可能是整体速度的痛点)
fun <- function(y, ...){
as.character(substitute(list(...)))[-1] %>%
lapply(function(x)
if(!grepl(':', x)) x
else strsplit(x, ':')[[1]] %>%
lapply(match, y) %>%
{y[do.call(seq, .)]})%>%
unlist
}
library(microbenchmark)
microbenchmark(
tidy = tidyselect::vars_select(letters, b, g:j, a),
fun = fun(letters, b, g:j, a),
unit = 'relative')
# Unit: relative
# expr min lq mean median uq max neval
# tidy 19.90837 18.10964 15.32737 14.28823 13.86212 14.44013 100
# fun 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 100
原函数
gather_arrange <- function(df, ..., group = variable) {
gather_cols <- rlang::quos(...)
grp_var <- rlang::enquo(group)
gather_names <- purrr::map_chr(rlang::exprs(...), as.character)
df %>%
tidyr::gather(key = type, value = value, !!!gather_cols) %>%
dplyr::mutate(!!rlang::quo_name(grp_var) := !!grp_var %>%
forcats::fct_inorder() %>% forcats::fct_rev()) %>%
dplyr::mutate(type = as.factor(type) %>% forcats::fct_relevel(gather_names)) %>%
arrange(type)
}
使用上面定义的函数fun
:
my_gather_arrange <- function(df, ..., group = variable) {
gather_cols <- gather_names <-
as.character(substitute(list(...)))[-1] %>%
lapply(function(x){
if(grepl(':', x)){
strsplit(x, ':')[[1]] %>%
lapply(match, names(df)) %>%
{names(df)[do.call(seq, .)]}}
else x}) %>%
unlist
grp_var <- rlang::enquo(group)
df %>%
tidyr::gather(key = type, value = value, !!!gather_cols) %>%
dplyr::mutate(!!rlang::quo_name(grp_var) := !!grp_var %>%
forcats::fct_inorder() %>% forcats::fct_rev()) %>%
dplyr::mutate(type = as.factor(type) %>% forcats::fct_relevel(gather_names)) %>%
arrange(type)
}
out1 <- gather_arrange(race, estimate, moe, share, sharemoe, region)
out2 <- my_gather_arrange(race, estimate:sharemoe, region)
#
identical(out1, out2)
# [1] TRUE
我们可以为 :
的情况创建一个 if
条件,从 select
中获取列名 ('gather_names') 以用于 [=18] =]
gather_arrange <- function(df, group = variable, ...) {
gather_cols <- quos(...)
grp_var <- enquo(group)
if(length(gather_cols)==1 && grepl(":", quo_name(gather_cols[[1]]))) {
gather_cols <- parse_expr(quo_name(gather_cols[[1]]))
}
gather_names <- df %>%
select(!!! gather_cols) %>%
names
df %>%
gather(key = type, value = value, !!!gather_cols) %>%
mutate(!!rlang::quo_name(grp_var) := !!grp_var %>%
fct_inorder() %>%
fct_rev()) %>%
mutate(type = as.factor(type) %>%
fct_relevel(gather_names)) %>%
arrange(type)
}
-检查
out1 <- gather_arrange(df = race, group = variable,
estimate, moe, share, sharemoe)
out1
# A tibble: 32 x 4
# region variable type value
# <chr> <fct> <fct> <dbl>
# 1 New Haven white estimate 40164
# 2 New Haven black estimate 42970
# 3 New Haven asian estimate 6042
# 4 New Haven latino estimate 37231
# 5 Outer Ring white estimate 164150
# 6 Outer Ring black estimate 3471
# 7 Outer Ring asian estimate 9565
# 8 Outer Ring latino estimate 8518
# 9 New Haven white moe 1395
#10 New Haven black moe 1383
# ... with 22 more rows
out2 <- gather_arrange(df = race, group = variable, estimate:sharemoe)
identical(out1, out2)
#[1] TRUE
更新
如果我们在 ...
中传递多组列
gather_arrange2 <- function(df, group = variable, ...) {
gather_cols <- quos(...)
grp_var <- enquo(group)
gather_names <- df %>%
select(!!! gather_cols) %>%
names
gather_colsN <- lapply(gather_cols, function(x) parse_expr(quo_name(x)))
df %>%
gather(key = type, value = value, !!!gather_colsN) %>%
mutate(!!rlang::quo_name(grp_var) := !!grp_var %>%
fct_inorder() %>%
fct_rev()) %>%
mutate(type = as.factor(type) %>%
fct_relevel(gather_names)) %>%
arrange(type)
}
-检查
out1 <- gather_arrange2(df = race, group = variable,
estimate, moe, share, sharemoe, region)
out2 <- gather_arrange2(df = race, group = variable, estimate:sharemoe, region)
identical(out1, out2)
#[1] TRUE
或者只检查一组列
out1 <- gather_arrange2(df = race, group = variable,
estimate, moe, share, sharemoe)
out2 <- gather_arrange2(df = race, group = variable, estimate:sharemoe)
identical(out1, out2)
#[1] TRUE
我认为您正在寻找的函数是 tidyselect::vars_select()
,它被 select 内部使用并重命名以完成此任务。它 returns 变量名称的字符向量。例如:
> tidyselect::vars_select(letters, g:j)
g h i j
"g" "h" "i" "j"
这允许您使用对 dplyr::select
有效的所有相同语法。
我正在编写一个函数包来制作人口统计数据表。我有一个函数,缩写如下,我需要在其中包含多个列 (...
),在这些列上我将 gather
一个数据框。诀窍是我想按顺序排列这些列的名称,因为我需要在收集后按该顺序放置一个列。在这种情况下,这些列是 estimate
、moe
、share
、sharemoe
.
library(tidyverse)
library(rlang)
race <- structure(list(region = c("New Haven", "New Haven", "New Haven", "New Haven", "Outer Ring", "Outer Ring", "Outer Ring", "Outer Ring"),
variable = c("white", "black", "asian", "latino", "white", "black", "asian", "latino"),
estimate = c(40164, 42970, 6042, 37231, 164150, 3471, 9565, 8518),
moe = c(1395, 1383, 697, 1688, 1603, 677, 896, 1052),
share = c(0.308, 0.33, 0.046, 0.286, 0.87, 0.018, 0.051, 0.045),
sharemoe = c(0.011, 0.011, 0.005, 0.013, 0.008, 0.004, 0.005, 0.006)),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -8L))
race
#> # A tibble: 8 x 6
#> region variable estimate moe share sharemoe
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 New Haven white 40164 1395 0.308 0.011
#> 2 New Haven black 42970 1383 0.33 0.011
#> 3 New Haven asian 6042 697 0.046 0.005
#> 4 New Haven latino 37231 1688 0.286 0.013
#> 5 Outer Ring white 164150 1603 0.87 0.008
#> 6 Outer Ring black 3471 677 0.018 0.004
#> 7 Outer Ring asian 9565 896 0.051 0.005
#> 8 Outer Ring latino 8518 1052 0.045 0.006
在函数 gather_arrange
中,我通过映射 rlang::exprs(...)
并转换为字符来获取 ...
列的名称。很难将这些列的名称提取为字符串,因此这可能是一个需要改进或重写的地方。但这按我想要的方式工作,使列 type
成为具有级别 estimate
、moe
、share
、sharemoe
的因子。[=29] =]
gather_arrange <- function(df, ..., group = variable) {
gather_cols <- rlang::quos(...)
grp_var <- rlang::enquo(group)
gather_names <- purrr::map_chr(rlang::exprs(...), as.character)
df %>%
tidyr::gather(key = type, value = value, !!!gather_cols) %>%
dplyr::mutate(!!rlang::quo_name(grp_var) := !!grp_var %>%
forcats::fct_inorder() %>% forcats::fct_rev()) %>%
dplyr::mutate(type = as.factor(type) %>% forcats::fct_relevel(gather_names)) %>%
arrange(type)
}
race %>% gather_arrange(estimate, moe, share, sharemoe)
#> # A tibble: 32 x 4
#> region variable type value
#> <chr> <fct> <fct> <dbl>
#> 1 New Haven white estimate 40164
#> 2 New Haven black estimate 42970
#> 3 New Haven asian estimate 6042
#> 4 New Haven latino estimate 37231
#> 5 Outer Ring white estimate 164150
#> 6 Outer Ring black estimate 3471
#> 7 Outer Ring asian estimate 9565
#> 8 Outer Ring latino estimate 8518
#> 9 New Haven white moe 1395
#> 10 New Haven black moe 1383
#> # ... with 22 more rows
但我希望还可以使用冒号符号来选择列,即 estimate:sharemoe
相当于输入所有这些列名称。
race %>% gather_arrange(estimate:sharemoe)
#> Error: Result 1 is not a length 1 atomic vector
这失败了,因为它无法从 rlang::exprs(...)
中提取列名。如何使用此表示法获取列名?提前致谢!
fun <- function(df, ...){
as.character(substitute(list(...)))[-1] %>%
lapply(function(x)
if(!grepl(':', x)) x
else strsplit(x, ':')[[1]] %>%
lapply(match, names(df)) %>%
{names(df)[do.call(seq, .)]})%>%
unlist
}
names(race)
# [1] "region" "variable" "estimate" "moe" "share" "sharemoe"
fun(race, estimate:sharemoe, region)
# [1] "estimate" "moe" "share" "sharemoe" "region"
fun(race, estimate, moe, share, sharemoe, region)
# [1] "estimate" "moe" "share" "sharemoe" "region"
fun(race, moe, region:variable)
# [1] "moe" "region" "variable"
这涉及将 :
符号表达式和其他列名作为参数,例如fun(race, estimate:sharemoe, region)
.
有趣的是,这个 hacky 解决方案似乎比 tidyselect
更快(并不是变量选择可能是整体速度的痛点)
fun <- function(y, ...){
as.character(substitute(list(...)))[-1] %>%
lapply(function(x)
if(!grepl(':', x)) x
else strsplit(x, ':')[[1]] %>%
lapply(match, y) %>%
{y[do.call(seq, .)]})%>%
unlist
}
library(microbenchmark)
microbenchmark(
tidy = tidyselect::vars_select(letters, b, g:j, a),
fun = fun(letters, b, g:j, a),
unit = 'relative')
# Unit: relative
# expr min lq mean median uq max neval
# tidy 19.90837 18.10964 15.32737 14.28823 13.86212 14.44013 100
# fun 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 100
原函数
gather_arrange <- function(df, ..., group = variable) {
gather_cols <- rlang::quos(...)
grp_var <- rlang::enquo(group)
gather_names <- purrr::map_chr(rlang::exprs(...), as.character)
df %>%
tidyr::gather(key = type, value = value, !!!gather_cols) %>%
dplyr::mutate(!!rlang::quo_name(grp_var) := !!grp_var %>%
forcats::fct_inorder() %>% forcats::fct_rev()) %>%
dplyr::mutate(type = as.factor(type) %>% forcats::fct_relevel(gather_names)) %>%
arrange(type)
}
使用上面定义的函数fun
:
my_gather_arrange <- function(df, ..., group = variable) {
gather_cols <- gather_names <-
as.character(substitute(list(...)))[-1] %>%
lapply(function(x){
if(grepl(':', x)){
strsplit(x, ':')[[1]] %>%
lapply(match, names(df)) %>%
{names(df)[do.call(seq, .)]}}
else x}) %>%
unlist
grp_var <- rlang::enquo(group)
df %>%
tidyr::gather(key = type, value = value, !!!gather_cols) %>%
dplyr::mutate(!!rlang::quo_name(grp_var) := !!grp_var %>%
forcats::fct_inorder() %>% forcats::fct_rev()) %>%
dplyr::mutate(type = as.factor(type) %>% forcats::fct_relevel(gather_names)) %>%
arrange(type)
}
out1 <- gather_arrange(race, estimate, moe, share, sharemoe, region)
out2 <- my_gather_arrange(race, estimate:sharemoe, region)
#
identical(out1, out2)
# [1] TRUE
我们可以为 :
的情况创建一个 if
条件,从 select
中获取列名 ('gather_names') 以用于 [=18] =]
gather_arrange <- function(df, group = variable, ...) {
gather_cols <- quos(...)
grp_var <- enquo(group)
if(length(gather_cols)==1 && grepl(":", quo_name(gather_cols[[1]]))) {
gather_cols <- parse_expr(quo_name(gather_cols[[1]]))
}
gather_names <- df %>%
select(!!! gather_cols) %>%
names
df %>%
gather(key = type, value = value, !!!gather_cols) %>%
mutate(!!rlang::quo_name(grp_var) := !!grp_var %>%
fct_inorder() %>%
fct_rev()) %>%
mutate(type = as.factor(type) %>%
fct_relevel(gather_names)) %>%
arrange(type)
}
-检查
out1 <- gather_arrange(df = race, group = variable,
estimate, moe, share, sharemoe)
out1
# A tibble: 32 x 4
# region variable type value
# <chr> <fct> <fct> <dbl>
# 1 New Haven white estimate 40164
# 2 New Haven black estimate 42970
# 3 New Haven asian estimate 6042
# 4 New Haven latino estimate 37231
# 5 Outer Ring white estimate 164150
# 6 Outer Ring black estimate 3471
# 7 Outer Ring asian estimate 9565
# 8 Outer Ring latino estimate 8518
# 9 New Haven white moe 1395
#10 New Haven black moe 1383
# ... with 22 more rows
out2 <- gather_arrange(df = race, group = variable, estimate:sharemoe)
identical(out1, out2)
#[1] TRUE
更新
如果我们在 ...
gather_arrange2 <- function(df, group = variable, ...) {
gather_cols <- quos(...)
grp_var <- enquo(group)
gather_names <- df %>%
select(!!! gather_cols) %>%
names
gather_colsN <- lapply(gather_cols, function(x) parse_expr(quo_name(x)))
df %>%
gather(key = type, value = value, !!!gather_colsN) %>%
mutate(!!rlang::quo_name(grp_var) := !!grp_var %>%
fct_inorder() %>%
fct_rev()) %>%
mutate(type = as.factor(type) %>%
fct_relevel(gather_names)) %>%
arrange(type)
}
-检查
out1 <- gather_arrange2(df = race, group = variable,
estimate, moe, share, sharemoe, region)
out2 <- gather_arrange2(df = race, group = variable, estimate:sharemoe, region)
identical(out1, out2)
#[1] TRUE
或者只检查一组列
out1 <- gather_arrange2(df = race, group = variable,
estimate, moe, share, sharemoe)
out2 <- gather_arrange2(df = race, group = variable, estimate:sharemoe)
identical(out1, out2)
#[1] TRUE
我认为您正在寻找的函数是 tidyselect::vars_select()
,它被 select 内部使用并重命名以完成此任务。它 returns 变量名称的字符向量。例如:
> tidyselect::vars_select(letters, g:j)
g h i j
"g" "h" "i" "j"
这允许您使用对 dplyr::select
有效的所有相同语法。