dplyr group_by 通过非标准评估的多个函数参数
dplyr group_by multiple function arguments via Non Standard Evaluation
我正在阅读 dplyr 的 vignette,试图弄清楚如何在自己的函数代码中使用 dplyr。其中讲到了如何对 ... 使用 enquos,以便将多个参数传递给 group_by。
它如何工作的一个简短示例
grp <- rlang::enquos(...)
df %>%
group_by(!!!grp)
我想知道是否有办法在不占用 ...、也不写一些有问题的代码的情况下,为同一个参数传入多个表达式。
要了解调用的外观,请使用以下示例:
# Reproducible data: reshape USJudgeRatings into long format
# (one row per judge / rating-column pair).
df <- datasets::USJudgeRatings
# The judge names are stored as row names; copy them into a regular column.
df$name <- rownames(df)
df <- tidyr::gather(df, key = "key", value = "value", -name)
# The two-element vector is recycled down the rows, giving an alternating
# "1"/"2" label to group by.
df$dummy <- c("1","2")
#' Summarise one column, optionally filtered and grouped.
#'
#' @param df A data frame.
#' @param sum.col Unquoted name of the column to summarise.
#' @param grp Unquoted grouping column(s): a single bare name, or several
#'   wrapped in `c()` or `list()`, e.g. `c(name, dummy)`. `NULL` (the default)
#'   means no grouping.
#' @param filter Optional unquoted filter condition. `NULL` (the default)
#'   skips filtering.
#' @return A data frame with the columns `mean`, `sum` and `n` (row count),
#'   preceded by the grouping columns when `grp` is supplied.
test_summarize <- function(df, sum.col, grp = NULL, filter = NULL) {
  filter <- rlang::enquo(filter)
  sum.col <- rlang::enquo(sum.col)
  if (!is.null(rlang::get_expr(filter))) {
    df <- dplyr::filter(df, !!filter)
  }
  # how grp is turned into a character vector of grouping expressions
  grp <- substitute(grp)
  if (!is.null(grp)) {
    # deparse() can return several strings for long input; join them first
    grp <- paste(deparse(grp), collapse = "")
    # drop the list( / c( wrapper and the closing parenthesis; the backslashes
    # must be doubled inside an R string literal
    grp <- gsub(pattern = "list\\(|c\\(|\\)", replacement = "", x = grp)
    grp <- trimws(strsplit(grp, split = ",")[[1]])
    # parse each string and splice it in; group_by(.dots = ) is deprecated
    df <- dplyr::group_by(df, !!!rlang::parse_exprs(grp))
  }
  dplyr::summarise(
    df,
    mean = mean(!!sum.col),
    sum = sum(!!sum.col),
    n = dplyr::n()
  )
}
test_summarize(df, sum.col=value, grp = c(name, dummy))
# A tibble: 86 x 5
# Groups: name [?]
name dummy mean sum n
<chr> <fct> <dbl> <dbl> <int>
1 AARONSON,L.H. 1 7.17 43 6
2 AARONSON,L.H. 2 7.42 44.5 6
3 ALEXANDER,J.M. 1 8.35 50.1 6
4 ALEXANDER,J.M. 2 7.95 47.7 6
5 ARMENTANO,A.J. 1 7.53 45.2 6
6 ARMENTANO,A.J. 2 7.7 46.2 6
7 BERDON,R.I. 1 8.67 52 6
8 BERDON,R.I. 2 8.25 49.5 6
9 BRACKEN,J.J. 1 5.65 33.9 6
10 BRACKEN,J.J. 2 5.82 34.9 6
# ... with 76 more rows
这能满足我的需求,但我想知道是否有更好的方法来接收并处理这些参数。我每次尝试把原始的 grp 调用转换成类似 enquos(...) 的结果都失败了,所以只好对它进行解析并转换为字符向量。老实说,也许我应该直接让用户传入字符串?
我选择不使用字符向量作为预期输入,因为考虑到 sum.col 和函数的过滤参数需要 NSE 表达式,我试图保持一致。也许 rlang 包中有一些东西可以将原始表达式的每个元素转换为一个 quosures 列表?
编辑:修复了可重现的示例并提供了预期输出
如果使用 group_by_at,就可能不需要 if/else 分支了
# Mean, sum and row count of `sum.col` within the groups named by the
# character vector `grp` (no grouping when `grp` is NULL).
# `filter` is accepted but not used by this version.
test_summarize <- function(df, sum.col, grp = NULL, filter = NULL) {
  grouped <- group_by_at(df, grp)
  summarise(
    grouped,
    mean = mean({{ sum.col }}),
    sum = sum({{ sum.col }}),
    n = n()
  )
}
test_summarize(df, sum.col=value, grp = c("name", "dummy"))
# A tibble: 86 x 5
# Groups: name [43]
# name dummy mean sum n
# <chr> <chr> <dbl> <dbl> <int>
# 1 AARONSON,L.H. 1 7.17 43 6
# 2 AARONSON,L.H. 2 7.42 44.5 6
# 3 ALEXANDER,J.M. 1 8.35 50.1 6
# 4 ALEXANDER,J.M. 2 7.95 47.7 6
# 5 ARMENTANO,A.J. 1 7.53 45.2 6
# 6 ARMENTANO,A.J. 2 7.7 46.2 6
# 7 BERDON,R.I. 1 8.67 52 6
# 8 BERDON,R.I. 2 8.25 49.5 6
# 9 BRACKEN,J.J. 1 5.65 33.9 6
#10 BRACKEN,J.J. 2 5.82 34.9 6
# … with 76 more rows
test_summarize(df, sum.col=value)
# A tibble: 1 x 3
# mean sum n
# <dbl> <dbl> <int>
#1 7.57 3908. 516
这与下面的结果相同
df %>%
summarise(mean = mean(value), sum = sum(value), n = n())
# mean sum n
#1 7.57345 3907.9 516
对于 filter,一种做法是使用 ...,这样就可以传入任意多个过滤条件
#' Summarise one column after applying any number of filter conditions.
#'
#' @param df A data frame.
#' @param sum.col Unquoted name of the column to summarise.
#' @param grp Character vector of grouping column names, or `NULL` (the
#'   default) for no grouping.
#' @param ... Zero or more unquoted filter conditions, forwarded to `filter()`.
#' @return A data frame with the columns `mean`, `sum` and `n` (row count),
#'   preceded by the grouping columns when `grp` is supplied.
test_summarize <- function(df, sum.col, grp = NULL, ...) {
  df %>%
    # Forward the dots untouched: each condition keeps the environment it was
    # written in, so it may refer to variables local to the caller.
    filter(...) %>%
    group_by_at(grp) %>%
    summarise(mean = mean({{ sum.col }}), sum = sum({{ sum.col }}), n = n())
}
test_summarize(df, sum.col=value, grp = c("name", "dummy"),
key %in% c("CONT", "INTG"), value > 6.5)
# A tibble: 77 x 5
# Groups: name [43]
# name dummy mean sum n
# <chr> <chr> <dbl> <dbl> <int>
# 1 AARONSON,L.H. 2 7.9 7.9 1
# 2 ALEXANDER,J.M. 1 8.9 8.9 1
# 3 ALEXANDER,J.M. 2 6.8 6.8 1
# 4 ARMENTANO,A.J. 1 7.2 7.2 1
# 5 ARMENTANO,A.J. 2 8.1 8.1 1
# 6 BERDON,R.I. 1 8.8 8.8 1
# 7 BERDON,R.I. 2 6.8 6.8 1
# 8 BRACKEN,J.J. 1 7.3 7.3 1
# 9 BURNS,E.B. 1 8.8 8.8 1
#10 CALLAHAN,R.J. 1 10.6 10.6 1
# … with 67 more rows
没有传入过滤条件时,它同样可以运行
test_summarize(df, sum.col=value, grp = c("name", "dummy"))
# A tibble: 86 x 5
# Groups: name [43]
# name dummy mean sum n
# <chr> <chr> <dbl> <dbl> <int>
# 1 AARONSON,L.H. 1 7.17 43 6
# 2 AARONSON,L.H. 2 7.42 44.5 6
# 3 ALEXANDER,J.M. 1 8.35 50.1 6
# 4 ALEXANDER,J.M. 2 7.95 47.7 6
# 5 ARMENTANO,A.J. 1 7.53 45.2 6
# 6 ARMENTANO,A.J. 2 7.7 46.2 6
# 7 BERDON,R.I. 1 8.67 52 6
# 8 BERDON,R.I. 2 8.25 49.5 6
# 9 BRACKEN,J.J. 1 5.65 33.9 6
#10 BRACKEN,J.J. 2 5.82 34.9 6
# … with 76 more rows
这和你第一次输出的一样
我正在阅读 dplyr 的 vignette,试图弄清楚如何在自己的函数代码中使用 dplyr。其中讲到了如何对 ... 使用 enquos,以便将多个参数传递给 group_by。
它如何工作的一个简短示例
grp <- rlang::enquos(...)
df %>%
group_by(!!!grp)
我想知道是否有办法在不占用 ...、也不写一些有问题的代码的情况下,为同一个参数传入多个表达式。
要了解调用的外观,请使用以下示例:
# Reproducible data: reshape USJudgeRatings into long format
# (one row per judge / rating-column pair).
df <- datasets::USJudgeRatings
# The judge names are stored as row names; copy them into a regular column.
df$name <- rownames(df)
df <- tidyr::gather(df, key = "key", value = "value", -name)
# The two-element vector is recycled down the rows, giving an alternating
# "1"/"2" label to group by.
df$dummy <- c("1","2")
#' Summarise one column, optionally filtered and grouped.
#'
#' @param df A data frame.
#' @param sum.col Unquoted name of the column to summarise.
#' @param grp Unquoted grouping column(s): a single bare name, or several
#'   wrapped in `c()` or `list()`, e.g. `c(name, dummy)`. `NULL` (the default)
#'   means no grouping.
#' @param filter Optional unquoted filter condition. `NULL` (the default)
#'   skips filtering.
#' @return A data frame with the columns `mean`, `sum` and `n` (row count),
#'   preceded by the grouping columns when `grp` is supplied.
test_summarize <- function(df, sum.col, grp = NULL, filter = NULL) {
  filter <- rlang::enquo(filter)
  sum.col <- rlang::enquo(sum.col)
  if (!is.null(rlang::get_expr(filter))) {
    df <- dplyr::filter(df, !!filter)
  }
  # how grp is turned into a character vector of grouping expressions
  grp <- substitute(grp)
  if (!is.null(grp)) {
    # deparse() can return several strings for long input; join them first
    grp <- paste(deparse(grp), collapse = "")
    # drop the list( / c( wrapper and the closing parenthesis; the backslashes
    # must be doubled inside an R string literal
    grp <- gsub(pattern = "list\\(|c\\(|\\)", replacement = "", x = grp)
    grp <- trimws(strsplit(grp, split = ",")[[1]])
    # parse each string and splice it in; group_by(.dots = ) is deprecated
    df <- dplyr::group_by(df, !!!rlang::parse_exprs(grp))
  }
  dplyr::summarise(
    df,
    mean = mean(!!sum.col),
    sum = sum(!!sum.col),
    n = dplyr::n()
  )
}
test_summarize(df, sum.col=value, grp = c(name, dummy))
# A tibble: 86 x 5
# Groups: name [?]
name dummy mean sum n
<chr> <fct> <dbl> <dbl> <int>
1 AARONSON,L.H. 1 7.17 43 6
2 AARONSON,L.H. 2 7.42 44.5 6
3 ALEXANDER,J.M. 1 8.35 50.1 6
4 ALEXANDER,J.M. 2 7.95 47.7 6
5 ARMENTANO,A.J. 1 7.53 45.2 6
6 ARMENTANO,A.J. 2 7.7 46.2 6
7 BERDON,R.I. 1 8.67 52 6
8 BERDON,R.I. 2 8.25 49.5 6
9 BRACKEN,J.J. 1 5.65 33.9 6
10 BRACKEN,J.J. 2 5.82 34.9 6
# ... with 76 more rows
这能满足我的需求,但我想知道是否有更好的方法来接收并处理这些参数。我每次尝试把原始的 grp 调用转换成类似 enquos(...) 的结果都失败了,所以只好对它进行解析并转换为字符向量。老实说,也许我应该直接让用户传入字符串?
我选择不使用字符向量作为预期输入,因为考虑到 sum.col 和函数的过滤参数需要 NSE 表达式,我试图保持一致。也许 rlang 包中有一些东西可以将原始表达式的每个元素转换为一个 quosures 列表?
编辑:修复了可重现的示例并提供了预期输出
如果使用 group_by_at,就可能不需要 if/else 分支了
# Mean, sum and row count of `sum.col` within the groups named by the
# character vector `grp` (no grouping when `grp` is NULL).
# `filter` is accepted but not used by this version.
test_summarize <- function(df, sum.col, grp = NULL, filter = NULL) {
  grouped <- group_by_at(df, grp)
  summarise(
    grouped,
    mean = mean({{ sum.col }}),
    sum = sum({{ sum.col }}),
    n = n()
  )
}
test_summarize(df, sum.col=value, grp = c("name", "dummy"))
# A tibble: 86 x 5
# Groups: name [43]
# name dummy mean sum n
# <chr> <chr> <dbl> <dbl> <int>
# 1 AARONSON,L.H. 1 7.17 43 6
# 2 AARONSON,L.H. 2 7.42 44.5 6
# 3 ALEXANDER,J.M. 1 8.35 50.1 6
# 4 ALEXANDER,J.M. 2 7.95 47.7 6
# 5 ARMENTANO,A.J. 1 7.53 45.2 6
# 6 ARMENTANO,A.J. 2 7.7 46.2 6
# 7 BERDON,R.I. 1 8.67 52 6
# 8 BERDON,R.I. 2 8.25 49.5 6
# 9 BRACKEN,J.J. 1 5.65 33.9 6
#10 BRACKEN,J.J. 2 5.82 34.9 6
# … with 76 more rows
test_summarize(df, sum.col=value)
# A tibble: 1 x 3
# mean sum n
# <dbl> <dbl> <int>
#1 7.57 3908. 516
这与下面的结果相同
df %>%
summarise(mean = mean(value), sum = sum(value), n = n())
# mean sum n
#1 7.57345 3907.9 516
对于 filter,一种做法是使用 ...,这样就可以传入任意多个过滤条件
#' Summarise one column after applying any number of filter conditions.
#'
#' @param df A data frame.
#' @param sum.col Unquoted name of the column to summarise.
#' @param grp Character vector of grouping column names, or `NULL` (the
#'   default) for no grouping.
#' @param ... Zero or more unquoted filter conditions, forwarded to `filter()`.
#' @return A data frame with the columns `mean`, `sum` and `n` (row count),
#'   preceded by the grouping columns when `grp` is supplied.
test_summarize <- function(df, sum.col, grp = NULL, ...) {
  df %>%
    # Forward the dots untouched: each condition keeps the environment it was
    # written in, so it may refer to variables local to the caller.
    filter(...) %>%
    group_by_at(grp) %>%
    summarise(mean = mean({{ sum.col }}), sum = sum({{ sum.col }}), n = n())
}
test_summarize(df, sum.col=value, grp = c("name", "dummy"),
key %in% c("CONT", "INTG"), value > 6.5)
# A tibble: 77 x 5
# Groups: name [43]
# name dummy mean sum n
# <chr> <chr> <dbl> <dbl> <int>
# 1 AARONSON,L.H. 2 7.9 7.9 1
# 2 ALEXANDER,J.M. 1 8.9 8.9 1
# 3 ALEXANDER,J.M. 2 6.8 6.8 1
# 4 ARMENTANO,A.J. 1 7.2 7.2 1
# 5 ARMENTANO,A.J. 2 8.1 8.1 1
# 6 BERDON,R.I. 1 8.8 8.8 1
# 7 BERDON,R.I. 2 6.8 6.8 1
# 8 BRACKEN,J.J. 1 7.3 7.3 1
# 9 BURNS,E.B. 1 8.8 8.8 1
#10 CALLAHAN,R.J. 1 10.6 10.6 1
# … with 67 more rows
没有传入过滤条件时,它同样可以运行
test_summarize(df, sum.col=value, grp = c("name", "dummy"))
# A tibble: 86 x 5
# Groups: name [43]
# name dummy mean sum n
# <chr> <chr> <dbl> <dbl> <int>
# 1 AARONSON,L.H. 1 7.17 43 6
# 2 AARONSON,L.H. 2 7.42 44.5 6
# 3 ALEXANDER,J.M. 1 8.35 50.1 6
# 4 ALEXANDER,J.M. 2 7.95 47.7 6
# 5 ARMENTANO,A.J. 1 7.53 45.2 6
# 6 ARMENTANO,A.J. 2 7.7 46.2 6
# 7 BERDON,R.I. 1 8.67 52 6
# 8 BERDON,R.I. 2 8.25 49.5 6
# 9 BRACKEN,J.J. 1 5.65 33.9 6
#10 BRACKEN,J.J. 2 5.82 34.9 6
# … with 76 more rows
这和你第一次输出的一样