试图了解 eval(expr, envir = df) 的工作原理
Trying to understand how eval(expr, envir = df) works
我构建了一个似乎有效的函数,但我不明白为什么。
我最初的问题是获取包含人口计数的 data.frame 并将其扩展以重新创建原始人口。如果您事先知道列名,这就很容易了。
library(tidyverse)
# Reproducible fixture: four populations, each with a length and a random
# head count drawn from 1..100.
set.seed(121)
test_counts <- tibble(
  Population = letters[1:4],
  Length = c(1, 1, 2, 1),
  Number = sample(1:100, 4)
)
# Expand a single row of the counts table: repeat its `Length` value `Number`
# times and label every copy with the row's `Population`. Written to be called
# once per row (see the pmap_dfr() call that follows).
expand_counts_v0 <- function(Length, Population, Number) {
  repeated_length <- rep(Length, times = Number)
  tibble(Population = Population, Length = repeated_length)
}
# Round trip: expand every row, count the rows back per group, and compare
# the re-aggregated table with the original input.
test_counts %>%
  pmap_dfr(expand_counts_v0) %>%
  group_by(Population, Length) %>%
  summarise(Number = n()) %>%
  ungroup() %>%
  all.equal(test_counts)
# [1] TRUE
但是,我想将它概括为一个不需要知道 data.frame 的列名的函数,并且我对 NSE 很感兴趣,所以我写道:
# Wider fixture: same layout as `test_counts` plus two extra columns, used to
# check that the generalised functions do not depend on fixed column names.
test_counts1 <- tibble(
  Population = letters[1:4],
  Length = c(1, 1, 2, 1),
  Number = sample(1:100, 4),
  Height = c(100, 50, 45, 90),
  Width = c(700, 50, 60, 90)
)
# First attempt at a column-name-agnostic expansion of a counts table.
#
# df    : data frame with one row per group and a column of counts.
# count : unquoted name of the count column.
#
# NOTE: this version is kept as written because it demonstrates the problem
# under discussion. make_tbl() never uses its `...` arguments; it evaluates the
# tibble() call with the *whole* data frame as the evaluation environment.
# pmap_dfr() therefore produces the complete expansion once per input row and
# row-binds the copies, so every row ends up duplicated nrow(df) times.
expand_counts_v1 <- function(df, count = NULL) {
# Capture the count column as an unevaluated expression.
countq <- enexpr(count)
# Names of every column except the count column.
names <- df %>% select(-!!countq) %>% names
# The same names converted to symbols so they can be spliced into calls.
namesq <- names %>% map(as.name)
# One `rep(<column>, times = <count>)` call per remaining column.
cols <- map(namesq, ~ expr(rep(!!., times = !!countq))
) %>% set_names(namesq)
# Ignores its arguments: the columns are resolved in `df`, not in the row
# values that pmap_dfr() passes in.
make_tbl <- function(...) {
expr(tibble(!!!cols)) %>% eval(envir = df)
}
# Calls make_tbl() once per row of `df` and row-binds the results.
df %>% pmap_dfr(make_tbl)
}
但是,当我测试这个函数时,结果中的每一行似乎都被重复了 4 次:
# Ratio of the re-aggregated head count to the true head count; a value of 1
# would mean the expansion is correct.
test_counts %>%
  expand_counts_v1(count = Number) %>%
  group_by(Population, Length) %>%
  summarise(Number = n()) %>%
  ungroup() %>%
  {
    sum(.$Number) / sum(test_counts$Number)
  }
# [1] 4
这让我猜到了一个解决方案,即
# Second attempt: identical to expand_counts_v1 except that make_tbl() is
# called once on the whole data frame instead of once per row.
#
# df    : data frame with one row per group and a column of counts.
# count : unquoted name of the count column.
#
# make_tbl() still ignores its argument; the tibble() call is evaluated with
# `df` as the environment, so each rep() receives whole columns. Because rep()
# is vectorised over `times`, that single evaluation expands every row.
expand_counts_v2 <- function(df, count = NULL) {
# Capture the count column as an unevaluated expression.
countq <- enexpr(count)
# Names of every column except the count column.
names <- df %>% select(-!!countq) %>% names
# The same names converted to symbols so they can be spliced into calls.
namesq <- names %>% map(as.name)
# One `rep(<column>, times = <count>)` call per remaining column.
cols <- map(namesq, ~ expr(rep(!!., times = !!countq))
) %>% set_names(namesq)
# The `...` argument is never used inside the body.
make_tbl <- function(...) {
expr(tibble(!!!cols)) %>% eval(envir = df)
}
# Single call: equivalent to make_tbl(df).
df %>% make_tbl
}
这似乎有效:
# Round-trip check on the three-column fixture.
test_counts %>%
  expand_counts_v2(count = Number) %>%
  group_by(Population, Length) %>%
  summarise(Number = n()) %>%
  ungroup() %>%
  all.equal(test_counts)
# [1] TRUE

# Round-trip check on the five-column fixture.
test_counts1 %>%
  expand_counts_v2(count = Number) %>%
  group_by(Population, Length, Height, Width) %>%
  summarise(Number = n()) %>%
  ungroup() %>%
  all.equal(test_counts1)
# [1] TRUE
但我不明白为什么。既然我已经不再使用 pmap,它是如何对每一行分别求值的?这个函数必须应用到每一行才能得到正确结果,所以它一定以某种方式做到了这一点,但我看不出它是怎么做到的。
编辑
在 Artem 对发生的事情做出正确解释后,我意识到我可以做到这一点
# Expand a table of counts into one row per counted individual.
#
# Args:
#   df:    data frame with one row per group plus a column of counts.
#   count: unquoted name of the count column (for example `Number`).
#
# Returns a tibble with every column of `df` except `count`, in which each
# input row appears `count` times. The expression is evaluated once against
# the whole data frame; that is sufficient because rep() is vectorised over
# `times`.
expand_counts_v2 <- function(df, count = NULL) {
  countq <- enexpr(count)
  if (is.null(countq)) {
    stop("`count` must name the column that holds the counts.", call. = FALSE)
  }
  # Names of the columns that are carried over (everything except `count`).
  kept <- df %>% select(-!!countq) %>% names()
  # One `rep(<column>, times = <count>)` call per carried-over column, named
  # with the plain character names rather than a list of symbols.
  cols <- kept %>%
    map(as.name) %>%
    map(~ expr(rep(!!.x, times = !!countq))) %>%
    set_names(kept)
  # eval_tidy() lives in rlang, which library(tidyverse) does not attach.
  rlang::eval_tidy(expr(tibble(!!!cols)), data = df)
}
这样就去掉了不必要的 make_tbl 函数。然而,正如 Artem 所说,这之所以能够奏效,仅仅是因为 rep 是向量化的。所以它虽然能工作,却不是通过重写 _v0 函数并对每一行进行映射来实现的,而后者才是我想要复现的过程。最终,我发现了 rlang::new_function,并写出了:
# Expand a table of counts row by row, by generating the per-row function
# programmatically and mapping it over the rows.
#
# Args:
#   df:    data frame with one row per group plus a column of counts.
#   count: unquoted name of the count column (for example `Number`).
#
# Returns a tibble with every column of `df` except `count`, in which each
# input row appears `count` times.
expand_counts_v3 <- function(df, count = NULL) {
  countq <- enexpr(count)
  if (is.null(countq)) {
    stop("`count` must name the column that holds the counts.", call. = FALSE)
  }
  # Names of the columns that are carried over (everything except `count`).
  kept <- df %>% select(-!!countq) %>% names()
  # One `rep(<column>, times = <count>)` call per carried-over column.
  cols <- kept %>%
    map(as.name) %>%
    map(~ expr(rep(!!.x, times = !!countq))) %>%
    set_names(kept)
  # One formal argument per column of `df`, so that pmap_dfr() can hand each
  # row's values to the generated function by name. The default of 0 is only
  # a placeholder: pmap_dfr() supplies every argument on every call.
  args <- rep(0, times = ncol(df)) %>%
    as.list() %>%
    set_names(names(df))
  # new_function() lives in rlang, which library(tidyverse) does not attach.
  correct_function <- rlang::new_function(args, expr(tibble(!!!cols)))
  # Apply the generated function to each row and row-bind the pieces.
  pmap_dfr(df, correct_function)
}
它更长,可能更丑陋,但可以按照我最初想要的方式工作。
问题出在 eval( envir = df ) 上,它把整个数据框都暴露给了 make_tbl()。请注意,您在 make_tbl() 中从未使用过 ... 参数。相反,该函数实际上计算的是下面这段代码的等价物:
# What make_tbl() effectively evaluates, written out by hand.
with(
  df,
  tibble(
    Population = rep(Population, times = Number),
    Length = rep(Length, times = Number)
  )
)
不管您向它提供什么参数,都是如此。当您通过 pmap_dfr() 调用该函数时,它实际上会把上述表达式计算四次(每行一次),并将结果按行拼接起来,从而导致您观察到的条目重复。当您去掉 pmap_dfr() 时,该函数只被调用一次,但由于 rep 本身是向量化的(可以尝试执行 rep( test_counts$Population, test_counts$Number ) 来理解我的意思),make_tbl() 一次就计算出了整个结果。
我构建了一个似乎有效的函数,但我不明白为什么。
我最初的问题是获取包含人口计数的 data.frame 并将其扩展以重新创建原始人口。如果您事先知道列名,这就很容易了。
library(tidyverse)
# Reproducible fixture: four populations, each with a length and a random
# head count drawn from 1..100.
set.seed(121)
test_counts <- tibble(
  Population = letters[1:4],
  Length = c(1, 1, 2, 1),
  Number = sample(1:100, 4)
)
# Expand a single row of the counts table: repeat its `Length` value `Number`
# times and label every copy with the row's `Population`. Written to be called
# once per row (see the pmap_dfr() call that follows).
expand_counts_v0 <- function(Length, Population, Number) {
  repeated_length <- rep(Length, times = Number)
  tibble(Population = Population, Length = repeated_length)
}
# Round trip: expand every row, count the rows back per group, and compare
# the re-aggregated table with the original input.
test_counts %>%
  pmap_dfr(expand_counts_v0) %>%
  group_by(Population, Length) %>%
  summarise(Number = n()) %>%
  ungroup() %>%
  all.equal(test_counts)
# [1] TRUE
但是,我想将它概括为一个不需要知道 data.frame 的列名的函数,并且我对 NSE 很感兴趣,所以我写道:
# Wider fixture: same layout as `test_counts` plus two extra columns, used to
# check that the generalised functions do not depend on fixed column names.
test_counts1 <- tibble(
  Population = letters[1:4],
  Length = c(1, 1, 2, 1),
  Number = sample(1:100, 4),
  Height = c(100, 50, 45, 90),
  Width = c(700, 50, 60, 90)
)
# First attempt at a column-name-agnostic expansion of a counts table.
#
# df    : data frame with one row per group and a column of counts.
# count : unquoted name of the count column.
#
# NOTE: this version is kept as written because it demonstrates the problem
# under discussion. make_tbl() never uses its `...` arguments; it evaluates the
# tibble() call with the *whole* data frame as the evaluation environment.
# pmap_dfr() therefore produces the complete expansion once per input row and
# row-binds the copies, so every row ends up duplicated nrow(df) times.
expand_counts_v1 <- function(df, count = NULL) {
# Capture the count column as an unevaluated expression.
countq <- enexpr(count)
# Names of every column except the count column.
names <- df %>% select(-!!countq) %>% names
# The same names converted to symbols so they can be spliced into calls.
namesq <- names %>% map(as.name)
# One `rep(<column>, times = <count>)` call per remaining column.
cols <- map(namesq, ~ expr(rep(!!., times = !!countq))
) %>% set_names(namesq)
# Ignores its arguments: the columns are resolved in `df`, not in the row
# values that pmap_dfr() passes in.
make_tbl <- function(...) {
expr(tibble(!!!cols)) %>% eval(envir = df)
}
# Calls make_tbl() once per row of `df` and row-binds the results.
df %>% pmap_dfr(make_tbl)
}
但是,当我测试这个函数时,结果中的每一行似乎都被重复了 4 次:
# Ratio of the re-aggregated head count to the true head count; a value of 1
# would mean the expansion is correct.
test_counts %>%
  expand_counts_v1(count = Number) %>%
  group_by(Population, Length) %>%
  summarise(Number = n()) %>%
  ungroup() %>%
  {
    sum(.$Number) / sum(test_counts$Number)
  }
# [1] 4
这让我猜到了一个解决方案,即
# Second attempt: identical to expand_counts_v1 except that make_tbl() is
# called once on the whole data frame instead of once per row.
#
# df    : data frame with one row per group and a column of counts.
# count : unquoted name of the count column.
#
# make_tbl() still ignores its argument; the tibble() call is evaluated with
# `df` as the environment, so each rep() receives whole columns. Because rep()
# is vectorised over `times`, that single evaluation expands every row.
expand_counts_v2 <- function(df, count = NULL) {
# Capture the count column as an unevaluated expression.
countq <- enexpr(count)
# Names of every column except the count column.
names <- df %>% select(-!!countq) %>% names
# The same names converted to symbols so they can be spliced into calls.
namesq <- names %>% map(as.name)
# One `rep(<column>, times = <count>)` call per remaining column.
cols <- map(namesq, ~ expr(rep(!!., times = !!countq))
) %>% set_names(namesq)
# The `...` argument is never used inside the body.
make_tbl <- function(...) {
expr(tibble(!!!cols)) %>% eval(envir = df)
}
# Single call: equivalent to make_tbl(df).
df %>% make_tbl
}
这似乎有效:
# Round-trip check on the three-column fixture.
test_counts %>%
  expand_counts_v2(count = Number) %>%
  group_by(Population, Length) %>%
  summarise(Number = n()) %>%
  ungroup() %>%
  all.equal(test_counts)
# [1] TRUE

# Round-trip check on the five-column fixture.
test_counts1 %>%
  expand_counts_v2(count = Number) %>%
  group_by(Population, Length, Height, Width) %>%
  summarise(Number = n()) %>%
  ungroup() %>%
  all.equal(test_counts1)
# [1] TRUE
但我不明白为什么。既然我已经不再使用 pmap,它是如何对每一行分别求值的?这个函数必须应用到每一行才能得到正确结果,所以它一定以某种方式做到了这一点,但我看不出它是怎么做到的。
编辑
在 Artem 对发生的事情做出正确解释后,我意识到我可以做到这一点
# Expand a table of counts into one row per counted individual.
#
# Args:
#   df:    data frame with one row per group plus a column of counts.
#   count: unquoted name of the count column (for example `Number`).
#
# Returns a tibble with every column of `df` except `count`, in which each
# input row appears `count` times. The expression is evaluated once against
# the whole data frame; that is sufficient because rep() is vectorised over
# `times`.
expand_counts_v2 <- function(df, count = NULL) {
  countq <- enexpr(count)
  if (is.null(countq)) {
    stop("`count` must name the column that holds the counts.", call. = FALSE)
  }
  # Names of the columns that are carried over (everything except `count`).
  kept <- df %>% select(-!!countq) %>% names()
  # One `rep(<column>, times = <count>)` call per carried-over column, named
  # with the plain character names rather than a list of symbols.
  cols <- kept %>%
    map(as.name) %>%
    map(~ expr(rep(!!.x, times = !!countq))) %>%
    set_names(kept)
  # eval_tidy() lives in rlang, which library(tidyverse) does not attach.
  rlang::eval_tidy(expr(tibble(!!!cols)), data = df)
}
这样就去掉了不必要的 make_tbl 函数。然而,正如 Artem 所说,这之所以能够奏效,仅仅是因为 rep 是向量化的。所以它虽然能工作,却不是通过重写 _v0 函数并对每一行进行映射来实现的,而后者才是我想要复现的过程。最终,我发现了 rlang::new_function,并写出了:
# Expand a table of counts row by row, by generating the per-row function
# programmatically and mapping it over the rows.
#
# Args:
#   df:    data frame with one row per group plus a column of counts.
#   count: unquoted name of the count column (for example `Number`).
#
# Returns a tibble with every column of `df` except `count`, in which each
# input row appears `count` times.
expand_counts_v3 <- function(df, count = NULL) {
  countq <- enexpr(count)
  if (is.null(countq)) {
    stop("`count` must name the column that holds the counts.", call. = FALSE)
  }
  # Names of the columns that are carried over (everything except `count`).
  kept <- df %>% select(-!!countq) %>% names()
  # One `rep(<column>, times = <count>)` call per carried-over column.
  cols <- kept %>%
    map(as.name) %>%
    map(~ expr(rep(!!.x, times = !!countq))) %>%
    set_names(kept)
  # One formal argument per column of `df`, so that pmap_dfr() can hand each
  # row's values to the generated function by name. The default of 0 is only
  # a placeholder: pmap_dfr() supplies every argument on every call.
  args <- rep(0, times = ncol(df)) %>%
    as.list() %>%
    set_names(names(df))
  # new_function() lives in rlang, which library(tidyverse) does not attach.
  correct_function <- rlang::new_function(args, expr(tibble(!!!cols)))
  # Apply the generated function to each row and row-bind the pieces.
  pmap_dfr(df, correct_function)
}
它更长,可能更丑陋,但可以按照我最初想要的方式工作。
问题出在 eval( envir = df ) 上,它把整个数据框都暴露给了 make_tbl()。请注意,您在 make_tbl() 中从未使用过 ... 参数。相反,该函数实际上计算的是下面这段代码的等价物:
# What make_tbl() effectively evaluates, written out by hand.
with(
  df,
  tibble(
    Population = rep(Population, times = Number),
    Length = rep(Length, times = Number)
  )
)
不管您向它提供什么参数,都是如此。当您通过 pmap_dfr() 调用该函数时,它实际上会把上述表达式计算四次(每行一次),并将结果按行拼接起来,从而导致您观察到的条目重复。当您去掉 pmap_dfr() 时,该函数只被调用一次,但由于 rep 本身是向量化的(可以尝试执行 rep( test_counts$Population, test_counts$Number ) 来理解我的意思),make_tbl() 一次就计算出了整个结果。