如何使用 tidyverse 编写自举均值差?
How to write a bootstrapped mean difference using tidyverse?
我对根据二元协变量的某些变量的均值差异感兴趣。
我正在通过引导计算此差异的置信区间:
library(tidyverse)
df = mtcars %>%
select(disp, vs) %>%
mutate(vs=factor(vs, labels=c("vshaped", "straight")))
by1="straight"
by2="vshaped"
R=1000
set.seed(1)
beffect = numeric(length=R)
for (i in 1:R) {
ib = sample(1:nrow(df), replace = TRUE)
xi = df$disp[ib]
byi = df$vs[ib]
beffect[i] = mean(xi[byi==by2], na.rm = TRUE) - mean(xi[byi==by1], na.rm = TRUE)
}
mean(beffect)
#> [1] 175.9203
sd(beffect)
#> [1] 29.3409
由 reprex package (v2.0.0)
于 2021 年 6 月 13 日创建
这行得通,但我发现它很难读,我想知道它的效率,因为 for
循环在 R
中通常被认为是一个糟糕的设计。
作为 tidyverse
的重度用户,我想使用这个框架重写它。
是否有快速且可读的方法?
PS: 这是我能得到的最接近的,但它远没有提高可读性,而且慢了 250 倍:
beffect2 = replicate(R, {
df %>%
slice_sample(prop=1, replace = TRUE) %>%
group_by(vs) %>%
summarise(m=mean(disp)) %>%
pivot_wider(names_from = "vs", values_from = "m") %>%
transmute(x=!!ensym(by2) - !!ensym(by1))
}, simplify = FALSE) %>%
map_dbl(identity)
编辑:这里是目前所有方法的基准:
# with R=50 ***********
# microbenchmark::microbenchmark(f_dc(50), f_akrun(50), f_akrun_diff(50), f_akrun_bindout(50), f_cole(50), f_forloop(50), times = 5)
# Unit: milliseconds
# expr min lq mean median uq max neval
# f_dc() 497.4559 524.9582 560.94690 553.6271 572.2261 656.4672 5
# f_akrun() 101.6295 108.5232 111.22400 110.7238 111.4105 123.8330 5
# f_akrun_diff() 270.0261 283.3257 308.92806 283.6411 314.7233 392.9241 5
# f_akrun_bindout() 21.8185 21.9725 76.68770 22.9811 30.2129 286.4535 5
# f_cole() 2.7685 3.1343 3.63484 3.2679 4.4346 4.5689 5
# f_forloop() 2.1136 2.1277 3.14156 3.4968 3.6740 4.2957 5
# with R=500 **********
# microbenchmark::microbenchmark(f_dc(500), f_akrun(500), f_akrun_diff(500), f_akrun_bindout(500), f_cole(500), f_forloop(500), times = 5)
# Unit: milliseconds
# expr min lq mean median uq max neval
# f_dc() 4270.2451 4535.4618 4543.85930 4539.3032 4613.5823 4760.7041 5
# f_akrun() 936.3249 951.3230 970.27424 956.3674 992.3162 1015.0397 5
# f_akrun_diff() 2501.3871 2509.5429 2589.47288 2608.5254 2649.3819 2678.5271 5
# f_akrun_bindout() 108.3761 108.7238 113.26746 112.2521 118.4673 118.5180 5
# f_cole() 23.1283 23.4074 24.75386 23.9244 26.4594 26.8498 5
# f_forloop() 20.4243 21.1367 23.26222 21.2130 22.5616 30.9755 5
我们可以使用 map
并避免多个 pivot_wider
步骤
library(purrr)
library(dplyr)
set.seed(1)
out <- map_dfr(seq_len(R), ~ {
ib <- sample(1:nrow(df), replace = TRUE)
df %>%
slice(ib) %>%
summarise(beffect = mean(disp[vs == by2], na.rm = TRUE) -
mean(disp[vs == by1], na.rm = TRUE))
})
-检查中
mean(out$beffect)
#[1] 175.9203
sd(out$beffect)
#[1] 29.3409
或者可以使用 diff
而不是 pivot_wider
set.seed(1)
out2 <- replicate(R, df %>%
slice_sample(prop = 1, replace = TRUE) %>%
group_by(vs) %>%
summarise(m = mean(disp), .groups = 'drop') %>%
summarise(beffect = diff(m[2:1])), simplify = FALSE) %>%
bind_rows
-检查中
mean(out2$beffect)
#[1] 175.9203
或者另一种选择是执行 sample
,将它们与组标识符绑定在一起,使用它来提取列的值,按组标识符和 'vs' 进行分组并得到 mean
set.seed(1)
out3 <- replicate(R, sample(seq_len(nrow(df)), replace = TRUE) %>%
as_tibble, simplify = FALSE) %>%
bind_rows(.id = 'grp') %>%
mutate(vs = df$vs[value], disp = df$disp[value]) %>%
group_by(grp, vs) %>%
summarise(beffect = mean(disp), .groups = 'drop_last') %>%
group_by(grp) %>%
summarise(beffect = diff(beffect[2:1]), .groups = 'drop')
-检查中
mean(out3$beffect)
#[1] 175.9203
基准
system.time({set.seed(1)
out3 <- replicate(R, sample(seq_len(nrow(df)), replace = TRUE) %>%
as_tibble, simplify = FALSE) %>%
bind_rows(.id = 'grp') %>%
mutate(vs = df$vs[value], disp = df$disp[value]) %>%
group_by(grp, vs) %>%
summarise(beffect = mean(disp), .groups = 'drop_last') %>%
group_by(grp) %>%
summarise(beffect = diff(beffect[2:1]), .groups = 'drop')})
# user system elapsed
# 0.202 0.007 0.208
或者用map
system.time({
set.seed(1)
out <- map_dfr(seq_len(R), ~ {
ib <- sample(1:nrow(df), replace = TRUE)
df %>%
slice(ib) %>%
summarise(beffect = mean(disp[vs == by2], na.rm = TRUE) -
mean(disp[vs == by1], na.rm = TRUE))
})
})
# user system elapsed
# 1.329 0.013 1.338
或者代替 pivot_wider
,取 diff
system.time({set.seed(1)
out2 <- replicate(R, df %>%
slice_sample(prop = 1, replace = TRUE) %>%
group_by(vs) %>%
summarise(m = mean(disp), .groups = 'drop') %>%
summarise(beffect = diff(m[2:1])), simplify = FALSE) %>%
bind_rows
})
# user system elapsed
# 3.753 0.027 3.758
或 data.table
中的类似方法
library(data.table)
system.time({
setDT(df)
set.seed(1)
out3 <- rbindlist(
replicate(R,
df[df[, .I[sample(seq_len(.N), replace = TRUE)]
]][, .(m = mean(disp)), vs][, .(beffect = m[2]- m[1])],
simplify = FALSE)
)
})
# user system elapsed
# 1.181 0.055 1.230
-OP的方法
system.time({replicate(R, {
df %>%
slice_sample(prop=1, replace = TRUE) %>%
group_by(vs) %>%
summarise(m=mean(disp)) %>%
pivot_wider(names_from = "vs", values_from = "m") %>%
transmute(x=!!ensym(by2) - !!ensym(by1))
}, simplify = FALSE)})
user system elapsed
6.991 0.063 7.009
microbenchmark::microbenchmark(f_dc(), f_akrun1(), f_akrun2(), f_akrun3(), f_forloop(), times = 5)
Unit: milliseconds
expr min lq mean median uq max neval cld
f_dc() 6453.14052 6512.34196 6772.0079 6534.08171 6939.61358 7420.86152 5 d
f_akrun1() 1288.96812 1328.96075 1377.0833 1353.79346 1372.30852 1541.38573 5 b
f_akrun2() 3685.33619 3703.33018 3814.8367 3801.52657 3915.75432 3968.23609 5 c
f_akrun3() 178.30997 179.77604 194.0712 189.18425 205.37485 217.71095 5 a
f_forloop() 30.11329 33.37171 35.0534 36.80903 36.95909 38.01389 5 a
这可能忽略了显而易见的东西,但是 tidyverse 等价于 for 循环会涉及 purrr::map()
之类的东西。最简单的转换是使用 purrr::map_dbl(1:R, ...)
例如:
library(purrr)
## better for memory and performance to extract vectors ahead of loop
disp = dt$disp
vs = dt$vs
map_dbl(1:R,
~ {
ib = sample(nrow(df), replace = TRUE)
xi = disp[ib]
byi = vs[ib]
mean(xi[byi == by2], na.rm = TRUE) - mean(xi[byi == by1], na.rm = TRUE)
})
此外,由于 by
是二进制的,您可以通过将其转换为 rcpp.
来提高性能
我对根据二元协变量的某些变量的均值差异感兴趣。
我正在通过引导计算此差异的置信区间:
library(tidyverse)
df = mtcars %>%
select(disp, vs) %>%
mutate(vs=factor(vs, labels=c("vshaped", "straight")))
by1="straight"
by2="vshaped"
R=1000
set.seed(1)
beffect = numeric(length=R)
for (i in 1:R) {
ib = sample(1:nrow(df), replace = TRUE)
xi = df$disp[ib]
byi = df$vs[ib]
beffect[i] = mean(xi[byi==by2], na.rm = TRUE) - mean(xi[byi==by1], na.rm = TRUE)
}
mean(beffect)
#> [1] 175.9203
sd(beffect)
#> [1] 29.3409
由 reprex package (v2.0.0)
于 2021 年 6 月 13 日创建这行得通,但我发现它很难读,我想知道它的效率,因为 for
循环在 R
中通常被认为是一个糟糕的设计。
作为 tidyverse
的重度用户,我想使用这个框架重写它。
是否有快速且可读的方法?
PS: 这是我能得到的最接近的,但它远没有提高可读性,而且慢了 250 倍:
beffect2 = replicate(R, {
df %>%
slice_sample(prop=1, replace = TRUE) %>%
group_by(vs) %>%
summarise(m=mean(disp)) %>%
pivot_wider(names_from = "vs", values_from = "m") %>%
transmute(x=!!ensym(by2) - !!ensym(by1))
}, simplify = FALSE) %>%
map_dbl(identity)
编辑:这里是目前所有方法的基准:
# with R=50 ***********
# microbenchmark::microbenchmark(f_dc(50), f_akrun(50), f_akrun_diff(50), f_akrun_bindout(50), f_cole(50), f_forloop(50), times = 5)
# Unit: milliseconds
# expr min lq mean median uq max neval
# f_dc() 497.4559 524.9582 560.94690 553.6271 572.2261 656.4672 5
# f_akrun() 101.6295 108.5232 111.22400 110.7238 111.4105 123.8330 5
# f_akrun_diff() 270.0261 283.3257 308.92806 283.6411 314.7233 392.9241 5
# f_akrun_bindout() 21.8185 21.9725 76.68770 22.9811 30.2129 286.4535 5
# f_cole() 2.7685 3.1343 3.63484 3.2679 4.4346 4.5689 5
# f_forloop() 2.1136 2.1277 3.14156 3.4968 3.6740 4.2957 5
# with R=500 **********
# microbenchmark::microbenchmark(f_dc(500), f_akrun(500), f_akrun_diff(500), f_akrun_bindout(500), f_cole(500), f_forloop(500), times = 5)
# Unit: milliseconds
# expr min lq mean median uq max neval
# f_dc() 4270.2451 4535.4618 4543.85930 4539.3032 4613.5823 4760.7041 5
# f_akrun() 936.3249 951.3230 970.27424 956.3674 992.3162 1015.0397 5
# f_akrun_diff() 2501.3871 2509.5429 2589.47288 2608.5254 2649.3819 2678.5271 5
# f_akrun_bindout() 108.3761 108.7238 113.26746 112.2521 118.4673 118.5180 5
# f_cole() 23.1283 23.4074 24.75386 23.9244 26.4594 26.8498 5
# f_forloop() 20.4243 21.1367 23.26222 21.2130 22.5616 30.9755 5
我们可以使用 map
并避免多个 pivot_wider
步骤
library(purrr)
library(dplyr)
set.seed(1)
out <- map_dfr(seq_len(R), ~ {
ib <- sample(1:nrow(df), replace = TRUE)
df %>%
slice(ib) %>%
summarise(beffect = mean(disp[vs == by2], na.rm = TRUE) -
mean(disp[vs == by1], na.rm = TRUE))
})
-检查中
mean(out$beffect)
#[1] 175.9203
sd(out$beffect)
#[1] 29.3409
或者可以使用 diff
而不是 pivot_wider
set.seed(1)
out2 <- replicate(R, df %>%
slice_sample(prop = 1, replace = TRUE) %>%
group_by(vs) %>%
summarise(m = mean(disp), .groups = 'drop') %>%
summarise(beffect = diff(m[2:1])), simplify = FALSE) %>%
bind_rows
-检查中
mean(out2$beffect)
#[1] 175.9203
或者另一种选择是执行 sample
,将它们与组标识符绑定在一起,使用它来提取列的值,按组标识符和 'vs' 进行分组并得到 mean
set.seed(1)
out3 <- replicate(R, sample(seq_len(nrow(df)), replace = TRUE) %>%
as_tibble, simplify = FALSE) %>%
bind_rows(.id = 'grp') %>%
mutate(vs = df$vs[value], disp = df$disp[value]) %>%
group_by(grp, vs) %>%
summarise(beffect = mean(disp), .groups = 'drop_last') %>%
group_by(grp) %>%
summarise(beffect = diff(beffect[2:1]), .groups = 'drop')
-检查中
mean(out3$beffect)
#[1] 175.9203
基准
system.time({set.seed(1)
out3 <- replicate(R, sample(seq_len(nrow(df)), replace = TRUE) %>%
as_tibble, simplify = FALSE) %>%
bind_rows(.id = 'grp') %>%
mutate(vs = df$vs[value], disp = df$disp[value]) %>%
group_by(grp, vs) %>%
summarise(beffect = mean(disp), .groups = 'drop_last') %>%
group_by(grp) %>%
summarise(beffect = diff(beffect[2:1]), .groups = 'drop')})
# user system elapsed
# 0.202 0.007 0.208
或者用map
system.time({
set.seed(1)
out <- map_dfr(seq_len(R), ~ {
ib <- sample(1:nrow(df), replace = TRUE)
df %>%
slice(ib) %>%
summarise(beffect = mean(disp[vs == by2], na.rm = TRUE) -
mean(disp[vs == by1], na.rm = TRUE))
})
})
# user system elapsed
# 1.329 0.013 1.338
或者代替 pivot_wider
,取 diff
system.time({set.seed(1)
out2 <- replicate(R, df %>%
slice_sample(prop = 1, replace = TRUE) %>%
group_by(vs) %>%
summarise(m = mean(disp), .groups = 'drop') %>%
summarise(beffect = diff(m[2:1])), simplify = FALSE) %>%
bind_rows
})
# user system elapsed
# 3.753 0.027 3.758
或 data.table
library(data.table)
system.time({
setDT(df)
set.seed(1)
out3 <- rbindlist(
replicate(R,
df[df[, .I[sample(seq_len(.N), replace = TRUE)]
]][, .(m = mean(disp)), vs][, .(beffect = m[2]- m[1])],
simplify = FALSE)
)
})
# user system elapsed
# 1.181 0.055 1.230
-OP的方法
system.time({replicate(R, {
df %>%
slice_sample(prop=1, replace = TRUE) %>%
group_by(vs) %>%
summarise(m=mean(disp)) %>%
pivot_wider(names_from = "vs", values_from = "m") %>%
transmute(x=!!ensym(by2) - !!ensym(by1))
}, simplify = FALSE)})
user system elapsed
6.991 0.063 7.009
microbenchmark::microbenchmark(f_dc(), f_akrun1(), f_akrun2(), f_akrun3(), f_forloop(), times = 5)
Unit: milliseconds
expr min lq mean median uq max neval cld
f_dc() 6453.14052 6512.34196 6772.0079 6534.08171 6939.61358 7420.86152 5 d
f_akrun1() 1288.96812 1328.96075 1377.0833 1353.79346 1372.30852 1541.38573 5 b
f_akrun2() 3685.33619 3703.33018 3814.8367 3801.52657 3915.75432 3968.23609 5 c
f_akrun3() 178.30997 179.77604 194.0712 189.18425 205.37485 217.71095 5 a
f_forloop() 30.11329 33.37171 35.0534 36.80903 36.95909 38.01389 5 a
这可能忽略了显而易见的东西,但是 tidyverse 等价于 for 循环会涉及 purrr::map()
之类的东西。最简单的转换是使用 purrr::map_dbl(1:R, ...)
例如:
library(purrr)
## better for memory and performance to extract vectors ahead of loop
disp = dt$disp
vs = dt$vs
map_dbl(1:R,
~ {
ib = sample(nrow(df), replace = TRUE)
xi = disp[ib]
byi = vs[ib]
mean(xi[byi == by2], na.rm = TRUE) - mean(xi[byi == by1], na.rm = TRUE)
})
此外,由于 by
是二进制的,您可以通过将其转换为 rcpp.