如何使用 tidyverse 编写自举均值差?

How to write a bootstrapped mean difference using tidyverse?

我对根据二元协变量的某些变量的均值差异感兴趣。

我正在通过引导计算此差异的置信区间:

library(tidyverse)
df = mtcars %>% 
   select(disp, vs) %>% 
   mutate(vs=factor(vs, labels=c("vshaped", "straight")))
by1="straight"
by2="vshaped"
R=1000

set.seed(1)
beffect = numeric(length=R)
for (i in 1:R) {
    ib = sample(1:nrow(df), replace = TRUE)
    xi = df$disp[ib]
    byi = df$vs[ib]
    beffect[i] = mean(xi[byi==by2], na.rm = TRUE) - mean(xi[byi==by1], na.rm = TRUE)
}
mean(beffect)
#> [1] 175.9203
sd(beffect)
#> [1] 29.3409

reprex package (v2.0.0)

于 2021 年 6 月 13 日创建

这行得通,但我发现它很难读,我想知道它的效率,因为 for 循环在 R 中通常被认为是一个糟糕的设计。

作为 tidyverse 的重度用户,我想使用这个框架重写它。

是否有快速且可读的方法?

PS: 这是我能得到的最接近的,但它远没有提高可读性,而且慢了 250 倍:

beffect2 = replicate(R, {
    df %>% 
        slice_sample(prop=1, replace = TRUE) %>% 
        group_by(vs) %>% 
        summarise(m=mean(disp)) %>% 
        pivot_wider(names_from = "vs", values_from = "m") %>%
        transmute(x=!!ensym(by2) - !!ensym(by1))
}, simplify = FALSE) %>% 
    map_dbl(identity)

编辑:这里是目前所有方法的基准:


# with R=50 ***********

# microbenchmark::microbenchmark(f_dc(50), f_akrun(50), f_akrun_diff(50), f_akrun_bindout(50), f_cole(50), f_forloop(50), times = 5)
# Unit: milliseconds
#               expr      min       lq      mean   median       uq      max neval
#             f_dc() 497.4559 524.9582 560.94690 553.6271 572.2261 656.4672     5
#          f_akrun() 101.6295 108.5232 111.22400 110.7238 111.4105 123.8330     5
#     f_akrun_diff() 270.0261 283.3257 308.92806 283.6411 314.7233 392.9241     5
#  f_akrun_bindout()  21.8185  21.9725  76.68770  22.9811  30.2129 286.4535     5
#           f_cole()   2.7685   3.1343   3.63484   3.2679   4.4346   4.5689     5
#        f_forloop()   2.1136   2.1277   3.14156   3.4968   3.6740   4.2957     5

# with R=500 **********

# microbenchmark::microbenchmark(f_dc(500), f_akrun(500), f_akrun_diff(500), f_akrun_bindout(500), f_cole(500), f_forloop(500), times = 5)
# Unit: milliseconds
#               expr       min        lq       mean    median        uq       max neval
#             f_dc() 4270.2451 4535.4618 4543.85930 4539.3032 4613.5823 4760.7041     5
#          f_akrun()  936.3249  951.3230  970.27424  956.3674  992.3162 1015.0397     5
#     f_akrun_diff() 2501.3871 2509.5429 2589.47288 2608.5254 2649.3819 2678.5271     5
#  f_akrun_bindout()  108.3761  108.7238  113.26746  112.2521  118.4673  118.5180     5
#           f_cole()   23.1283   23.4074   24.75386   23.9244   26.4594   26.8498     5
#        f_forloop()   20.4243   21.1367   23.26222   21.2130   22.5616   30.9755     5

我们可以使用 map 并避免多个 pivot_wider 步骤

library(purrr)
library(dplyr)

set.seed(1)
out <- map_dfr(seq_len(R), ~ {
         ib <- sample(1:nrow(df), replace = TRUE)
         df %>% 
            slice(ib) %>%
            summarise(beffect = mean(disp[vs == by2], na.rm = TRUE) -
                  mean(disp[vs == by1], na.rm = TRUE))
      })

-检查中

mean(out$beffect)
#[1] 175.9203
sd(out$beffect)
#[1] 29.3409

或者可以使用 diff 而不是 pivot_wider

set.seed(1)
out2 <- replicate(R, df %>% 
       slice_sample(prop = 1, replace = TRUE) %>%
       group_by(vs) %>%
       summarise(m = mean(disp), .groups = 'drop') %>% 
       summarise(beffect = diff(m[2:1])), simplify = FALSE) %>% 
       bind_rows

-检查中

mean(out2$beffect)
#[1] 175.9203

或者另一种选择是执行 sample,将它们与组标识符绑定在一起,使用它来提取列的值,按组标识符和 'vs' 进行分组并得到 mean

set.seed(1)
out3 <- replicate(R, sample(seq_len(nrow(df)), replace = TRUE) %>%
   as_tibble, simplify = FALSE) %>%
   bind_rows(.id = 'grp') %>%
   mutate(vs = df$vs[value], disp = df$disp[value]) %>% 
   group_by(grp, vs) %>% 
   summarise(beffect = mean(disp), .groups = 'drop_last') %>% 
   group_by(grp) %>% 
   summarise(beffect = diff(beffect[2:1]), .groups = 'drop')

-检查中

mean(out3$beffect)
#[1] 175.9203

基准

system.time({set.seed(1)
 out3 <- replicate(R, sample(seq_len(nrow(df)), replace = TRUE) %>%
    as_tibble, simplify = FALSE) %>%
    bind_rows(.id = 'grp') %>%
    mutate(vs = df$vs[value], disp = df$disp[value]) %>% 
    group_by(grp, vs) %>% 
    summarise(beffect = mean(disp), .groups = 'drop_last') %>% 
    group_by(grp) %>% 
    summarise(beffect = diff(beffect[2:1]), .groups = 'drop')})
 #  user  system elapsed 
 # 0.202   0.007   0.208 

或者用map

system.time({
  set.seed(1)
 out <- map_dfr(seq_len(R), ~ {
          ib <- sample(1:nrow(df), replace = TRUE)
          df %>% 
             slice(ib) %>%
             summarise(beffect = mean(disp[vs == by2], na.rm = TRUE) -
                   mean(disp[vs == by1], na.rm = TRUE))
       })
 })
#   user  system elapsed 
#  1.329   0.013   1.338 

或者代替 pivot_wider,取 diff

system.time({set.seed(1)
    out2 <- replicate(R, df %>% 
       slice_sample(prop = 1, replace = TRUE) %>%
       group_by(vs) %>%
       summarise(m = mean(disp), .groups = 'drop') %>% 
       summarise(beffect = diff(m[2:1])), simplify = FALSE) %>% 
       bind_rows
     })
 #  user  system elapsed 
 # 3.753   0.027   3.758 

data.table

中的类似方法
library(data.table)
system.time({
setDT(df)
set.seed(1)
out3 <- rbindlist(
        replicate(R,  

              df[df[, .I[sample(seq_len(.N), replace = TRUE)]
                        ]][, .(m = mean(disp)), vs][, .(beffect = m[2]- m[1])],
                simplify = FALSE)
                         )
})
# user  system elapsed 
#  1.181   0.055   1.230 

-OP的方法

system.time({replicate(R, {
     df %>% 
         slice_sample(prop=1, replace = TRUE) %>% 
         group_by(vs) %>% 
         summarise(m=mean(disp)) %>% 
         pivot_wider(names_from = "vs", values_from = "m") %>%
         transmute(x=!!ensym(by2) - !!ensym(by1))
 }, simplify = FALSE)})
   user  system elapsed 
  6.991   0.063   7.009 
microbenchmark::microbenchmark(f_dc(), f_akrun1(), f_akrun2(), f_akrun3(), f_forloop(), times = 5)
Unit: milliseconds
        expr        min         lq      mean     median         uq        max neval  cld
      f_dc() 6453.14052 6512.34196 6772.0079 6534.08171 6939.61358 7420.86152     5    d
  f_akrun1() 1288.96812 1328.96075 1377.0833 1353.79346 1372.30852 1541.38573     5  b  
  f_akrun2() 3685.33619 3703.33018 3814.8367 3801.52657 3915.75432 3968.23609     5   c 
  f_akrun3()  178.30997  179.77604  194.0712  189.18425  205.37485  217.71095     5 a   
 f_forloop()   30.11329   33.37171   35.0534   36.80903   36.95909   38.01389     5 a   

这可能忽略了显而易见的东西,但是 等价于 for 循环会涉及 purrr::map() 之类的东西。最简单的转换是使用 purrr::map_dbl(1:R, ...) 例如:

library(purrr)
## better for memory and performance to extract vectors ahead of loop
disp = dt$disp
vs = dt$vs

map_dbl(1:R,
        ~ {
          ib = sample(nrow(df), replace = TRUE)
          xi = disp[ib]
          byi = vs[ib]
          mean(xi[byi == by2], na.rm = TRUE) - mean(xi[byi == by1], na.rm = TRUE)
          })

此外,由于 by 是二进制的,您可以通过将其转换为 .

来提高性能