按组引导 data.table 中的 2 个指标

bootstrapping of 2 metrics in a data.table by group

我正在尝试按组在 data.table 中提升 2 个指标。

library(data.table)
library(boot)
library(moments)

mystat2 <- function(data, i) {
  m <- mean(data[i])
  s <- skewness(data[i])
  list(m = m, s = s)
}


AAval_len <- 10
ABval_len <- 15
BDval_len <- 8
AAval <- rnorm(AAval_len)
ABval <- rnorm(ABval_len, mean = 2)
BDval <- rnorm(BDval_len, mean = 3)
dt <- data.table(val = c(AAval, ABval, BDval ),
                 group = c(rep("A", AAval_len),  rep("A", ABval_len), rep("B", BDval_len)),
                 subgroup = c(rep("A", AAval_len),  rep("B", ABval_len), rep("D", BDval_len)))


dt[, boot.ci(boot(val, mystat2, R = 1e3), cconf = 0.95, type = "perc"), by = list(group, subgroup)]

预期结果是 data.table 类似于以下内容:

expected <- data.table(group = c("A", "A", "B" ),
                       subgroup = c("A", "B", "D"),
                       level = c(0.95, 0.95, 0.95),
                       mean_percentile_low = c(-0.1, 1.9, 2.9),
                       mean_percentile_high = c(0.1, 2.1, 3.1),
                       skew_percentile_low = c(-0.6, -0.6, -0.6),
                       skew_percentile_high = c(0.6, 0.6, 0.6))

所以我遇到了两个问题。
首先,引导函数有问题,我的统计函数返回 2 个指标而不是 1 个:

> dt[, boot.ci(boot(val, mystat2, R = 1e3), cconf = 0.95, type = "perc"), by = list(group, subgroup)]
Error in t.star[r, ] <- res[[r]] : 
  incorrect number of subscripts on matrix

其次,我的 data.table 在汇总结果时遇到问题:

> dt[, boot.ci(boot(val, mystat2, R = 1e3), cconf = 0.95, type = "perc"), by = list(group, subgroup)]
Error in `[.data.table`(dt, , boot.ci(boot(val, mystat2, R = 1000), cconf = 0.95,  : 
  All items in j=list(...) should be atomic vectors or lists. If you are trying something like j=list(.SD,newcol=mean(colA)) then use := by group instead (much quicker), or cbind or merge afterwards.

很确定我可以用 For 循环处理它,但希望有一个更优雅的解决方案...

编辑:

我很接近但不在那里,使用下面的代码我只能查看一个统计数据并且没有正确命名列...

mystat3 <- function(data, i) {
  m <- mean(data[i])
  s <- skewness(data[i])
  list(m = m, s = s)
  m
}
dt[, { 
  tmp <-  .SD[, boot.ci(boot(val, mystat3, R = 1e3), conf = 0.95, type = "perc")]
  l <- list(level = tmp$percent[1], low <- tmp$percent[4], high <- tmp$percent[5])
  }, by = list(group, subgroup)]

您可以进行以下调整:

  1. 您的函数 mystat2 应该 return 统计向量,而不是列表。更改如下
mystat2 <- function(data, i) {
  c(mean(data[i]), skewness(data[i]))
}
  1. 你可以在每个统计数据上单独使用boot.ci,使用index参数,然后收集conf级别和低值和高值,return所有五个在一个命名列表。您可以在像这样的函数中执行所有这些操作,包括初始 bootstrap:
get_boot_ci <- function(val) {
  b_out = boot(val,mystat2,R=1e3)
  ci_mean = boot.ci(b_out, type="perc", index=1)$percent[c(1,4:5)]
  ci_skew = boot.ci(b_out, type="perc", index=2)$percent[c(1,4:5)]
  list("level" = ci_mean[1],
       "mean_percentile_low" = ci_mean[2],
       "mean_percentile_high" = ci_mean[3],
       "skew_percentile_low" = ci_skew[2],
       "skew_percentile_high" = ci_skew[3]
  )
}
  1. 现在,只需将上述函数应用到您感兴趣的组val
dt[, get_boot_ci(val), by=.(group, subgroup)]

输出:

   group subgroup level mean_percentile_low mean_percentile_high skew_percentile_low skew_percentile_high
1:     A        A  0.95          -0.7117178            0.2915394          -1.5501161            1.4951826
2:     A        B  0.95           1.4666828            2.4285060          -2.2816065            0.5033563
3:     B        D  0.95           2.4914297            3.3688430          -0.5834545            1.6190134