按组引导 data.table 中的 2 个指标
bootstrapping of 2 metrics in a data.table by group
我正在尝试按组在 data.table 中提升 2 个指标。
library(data.table)
library(boot)
library(moments)
mystat2 <- function(data, i) {
m <- mean(data[i])
s <- skewness(data[i])
list(m = m, s = s)
}
AAval_len <- 10
ABval_len <- 15
BDval_len <- 8
AAval <- rnorm(AAval_len)
ABval <- rnorm(ABval_len, mean = 2)
BDval <- rnorm(BDval_len, mean = 3)
dt <- data.table(val = c(AAval, ABval, BDval ),
group = c(rep("A", AAval_len), rep("A", ABval_len), rep("B", BDval_len)),
subgroup = c(rep("A", AAval_len), rep("B", ABval_len), rep("D", BDval_len)))
dt[, boot.ci(boot(val, mystat2, R = 1e3), cconf = 0.95, type = "perc"), by = list(group, subgroup)]
预期结果是 data.table 类似于以下内容:
expected <- data.table(group = c("A", "A", "B" ),
subgroup = c("A", "B", "D"),
level = c(0.95, 0.95, 0.95),
mean_percentile_low = c(-0.1, 1.9, 2.9),
mean_percentile_high = c(0.1, 2.1, 3.1),
skew_percentile_low = c(-0.6, -0.6, -0.6),
skew_percentile_high = c(0.6, 0.6, 0.6))
所以我遇到了两个问题。
首先,引导函数有问题,我的统计函数返回 2 个指标而不是 1 个:
> dt[, boot.ci(boot(val, mystat2, R = 1e3), cconf = 0.95, type = "perc"), by = list(group, subgroup)]
Error in t.star[r, ] <- res[[r]] :
incorrect number of subscripts on matrix
其次,我的 data.table 在汇总结果时遇到问题:
> dt[, boot.ci(boot(val, mystat2, R = 1e3), cconf = 0.95, type = "perc"), by = list(group, subgroup)]
Error in `[.data.table`(dt, , boot.ci(boot(val, mystat2, R = 1000), cconf = 0.95, :
All items in j=list(...) should be atomic vectors or lists. If you are trying something like j=list(.SD,newcol=mean(colA)) then use := by group instead (much quicker), or cbind or merge afterwards.
很确定我可以用 For 循环处理它,但希望有一个更优雅的解决方案...
编辑:
我很接近但不在那里,使用下面的代码我只能查看一个统计数据并且没有正确命名列...
mystat3 <- function(data, i) {
m <- mean(data[i])
s <- skewness(data[i])
list(m = m, s = s)
m
}
dt[, {
tmp <- .SD[, boot.ci(boot(val, mystat3, R = 1e3), conf = 0.95, type = "perc")]
l <- list(level = tmp$percent[1], low <- tmp$percent[4], high <- tmp$percent[5])
}, by = list(group, subgroup)]
您可以进行以下调整:
- 您的函数
mystat2
应该 return 统计向量,而不是列表。更改如下
mystat2 <- function(data, i) {
c(mean(data[i]), skewness(data[i]))
}
- 你可以在每个统计数据上单独使用
boot.ci
,使用index
参数,然后收集conf级别和低值和高值,return所有五个在一个命名列表。您可以在像这样的函数中执行所有这些操作,包括初始 bootstrap:
get_boot_ci <- function(val) {
b_out = boot(val,mystat2,R=1e3)
ci_mean = boot.ci(b_out, type="perc", index=1)$percent[c(1,4:5)]
ci_skew = boot.ci(b_out, type="perc", index=2)$percent[c(1,4:5)]
list("level" = ci_mean[1],
"mean_percentile_low" = ci_mean[2],
"mean_percentile_high" = ci_mean[3],
"skew_percentile_low" = ci_skew[2],
"skew_percentile_high" = ci_skew[3]
)
}
- 现在,只需将上述函数应用到您感兴趣的组
val
列
dt[, get_boot_ci(val), by=.(group, subgroup)]
输出:
group subgroup level mean_percentile_low mean_percentile_high skew_percentile_low skew_percentile_high
1: A A 0.95 -0.7117178 0.2915394 -1.5501161 1.4951826
2: A B 0.95 1.4666828 2.4285060 -2.2816065 0.5033563
3: B D 0.95 2.4914297 3.3688430 -0.5834545 1.6190134
我正在尝试按组在 data.table 中提升 2 个指标。
library(data.table)
library(boot)
library(moments)
mystat2 <- function(data, i) {
m <- mean(data[i])
s <- skewness(data[i])
list(m = m, s = s)
}
AAval_len <- 10
ABval_len <- 15
BDval_len <- 8
AAval <- rnorm(AAval_len)
ABval <- rnorm(ABval_len, mean = 2)
BDval <- rnorm(BDval_len, mean = 3)
dt <- data.table(val = c(AAval, ABval, BDval ),
group = c(rep("A", AAval_len), rep("A", ABval_len), rep("B", BDval_len)),
subgroup = c(rep("A", AAval_len), rep("B", ABval_len), rep("D", BDval_len)))
dt[, boot.ci(boot(val, mystat2, R = 1e3), cconf = 0.95, type = "perc"), by = list(group, subgroup)]
预期结果是 data.table 类似于以下内容:
expected <- data.table(group = c("A", "A", "B" ),
subgroup = c("A", "B", "D"),
level = c(0.95, 0.95, 0.95),
mean_percentile_low = c(-0.1, 1.9, 2.9),
mean_percentile_high = c(0.1, 2.1, 3.1),
skew_percentile_low = c(-0.6, -0.6, -0.6),
skew_percentile_high = c(0.6, 0.6, 0.6))
所以我遇到了两个问题。
首先,引导函数有问题,我的统计函数返回 2 个指标而不是 1 个:
> dt[, boot.ci(boot(val, mystat2, R = 1e3), cconf = 0.95, type = "perc"), by = list(group, subgroup)]
Error in t.star[r, ] <- res[[r]] :
incorrect number of subscripts on matrix
其次,我的 data.table 在汇总结果时遇到问题:
> dt[, boot.ci(boot(val, mystat2, R = 1e3), cconf = 0.95, type = "perc"), by = list(group, subgroup)]
Error in `[.data.table`(dt, , boot.ci(boot(val, mystat2, R = 1000), cconf = 0.95, :
All items in j=list(...) should be atomic vectors or lists. If you are trying something like j=list(.SD,newcol=mean(colA)) then use := by group instead (much quicker), or cbind or merge afterwards.
很确定我可以用 For 循环处理它,但希望有一个更优雅的解决方案...
编辑:
我很接近但不在那里,使用下面的代码我只能查看一个统计数据并且没有正确命名列...
mystat3 <- function(data, i) {
m <- mean(data[i])
s <- skewness(data[i])
list(m = m, s = s)
m
}
dt[, {
tmp <- .SD[, boot.ci(boot(val, mystat3, R = 1e3), conf = 0.95, type = "perc")]
l <- list(level = tmp$percent[1], low <- tmp$percent[4], high <- tmp$percent[5])
}, by = list(group, subgroup)]
您可以进行以下调整:
- 您的函数
mystat2
应该 return 统计向量,而不是列表。更改如下
mystat2 <- function(data, i) {
c(mean(data[i]), skewness(data[i]))
}
- 你可以在每个统计数据上单独使用
boot.ci
,使用index
参数,然后收集conf级别和低值和高值,return所有五个在一个命名列表。您可以在像这样的函数中执行所有这些操作,包括初始 bootstrap:
get_boot_ci <- function(val) {
b_out = boot(val,mystat2,R=1e3)
ci_mean = boot.ci(b_out, type="perc", index=1)$percent[c(1,4:5)]
ci_skew = boot.ci(b_out, type="perc", index=2)$percent[c(1,4:5)]
list("level" = ci_mean[1],
"mean_percentile_low" = ci_mean[2],
"mean_percentile_high" = ci_mean[3],
"skew_percentile_low" = ci_skew[2],
"skew_percentile_high" = ci_skew[3]
)
}
- 现在,只需将上述函数应用到您感兴趣的组
val
列
dt[, get_boot_ci(val), by=.(group, subgroup)]
输出:
group subgroup level mean_percentile_low mean_percentile_high skew_percentile_low skew_percentile_high
1: A A 0.95 -0.7117178 0.2915394 -1.5501161 1.4951826
2: A B 0.95 1.4666828 2.4285060 -2.2816065 0.5033563
3: B D 0.95 2.4914297 3.3688430 -0.5834545 1.6190134