R - 快速双样本 t 检验
R - fast two sample t test
我想在 R 中使用单独的分组执行两个样本 t 检验。 t.test 必须是 "unbiased",这意味着对于外部组(下面的组 2)中的所有事务,每个内部组(下面的组 1)的 T 测试必须是 运行,例如: "inner group A" 对比 "inner group not A"。下面显示的 for
循环代码可能比口头解释更清楚...
我当前的代码如下。有谁知道 faster/better 的方法吗?开放使用任何包,但目前使用 data.table
.
就上下文而言,我有大约 100 万行交易数据。第 1 组表示一个人(如果有多行,他们有多个事务)并包含 ~30k 唯一值。第 2 组表示邮政编码并包含约 500 个唯一值
谢谢!
library(data.table)
# fake data
grp1 <- c('A','A','A','B','B','C','C','D','D','D','D','E','E','E','F','F')
grp2 <- c(1,1,1,1,1,1,1, 2,2,2,2,2,2,2, 2,2)
vals <- c(10,20,30, 40,15, 25,60, 70,100,200,300, 400,1000,2000, 3000,5000)
DT <- data.table(grp1 = grp1, grp2 = grp2, vals = vals)
# "two sample t.test" --------------------------------------------------
# non vectorized, in-place
# runtime is ~50 mins for real data
for (z in DT[,unique(grp2)]){
for (c in DT[grp2 == z, unique(grp1)]) {
res = t.test(
DT[grp2 == z & grp1 == c, vals],
DT[grp2 == z & grp1 != c, vals],
alternative = 'greater'
)
DT[grp2 == z & grp1 == c, pval := res$p.value]
DT[grp2 == z & grp1 == c, tstat := res$statistic]
}
}
# vectorized, creates new summarized data.table
# runtime is 1-2 mins on real data
vec <- DT[,{
grp2_vector = vals
.SD[,.(tstat = t.test(vals, setdiff(grp2_vector, vals), alternative = 'g')$statistic,
pval = t.test(vals, setdiff(grp2_vector, vals), alternative = 'g')$p.value), by=grp1]
} , by=grp2]
stats::t.test
被概括并进行了一些检查。您可以只计算您需要的,即 t 统计量和 p 值,还可以利用 data.table
中的优化来计算长度、均值和方差。这是一个可能的方法:
#combinations of grp1 and grp2 and those not in grp1 for each grp2
comb <- unique(DT[, .(grp1, grp2)])[,
rbindlist(lapply(1:.N, function(n) .(g1=rep(grp1[n], .N-1L), notIn=grp1[-n]))),
.(g2=grp2)]
#this is optimized, switch on verbose to see the output
X <- DT[, .(nx=.N, mx=mean(vals), vx=var(vals)), .(grp1, grp2)] #, verbose=TRUE]
#calculate length, mean, var for values not in grp1
Y <- DT[comb, on=.(grp2=g2, grp1=notIn), allow.cartesian=TRUE][,
.(ny=.N, my=mean(vals), vy=var(vals)), by=.(grp1=g1, grp2=grp2)]
#calculate outputs based on stats:::t.test.default
ans <- X[Y, on=.(grp1, grp2)][, c("tstat", "pval") := {
stderrx <- sqrt(vx/nx)
stderry <- sqrt(vy/ny)
stderr <- sqrt(stderrx^2 + stderry^2)
df <- stderr^4/(stderrx^4/(nx - 1) + stderry^4/(ny - 1))
tstat <- (mx - my)/stderr
.(tstat, pt(tstat, df, lower.tail = FALSE))
}, by=1:Y[,.N]]
输出:
grp1 grp2 nx mx vx ny my vy tstat pval
1: C 1 2 42.500 612.50 5 23.0000 145.0000 1.06500150 0.22800432
2: B 1 2 27.500 312.50 5 29.0000 355.0000 -0.09950372 0.53511601
3: A 1 3 20.000 100.00 4 35.0000 383.3333 -1.31982404 0.87570431
4: F 2 2 4000.000 2000000.00 7 581.4286 489747.6190 3.30491342 0.08072148
5: E 2 3 1133.333 653333.33 6 1445.0000 4323350.0000 -0.32174451 0.62141500
6: D 2 4 167.500 10891.67 5 2280.0000 3292000.0000 -2.59809850 0.97016160
计时代码:
library(data.table) #data.table_1.12.4
set.seed(0L)
np <- 4.2e5
nzc <- 4.2e3
DT <- data.table(grp1=rep(1:np, each=5), grp2=rep(1:nzc, each=np/nzc*5),
vals=abs(rnorm(np*5, 5000, 2000)), key=c("grp1", "grp2"))
mtd0 <- function() {
DT[, {
grp2_vector <- vals
.SD[,{
tres <- t.test(vals, setdiff(grp2_vector, vals), alternative = 'g')
.(tstat=tres$statistic, pval=tres$p.value)
}, by=grp1]
} , by=grp2]
}
mtd1 <- function() {
comb <- unique(DT[, .(grp1, grp2)])[,
rbindlist(lapply(1:.N, function(n) .(g1=rep(grp1[n], .N-1L), notIn=grp1[-n]))),
.(g2=grp2)]
X <- DT[, .(nx=.N, mx=mean(vals), vx=var(vals)), .(grp1, grp2)] #, verbose=TRUE]
Y <- DT[comb, on=.(grp2=g2, grp1=notIn), allow.cartesian=TRUE][,
.(ny=.N, my=mean(vals), vy=var(vals)), by=.(grp1=g1, grp2=grp2)]
ans <- X[Y, on=.(grp1, grp2)][, c("tstat", "pval") := {
stderrx <- sqrt(vx/nx)
stderry <- sqrt(vy/ny)
stderr <- sqrt(stderrx^2 + stderry^2)
df <- stderr^4/(stderrx^4/(nx - 1) + stderry^4/(ny - 1))
tstat <- (mx - my)/stderr
.(tstat, pt(tstat, df, lower.tail = FALSE))
}, by=1:Y[,.N]]
}
microbenchmark::microbenchmark(mtd0(), mtd1(), times=1L)
时间:
Unit: seconds
expr min lq mean median uq max neval
mtd0() 65.76456 65.76456 65.76456 65.76456 65.76456 65.76456 1
mtd1() 18.29710 18.29710 18.29710 18.29710 18.29710 18.29710 1
我建议查看包 Rfast。有命令,例如ttest1,ttest2和ttests for one sample, 2 sample and many t
我想在 R 中使用单独的分组执行两个样本 t 检验。 t.test 必须是 "unbiased",这意味着对于外部组(下面的组 2)中的所有事务,每个内部组(下面的组 1)的 T 测试必须是 运行,例如: "inner group A" 对比 "inner group not A"。下面显示的 for
循环代码可能比口头解释更清楚...
我当前的代码如下。有谁知道 faster/better 的方法吗?开放使用任何包,但目前使用 data.table
.
就上下文而言,我有大约 100 万行交易数据。第 1 组表示一个人(如果有多行,他们有多个事务)并包含 ~30k 唯一值。第 2 组表示邮政编码并包含约 500 个唯一值
谢谢!
library(data.table)
# fake data
grp1 <- c('A','A','A','B','B','C','C','D','D','D','D','E','E','E','F','F')
grp2 <- c(1,1,1,1,1,1,1, 2,2,2,2,2,2,2, 2,2)
vals <- c(10,20,30, 40,15, 25,60, 70,100,200,300, 400,1000,2000, 3000,5000)
DT <- data.table(grp1 = grp1, grp2 = grp2, vals = vals)
# "two sample t.test" --------------------------------------------------
# non vectorized, in-place
# runtime is ~50 mins for real data
for (z in DT[,unique(grp2)]){
for (c in DT[grp2 == z, unique(grp1)]) {
res = t.test(
DT[grp2 == z & grp1 == c, vals],
DT[grp2 == z & grp1 != c, vals],
alternative = 'greater'
)
DT[grp2 == z & grp1 == c, pval := res$p.value]
DT[grp2 == z & grp1 == c, tstat := res$statistic]
}
}
# vectorized, creates new summarized data.table
# runtime is 1-2 mins on real data
vec <- DT[,{
grp2_vector = vals
.SD[,.(tstat = t.test(vals, setdiff(grp2_vector, vals), alternative = 'g')$statistic,
pval = t.test(vals, setdiff(grp2_vector, vals), alternative = 'g')$p.value), by=grp1]
} , by=grp2]
stats::t.test
被概括并进行了一些检查。您可以只计算您需要的,即 t 统计量和 p 值,还可以利用 data.table
中的优化来计算长度、均值和方差。这是一个可能的方法:
#combinations of grp1 and grp2 and those not in grp1 for each grp2
comb <- unique(DT[, .(grp1, grp2)])[,
rbindlist(lapply(1:.N, function(n) .(g1=rep(grp1[n], .N-1L), notIn=grp1[-n]))),
.(g2=grp2)]
#this is optimized, switch on verbose to see the output
X <- DT[, .(nx=.N, mx=mean(vals), vx=var(vals)), .(grp1, grp2)] #, verbose=TRUE]
#calculate length, mean, var for values not in grp1
Y <- DT[comb, on=.(grp2=g2, grp1=notIn), allow.cartesian=TRUE][,
.(ny=.N, my=mean(vals), vy=var(vals)), by=.(grp1=g1, grp2=grp2)]
#calculate outputs based on stats:::t.test.default
ans <- X[Y, on=.(grp1, grp2)][, c("tstat", "pval") := {
stderrx <- sqrt(vx/nx)
stderry <- sqrt(vy/ny)
stderr <- sqrt(stderrx^2 + stderry^2)
df <- stderr^4/(stderrx^4/(nx - 1) + stderry^4/(ny - 1))
tstat <- (mx - my)/stderr
.(tstat, pt(tstat, df, lower.tail = FALSE))
}, by=1:Y[,.N]]
输出:
grp1 grp2 nx mx vx ny my vy tstat pval
1: C 1 2 42.500 612.50 5 23.0000 145.0000 1.06500150 0.22800432
2: B 1 2 27.500 312.50 5 29.0000 355.0000 -0.09950372 0.53511601
3: A 1 3 20.000 100.00 4 35.0000 383.3333 -1.31982404 0.87570431
4: F 2 2 4000.000 2000000.00 7 581.4286 489747.6190 3.30491342 0.08072148
5: E 2 3 1133.333 653333.33 6 1445.0000 4323350.0000 -0.32174451 0.62141500
6: D 2 4 167.500 10891.67 5 2280.0000 3292000.0000 -2.59809850 0.97016160
计时代码:
library(data.table) #data.table_1.12.4
set.seed(0L)
np <- 4.2e5
nzc <- 4.2e3
DT <- data.table(grp1=rep(1:np, each=5), grp2=rep(1:nzc, each=np/nzc*5),
vals=abs(rnorm(np*5, 5000, 2000)), key=c("grp1", "grp2"))
mtd0 <- function() {
DT[, {
grp2_vector <- vals
.SD[,{
tres <- t.test(vals, setdiff(grp2_vector, vals), alternative = 'g')
.(tstat=tres$statistic, pval=tres$p.value)
}, by=grp1]
} , by=grp2]
}
mtd1 <- function() {
comb <- unique(DT[, .(grp1, grp2)])[,
rbindlist(lapply(1:.N, function(n) .(g1=rep(grp1[n], .N-1L), notIn=grp1[-n]))),
.(g2=grp2)]
X <- DT[, .(nx=.N, mx=mean(vals), vx=var(vals)), .(grp1, grp2)] #, verbose=TRUE]
Y <- DT[comb, on=.(grp2=g2, grp1=notIn), allow.cartesian=TRUE][,
.(ny=.N, my=mean(vals), vy=var(vals)), by=.(grp1=g1, grp2=grp2)]
ans <- X[Y, on=.(grp1, grp2)][, c("tstat", "pval") := {
stderrx <- sqrt(vx/nx)
stderry <- sqrt(vy/ny)
stderr <- sqrt(stderrx^2 + stderry^2)
df <- stderr^4/(stderrx^4/(nx - 1) + stderry^4/(ny - 1))
tstat <- (mx - my)/stderr
.(tstat, pt(tstat, df, lower.tail = FALSE))
}, by=1:Y[,.N]]
}
microbenchmark::microbenchmark(mtd0(), mtd1(), times=1L)
时间:
Unit: seconds
expr min lq mean median uq max neval
mtd0() 65.76456 65.76456 65.76456 65.76456 65.76456 65.76456 1
mtd1() 18.29710 18.29710 18.29710 18.29710 18.29710 18.29710 1
我建议查看包 Rfast。有命令,例如ttest1,ttest2和ttests for one sample, 2 sample and many t