Summarize (counts) by column efficiently
I have a large table, similar to datadf below, with about 3 million columns and rows. I have seen several approaches on Stack Overflow for producing the summary I expect, but even the fastest of them is very slow on my table. EDIT: thanks to the comments, several of the approaches are now satisfactory.
Create sample data
library(data.table)
datadf <- data.frame(var1 = rep(c("T", "A", "G", "C"), each = 3),
                     var2 = rep(c("A", "T", "G", "C"), each = 3),
                     var3 = rep("-", 12),
                     stringsAsFactors = FALSE)
# Sample 1000 rows and 1000 columns with replacement to inflate the toy table
datadf <- datadf[sample(1:nrow(datadf), 1000, TRUE), sample(1:ncol(datadf), 1000, TRUE)]
dataDT <- as.data.table(datadf)
dataDT[1:7,1:20]
# var2 var2.1 var3 var3.1 var1 var3.2 var2.2 var3.3 var2.3 var1.1 var3.4 var2.4 var1.2 var3.5 var2.5 var2.6 var1.3 var3.6 var2.7 var1.4
# 1: T T - - A - T - T A - T A - T T A - T A
# 2: G G - - G - G - G G - G G - G G G - G G
# 3: T T - - A - T - T A - T A - T T A - T A
# 4: C C - - C - C - C C - C C - C C C - C C
# 5: C C - - C - C - C C - C C - C C C - C C
# 6: A A - - T - A - A T - A T - A A T - A T
# 7: T T - - A - T - T A - T A - T T A - T A
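One note, not in the original post: the sampling above is unseeded, so the exact counts in the output below will vary between runs. Seeding the RNG before the sample() calls makes the example reproducible:

# Hypothetical addition: run before the two sample() calls above so that
# the sampled table, and hence all counts reported below, are reproducible.
set.seed(42)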
Benchmark
library(tidyverse)
library(microbenchmark)

lev <- unique(unlist(datadf))
microbenchmark(
  # base R, edited based on a comment from @fishtank
  "sapply-table" = sapply(datadf, function(x) table(factor(x, levels = lev, ordered = TRUE))),
  # tidyverse, edited based on a comment from @AntoniosK
  "gather-count-spread" = datadf %>% gather() %>% count(key, value) %>% spread(key, n, fill = 0L),
  # data.table, my original attempt
  "dcast-melt" = dcast(melt(dataDT, id = 1:1000, measure = 1:1000)[, 1001:1002][, `:=`(Count = .N), by = .(variable, value)],
                       value ~ variable, value.var = "value", fun.aggregate = length),
  # answer from @bk18; run separately as myDcast <- dcast.data.table(...) to get the result shown below
  "data.table @bk18" = dcast.data.table(
    melt.data.table(dataDT, measure.vars = colnames(dataDT))[, .N, .(variable, value)],
    value ~ variable,
    value.var = "N",
    fill = 0
  ),
  times = 1
)
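For reference (my addition, not from the thread): passing a fixed levels vector to factor() is what makes the base-R variant work, since it forces every per-column table to the same length, letting sapply() simplify the list of tables into a levels-by-columns matrix. A minimal sketch of that step in isolation, assuming datadf and lev as defined above:

# Each column's table now has one cell per level, in the same order,
# so sapply() returns a length(lev)-by-ncol(datadf) count matrix.
counts <- sapply(datadf, function(x) table(factor(x, levels = lev, ordered = TRUE)))
dim(counts)    # length(lev) rows, 1000 columns
counts[, 1:3]  # counts for the first three sampled columns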
Results
myDcast[,1:20]
# value var2 var2.1 var3 var3.1 var1 var3.2 var2.2 var3.3 var2.3 var1.1 var3.4 var2.4 var1.2 var3.5 var2.5 var2.6 var1.3 var3.6 var2.7
# 1: - 0 0 1000 1000 0 1000 0 1000 0 0 1000 0 0 1000 0 0 0 1000 0
# 2: A 254 254 0 0 280 0 254 0 254 280 0 254 280 0 254 254 280 0 254
# 3: C 230 230 0 0 230 0 230 0 230 230 0 230 230 0 230 230 230 0 230
# 4: G 236 236 0 0 236 0 236 0 236 236 0 236 236 0 236 236 236 0 236
# 5: T 280 280 0 0 254 0 280 0 280 254 0 280 254 0 280 280 254 0 280
                expr         min          lq        mean      median          uq         max neval
        sapply-table    89.77978    89.77978    89.77978    89.77978    89.77978    89.77978     1
 gather-count-spread   849.83078   849.83078   849.83078   849.83078   849.83078   849.83078     1
          dcast-melt 19938.71910 19938.71910 19938.71910 19938.71910 19938.71910 19938.71910     1
    data.table @bk18    46.92746    46.92746    46.92746    46.92746    46.92746    46.92746     1
This is about twice as fast as the data.table approach you posted, and it should scale well with the size of your data set:
setDT(datadf)  # convert to data.table by reference, avoiding a copy
dcast.data.table(
  # melt to long form, count each (column, value) pair once with .N,
  # then cast back so dcast only reshapes the precomputed counts
  melt.data.table(datadf, measure.vars = colnames(datadf))[, .N, .(variable, value)],
  value ~ variable,
  value.var = "N",
  fill = 0
)
I would be interested to see benchmarks on your full data set, since not all of these approaches scale similarly.
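As a rough way to probe that scaling (a sketch I am adding, not from the thread), one could regenerate the sampled table at increasing sizes and time just this approach; the construction below mirrors the example data above:

# Hypothetical scaling probe: rebuild the toy table at several sizes and
# time the melt / .N / dcast pipeline at each one.
library(data.table)
pool <- data.frame(var1 = rep(c("T", "A", "G", "C"), each = 3),
                   var2 = rep(c("A", "T", "G", "C"), each = 3),
                   var3 = rep("-", 12), stringsAsFactors = FALSE)
for (n in c(1000, 4000, 16000)) {
  d <- as.data.table(pool[sample(nrow(pool), n, TRUE), sample(ncol(pool), n, TRUE)])
  secs <- system.time(
    dcast.data.table(
      melt.data.table(d, measure.vars = colnames(d))[, .N, .(variable, value)],
      value ~ variable, value.var = "N", fill = 0
    )
  )["elapsed"]
  cat(n, "x", n, "table:", secs, "seconds\n")
}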