data.table R 中的按行模式
Row Wise Mode in data.table R
我正在尝试找到一种有效的方法来在 data.table
中的列子集上获取行模式
#Sample data
a <- data.frame(
id=letters[],
dattyp1 = sample( 1:2, 26, replace=T) ,
dattyp2 = sample( 1:2, 26, replace=T) ,
dattyp3 = sample( 1:2, 26, replace=T) ,
dattyp4 = sample( 1:2, 26, replace=T) ,
dattyp5 = sample( 1:2, 26, replace=T) ,
dattyp6 = sample( 1:2, 26, replace=T)
)
library(modeest)
library(data.table)
我从: 知道我可以做到这一点:
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
apply(a[ ,paste0("dattyp",1:6)], 1, Mode)
但这真的很慢(超过我的数百万条记录)。我在想一定有一种方法可以用 .SDcols 来做到这一点——但这确实是按列模式而不是按行模式。
a<- data.table( a )
a[ , lapply(.SD , mfv ), .SDcols=c(paste0("dattyp",1:6) ) ]
你可以试试这个——虽然我不确定它会快多少。注意,我正在获取 mfv 返回的第一个数字。
library(modeest)
library(data.table)
a <- data.frame(
id=letters[],
dattyp1 = sample( 1:2, 26, replace=T) ,
dattyp2 = sample( 1:2, 26, replace=T) ,
dattyp3 = sample( 1:2, 26, replace=T) ,
dattyp4 = sample( 1:2, 26, replace=T) ,
dattyp5 = sample( 1:2, 26, replace=T) ,
dattyp6 = sample( 1:2, 26, replace=T)
)
a<- data.table( a )
a[ , Mode:=mfv(c(dattyp1,dattyp2,dattyp3,dattyp4,dattyp5,dattyp6))[1],by=id ]
datatable 可能会更快。
申请:
microbenchmark(apply={
+ apply(a[ ,paste0("dattyp",1:6)], 1, Mode)
+ })
Unit: microseconds
expr min lq mean median uq max neval
apply 574.025 591.803 1056.807 624.988 704.396 39236.79 100
数据表来自:
microbenchmark({
+ a[ , Mode:=mfv(c(dattyp1,dattyp2,dattyp3,dattyp4,dattyp5,dattyp6))[1],by=id ]
+ })
Unit: milliseconds
expr min lq
{ a[, `:=`(Mode, mfv(c(dattyp1, dattyp2, dattyp3, dattyp4, dattyp5, dattyp6))[1]), by = id] } 2.44109 2.748053
mean median uq max neval
3.049809 2.898769 3.139559 6.398032 100
我认为通过 data.table 最快的方法仍然是转换为关系(即 long)格式并聚合,然后在 reldtMtd
函数中找到最大值,如下所示。不知道用Rcpp会不会更快
数据:
library(data.table)
M <- 1e6
popn <- 2
set.seed(0L)
a <- data.frame(
id=1:M,
dattyp1 = sample(popn, M, replace=TRUE),
dattyp2 = sample(popn, M, replace=TRUE),
dattyp3 = sample(popn, M, replace=TRUE),
dattyp4 = sample(popn, M, replace=TRUE),
dattyp5 = sample(popn, M, replace=TRUE),
dattyp6 = sample(popn, M, replace=TRUE)
)
setDT(a)
方法:
reldtMtd <- function() {
melt(a, id.vars="id")[,
.N, by=.(id, value)][,
value[which.max(N)], by=.(id)]
}
#from
Mode <- compiler::cmpfun(function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
})
Mode2 <- compiler::cmpfun(function(x) names(which.max(table(x))))
matA <- as.matrix(a[, -1L])
baseMtd1 <- function() apply(matA, 1, Mode)
baseMtd2 <- function() apply(matA, 1, Mode2)
library(microbenchmark)
microbenchmark(reldtMtd(), baseMtd1(), baseMtd2(), times=3L)
时间安排:
Unit: seconds
expr min lq mean median uq max neval
reldtMtd() 1.882783 1.947515 2.031767 2.012248 2.106259 2.20027 3
baseMtd1() 15.618716 15.675314 15.809277 15.731913 15.904557 16.07720 3
baseMtd2() 160.837513 161.692634 162.455048 162.547755 163.263816 163.97988 3
我正在尝试找到一种有效的方法来在 data.table
中的列子集上获取行模式#Sample data
a <- data.frame(
id=letters[],
dattyp1 = sample( 1:2, 26, replace=T) ,
dattyp2 = sample( 1:2, 26, replace=T) ,
dattyp3 = sample( 1:2, 26, replace=T) ,
dattyp4 = sample( 1:2, 26, replace=T) ,
dattyp5 = sample( 1:2, 26, replace=T) ,
dattyp6 = sample( 1:2, 26, replace=T)
)
library(modeest)
library(data.table)
我从:
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
apply(a[ ,paste0("dattyp",1:6)], 1, Mode)
但这真的很慢(超过我的数百万条记录)。我在想一定有一种方法可以用 .SDcols 来做到这一点——但这确实是按列模式而不是按行模式。
a<- data.table( a )
a[ , lapply(.SD , mfv ), .SDcols=c(paste0("dattyp",1:6) ) ]
你可以试试这个——虽然我不确定它会快多少。注意,我正在获取 mfv 返回的第一个数字。
library(modeest)
library(data.table)
a <- data.frame(
id=letters[],
dattyp1 = sample( 1:2, 26, replace=T) ,
dattyp2 = sample( 1:2, 26, replace=T) ,
dattyp3 = sample( 1:2, 26, replace=T) ,
dattyp4 = sample( 1:2, 26, replace=T) ,
dattyp5 = sample( 1:2, 26, replace=T) ,
dattyp6 = sample( 1:2, 26, replace=T)
)
a<- data.table( a )
a[ , Mode:=mfv(c(dattyp1,dattyp2,dattyp3,dattyp4,dattyp5,dattyp6))[1],by=id ]
datatable 可能会更快。 申请:
microbenchmark(apply={
+ apply(a[ ,paste0("dattyp",1:6)], 1, Mode)
+ })
Unit: microseconds
expr min lq mean median uq max neval
apply 574.025 591.803 1056.807 624.988 704.396 39236.79 100
数据表来自:
microbenchmark({
+ a[ , Mode:=mfv(c(dattyp1,dattyp2,dattyp3,dattyp4,dattyp5,dattyp6))[1],by=id ]
+ })
Unit: milliseconds
expr min lq
{ a[, `:=`(Mode, mfv(c(dattyp1, dattyp2, dattyp3, dattyp4, dattyp5, dattyp6))[1]), by = id] } 2.44109 2.748053
mean median uq max neval
3.049809 2.898769 3.139559 6.398032 100
我认为通过 data.table 最快的方法仍然是转换为关系(即 long)格式并聚合,然后在 reldtMtd
函数中找到最大值,如下所示。不知道用Rcpp会不会更快
数据:
library(data.table)
M <- 1e6
popn <- 2
set.seed(0L)
a <- data.frame(
id=1:M,
dattyp1 = sample(popn, M, replace=TRUE),
dattyp2 = sample(popn, M, replace=TRUE),
dattyp3 = sample(popn, M, replace=TRUE),
dattyp4 = sample(popn, M, replace=TRUE),
dattyp5 = sample(popn, M, replace=TRUE),
dattyp6 = sample(popn, M, replace=TRUE)
)
setDT(a)
方法:
reldtMtd <- function() {
melt(a, id.vars="id")[,
.N, by=.(id, value)][,
value[which.max(N)], by=.(id)]
}
#from
Mode <- compiler::cmpfun(function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
})
Mode2 <- compiler::cmpfun(function(x) names(which.max(table(x))))
matA <- as.matrix(a[, -1L])
baseMtd1 <- function() apply(matA, 1, Mode)
baseMtd2 <- function() apply(matA, 1, Mode2)
library(microbenchmark)
microbenchmark(reldtMtd(), baseMtd1(), baseMtd2(), times=3L)
时间安排:
Unit: seconds
expr min lq mean median uq max neval
reldtMtd() 1.882783 1.947515 2.031767 2.012248 2.106259 2.20027 3
baseMtd1() 15.618716 15.675314 15.809277 15.731913 15.904557 16.07720 3
baseMtd2() 160.837513 161.692634 162.455048 162.547755 163.263816 163.97988 3