快速应用和操作到 R 中的成对列
Quickly apply & operation to pairs of columns in R
假设我有两个大 data.tables
并且需要使用 &
操作成对组合它们的列。组合由 grid
决定(将 dt1 column1 与 dt2 column2 组合等)
现在我正在使用 mclapply
循环,当我 运行 完整数据集时脚本需要几个小时。我尝试将数据转换为矩阵并使用矢量化方法,但这花费了更长的时间。有没有更快 and/or 更优雅的方法来做到这一点?
mx1 <- replicate(10, sample(c(T,F), size = 1e6, replace = T)) # 1e6 rows x 10 columns
mx1 <- as.data.table(mx1)
colnames(mx1) <- LETTERS[1:10]
mx2 <- replicate(10, sample(c(T,F), size = 1e6, replace = T)) # 1e6 rows x 10 columns
mx2 <- as.data.table(mx2)
colnames(mx2) <- letters[1:10]
grid <- expand.grid(col1 = colnames(mx1), col2 = colnames(mx2)) # the combinations I want to evaluate
out <- new_layer <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) { # <--- mclapply loop
mx1[[col1]] & mx2[[col2]]
}, SIMPLIFY = F)
setDT(out) # convert output into data table
colnames(out) <- paste(grid$col1, grid$col2, sep = "_")
对于上下文,此数据来自基因表达矩阵,其中 1 行 = 1 个单元格
这可以不用 mapply
直接完成:只要确保 with
参数是 FALSE
即:
mx1[, grid$col1, with = FALSE] & mx2[, grid$col2, with=FALSE]
经过一番挖掘,我发现了一个名为 bit
的包,它是专门为快速布尔运算而设计的。将我的 data.table
的每一列从 logical
转换为 bit
使我的计算速度提高了 100 倍。
# Load libraries.
library(data.table)
library(bit)
# Create data set.
mx1 <- replicate(10, sample(c(T,F), size = 5e6, replace = T)) # 5e6 rows x 10 columns
colnames(mx1) <- LETTERS[1:10]
mx2 <- replicate(10, sample(c(T,F), size = 5e6, replace = T)) # 5e6 rows x 10 columns
colnames(mx2) <- letters[1:10]
grid <- expand.grid(col1 = colnames(mx1), col2 = colnames(mx2)) # combinations I want to evaluate
# Single operation with logical matrix.
system.time({
out <- mx1[, grid$col1] & mx2[, grid$col2]
}) # 26.014s
# Loop with logical matrix.
system.time({
out <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) {
mx1[, col1] & mx2[, col2]
})
}) # 31.914s
# Single operation with logical data.table.
mx1.dt <- as.data.table(mx1)
mx2.dt <- as.data.table(mx2)
system.time({
out <- mx1.dt[, grid$col1, with = F] & mx2.dt[, grid$col2, with = F] # 26.014s
}) # 32.349s
# Loop with logical data.table.
system.time({
out <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) {
mx1.dt[[col1]] & mx2.dt[[col2]]
})
}) # 15.031s <---- SECOND FASTEST TIME, ~2X IMPROVEMENT
# Loop with bit data.table.
mx1.bit <- mx1.dt[, lapply(.SD, as.bit)]
mx2.bit <- mx2.dt[, lapply(.SD, as.bit)]
system.time({
out <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) {
mx1.bit[[col1]] & mx2.bit[[col2]]
})
}) # 0.383s <---- FASTEST TIME, ~100X IMPROVEMENT
# Convert back to logical table.
out <- setDT(out)
colnames(out) <- paste(grid$col1, grid$col2, sep = "_")
out <- out[, lapply(.SD, as.logical)]
还有一些特殊函数,例如 sum.bit
和 ri
,您可以使用它们聚合数据而无需将其转换回逻辑。
假设我有两个大 data.tables
并且需要使用 &
操作成对组合它们的列。组合由 grid
决定(将 dt1 column1 与 dt2 column2 组合等)
现在我正在使用 mclapply
循环,当我 运行 完整数据集时脚本需要几个小时。我尝试将数据转换为矩阵并使用矢量化方法,但这花费了更长的时间。有没有更快 and/or 更优雅的方法来做到这一点?
mx1 <- replicate(10, sample(c(T,F), size = 1e6, replace = T)) # 1e6 rows x 10 columns
mx1 <- as.data.table(mx1)
colnames(mx1) <- LETTERS[1:10]
mx2 <- replicate(10, sample(c(T,F), size = 1e6, replace = T)) # 1e6 rows x 10 columns
mx2 <- as.data.table(mx2)
colnames(mx2) <- letters[1:10]
grid <- expand.grid(col1 = colnames(mx1), col2 = colnames(mx2)) # the combinations I want to evaluate
out <- new_layer <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) { # <--- mclapply loop
mx1[[col1]] & mx2[[col2]]
}, SIMPLIFY = F)
setDT(out) # convert output into data table
colnames(out) <- paste(grid$col1, grid$col2, sep = "_")
对于上下文,此数据来自基因表达矩阵,其中 1 行 = 1 个单元格
这可以不用 mapply
直接完成:只要确保 with
参数是 FALSE
即:
mx1[, grid$col1, with = FALSE] & mx2[, grid$col2, with=FALSE]
经过一番挖掘,我发现了一个名为 bit
的包,它是专门为快速布尔运算而设计的。将我的 data.table
的每一列从 logical
转换为 bit
使我的计算速度提高了 100 倍。
# Load libraries.
library(data.table)
library(bit)
# Create data set.
mx1 <- replicate(10, sample(c(T,F), size = 5e6, replace = T)) # 5e6 rows x 10 columns
colnames(mx1) <- LETTERS[1:10]
mx2 <- replicate(10, sample(c(T,F), size = 5e6, replace = T)) # 5e6 rows x 10 columns
colnames(mx2) <- letters[1:10]
grid <- expand.grid(col1 = colnames(mx1), col2 = colnames(mx2)) # combinations I want to evaluate
# Single operation with logical matrix.
system.time({
out <- mx1[, grid$col1] & mx2[, grid$col2]
}) # 26.014s
# Loop with logical matrix.
system.time({
out <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) {
mx1[, col1] & mx2[, col2]
})
}) # 31.914s
# Single operation with logical data.table.
mx1.dt <- as.data.table(mx1)
mx2.dt <- as.data.table(mx2)
system.time({
out <- mx1.dt[, grid$col1, with = F] & mx2.dt[, grid$col2, with = F] # 26.014s
}) # 32.349s
# Loop with logical data.table.
system.time({
out <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) {
mx1.dt[[col1]] & mx2.dt[[col2]]
})
}) # 15.031s <---- SECOND FASTEST TIME, ~2X IMPROVEMENT
# Loop with bit data.table.
mx1.bit <- mx1.dt[, lapply(.SD, as.bit)]
mx2.bit <- mx2.dt[, lapply(.SD, as.bit)]
system.time({
out <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) {
mx1.bit[[col1]] & mx2.bit[[col2]]
})
}) # 0.383s <---- FASTEST TIME, ~100X IMPROVEMENT
# Convert back to logical table.
out <- setDT(out)
colnames(out) <- paste(grid$col1, grid$col2, sep = "_")
out <- out[, lapply(.SD, as.logical)]
还有一些特殊函数,例如 sum.bit
和 ri
,您可以使用它们聚合数据而无需将其转换回逻辑。