在 R data.table 中粘贴多个列对的有效方法
Efficient way to paste multiple column pairs in R data.table
我正在寻找一种使用 data.table
同时 paste/combine 多对相邻列的有效方法。我的微弱尝试很慢,也不是很优雅:
library(data.table)
# Example data: an id column followed by three adjacent column pairs
# (a letter column and an integer column in each pair).
dt <- data.table(
  ids = 1:3,
  x1 = LETTERS[1:3], x2 = 1:3,
  y1 = LETTERS[4:6], y2 = 4:6,
  z1 = LETTERS[7:9], z3 = 7:9
)
#' Paste adjacent pairs of elements together
#'
#' Combines element 1 with element 2, element 3 with element 4, and so on.
#' A trailing unpaired element (odd-length input) is ignored.
#'
#' @param x A list-like object (e.g. `.SD` inside a data.table `j` expression)
#'   whose elements are laid out as adjacent pairs.
#' @param sep Separator placed between the two members of each pair.
#' @return An unnamed list with one pasted element per pair; an empty list
#'   when `x` holds fewer than two elements.
paste.pairs <- function(x, sep = "-") {
  n_pairs <- length(x) %/% 2
  # Position of the first member of each pair: 1, 3, 5, ...
  # Indexing with the plain loop counter (as the earlier version did) pastes
  # overlapping pairs (1-2, 2-3, 3-4) instead of adjacent ones.
  first <- 2L * seq_len(n_pairs) - 1L
  # paste() is vectorised, so elements longer than one value (groups with
  # several rows) are combined element-wise as well.
  lapply(first, function(i) paste(x[[i]], x[[i + 1L]], sep = sep))
}
# Run paste.pairs on .SD separately for each value of ids.
dt[, paste.pairs(.SD), by = "ids"]
有没有更好的方法?
一种选项是先用 seq 创建列索引,再用 Map 逐对粘贴:
# Column positions inside .SD (the table without the grouping column "ids"):
# i1 holds the first member of each pair, i2 the second member.
i1 <- seq(from = 1, to = length(dt) - 1, by = 2)
i2 <- seq(from = 2, to = length(dt) - 1, by = 2)
# Walk over both column sets in parallel and paste each pair with "-".
dt[,
  Map(
    function(first, second) paste(first, second, sep = "-"),
    .SD[, i1, with = FALSE],
    .SD[, i2, with = FALSE]
  ),
  by = "ids"
]
另一种选择是按数据集的名称拆分,然后 paste
# Group the non-id columns by their name with the trailing digits removed
# (x1, x2 -> "x"; y1, y2 -> "y"; ...), then paste the columns of each group
# together row-wise.
# The regex backslash has to be doubled: "\d" is not a valid escape in an R
# string literal and stops the code from parsing.
data.frame(lapply(
  split.default(dt[, -1, with = FALSE], sub("\\d+$", "", names(dt)[-1])),
  function(x) do.call(paste, c(x, sep = "-"))
))
# x y z
#1 A-1 D-4 G-7
#2 B-2 E-5 H-8
#3 C-3 F-6 I-9
或者另一个选项是 melt/dcast
# Reshape to long form, collapse the values of every (column prefix, ids)
# group with "-", then cast back to one column per prefix.
# The regex backslash has to be doubled: "\d" is not a valid escape in an R
# string literal and stops the code from parsing. `id.vars` is spelled out
# instead of relying on partial argument matching.
dcast(
  melt(dt, id.vars = "ids")[,
    paste(value, collapse = "-"),
    by = .(grp = sub("\\d+", "", variable), ids)
  ],
  ids ~ grp,
  value.var = "V1"
)
使用矩阵的解决方案
#create matrices
#use the columns you want to paste together...
# Split the paired columns into two matrices: first members (columns 2, 4, 6)
# and second members (columns 3, 5, 7). Adjust the positions to the columns
# you want to combine.
m1 <- as.matrix(dt[, c(2, 4, 6)])
m2 <- as.matrix(dt[, c(3, 5, 7)])
# paste() combines the matrices element by element but returns a plain
# vector, so give it the shape and column names of m1 again before
# converting the result back to a data.table.
as.data.table(structure(
  paste(m1, m2, sep = "-"),
  dim = dim(m1),
  dimnames = dimnames(m1)
))
应该运行得相当快,而且可读性强、易于修改。
输出
# x1 y1 z1
# 1: A-1 D-4 G-7
# 2: B-2 E-5 H-8
# 3: C-3 F-6 I-9
基准测试
# Time the four approaches on the example table, 10 runs each.
# The regex backslashes are doubled ("\\d"): a single "\d" is not a valid
# escape in an R string literal and prevents the whole call from parsing.
microbenchmark::microbenchmark(
  wimpel = {
    # create matrices
    m1 <- as.matrix(dt[, c(2, 4, 6)])
    m2 <- as.matrix(dt[, c(3, 5, 7)])
    # paste the matrices element-by-element, and convert to data.table
    as.data.table(matrix(paste(m1, m2, sep = "-"), nrow = nrow(m1), dimnames = dimnames(m1)))
  },
  akrun_df = {
    data.frame(lapply(
      split.default(dt[, -1, with = FALSE], sub("\\d+$", "", names(dt)[-1])),
      function(x) do.call(paste, c(x, sep = "-"))
    ))
  },
  akrun_map = {
    # No `by` here, so .SD still contains the ids column and the pairs
    # start at column 2.
    i1 <- seq(2, length(dt), 2)
    i2 <- seq(3, length(dt), 2)
    dt[, Map(paste, .SD[, i1, with = FALSE], .SD[, i2, with = FALSE], MoreArgs = list(sep = "-"))]
  },
  akrun_dcast = {
    dcast(
      melt(dt, id.vars = "ids")[, paste(value, collapse = "-"), .(grp = sub("\\d+", "", variable), ids)],
      ids ~ grp,
      value.var = "V1"
    )
  },
  times = 10
)
# Unit: microseconds
# expr min lq mean median uq max neval
# wimpel 303.072 315.122 341.2417 319.1895 327.775 531.429 10
# akrun_df 1022.790 1028.515 1251.7812 1069.1850 1172.519 2779.460 10
# akrun_map 742.013 751.051 785.6059 778.1650 799.855 884.812 10
# akrun_dcast 4104.719 4175.215 4414.6596 4348.7430 4650.911 4939.221 10
我正在寻找一种使用 data.table
同时 paste/combine 多对相邻列的有效方法。我的微弱尝试很慢,也不是很优雅:
library(data.table)
# Example data: an id column followed by three adjacent column pairs
# (a letter column and an integer column in each pair).
dt <- data.table(
  ids = 1:3,
  x1 = LETTERS[1:3], x2 = 1:3,
  y1 = LETTERS[4:6], y2 = 4:6,
  z1 = LETTERS[7:9], z3 = 7:9
)
#' Paste adjacent pairs of elements together
#'
#' Combines element 1 with element 2, element 3 with element 4, and so on.
#' A trailing unpaired element (odd-length input) is ignored.
#'
#' @param x A list-like object (e.g. `.SD` inside a data.table `j` expression)
#'   whose elements are laid out as adjacent pairs.
#' @param sep Separator placed between the two members of each pair.
#' @return An unnamed list with one pasted element per pair; an empty list
#'   when `x` holds fewer than two elements.
paste.pairs <- function(x, sep = "-") {
  n_pairs <- length(x) %/% 2
  # Position of the first member of each pair: 1, 3, 5, ...
  # Indexing with the plain loop counter (as the earlier version did) pastes
  # overlapping pairs (1-2, 2-3, 3-4) instead of adjacent ones.
  first <- 2L * seq_len(n_pairs) - 1L
  # paste() is vectorised, so elements longer than one value (groups with
  # several rows) are combined element-wise as well.
  lapply(first, function(i) paste(x[[i]], x[[i + 1L]], sep = sep))
}
# Run paste.pairs on .SD separately for each value of ids.
dt[, paste.pairs(.SD), by = "ids"]
有没有更好的方法?
一种选项是先用 seq 创建列索引,再用 Map 逐对粘贴:
# Column positions inside .SD (the table without the grouping column "ids"):
# i1 holds the first member of each pair, i2 the second member.
i1 <- seq(from = 1, to = length(dt) - 1, by = 2)
i2 <- seq(from = 2, to = length(dt) - 1, by = 2)
# Walk over both column sets in parallel and paste each pair with "-".
dt[,
  Map(
    function(first, second) paste(first, second, sep = "-"),
    .SD[, i1, with = FALSE],
    .SD[, i2, with = FALSE]
  ),
  by = "ids"
]
另一种选择是按数据集的名称拆分,然后 paste
# Group the non-id columns by their name with the trailing digits removed
# (x1, x2 -> "x"; y1, y2 -> "y"; ...), then paste the columns of each group
# together row-wise.
# The regex backslash has to be doubled: "\d" is not a valid escape in an R
# string literal and stops the code from parsing.
data.frame(lapply(
  split.default(dt[, -1, with = FALSE], sub("\\d+$", "", names(dt)[-1])),
  function(x) do.call(paste, c(x, sep = "-"))
))
# x y z
#1 A-1 D-4 G-7
#2 B-2 E-5 H-8
#3 C-3 F-6 I-9
或者另一个选项是 melt/dcast
# Reshape to long form, collapse the values of every (column prefix, ids)
# group with "-", then cast back to one column per prefix.
# The regex backslash has to be doubled: "\d" is not a valid escape in an R
# string literal and stops the code from parsing. `id.vars` is spelled out
# instead of relying on partial argument matching.
dcast(
  melt(dt, id.vars = "ids")[,
    paste(value, collapse = "-"),
    by = .(grp = sub("\\d+", "", variable), ids)
  ],
  ids ~ grp,
  value.var = "V1"
)
使用矩阵的解决方案
#create matrices
#use the columns you want to paste together...
# Split the paired columns into two matrices: first members (columns 2, 4, 6)
# and second members (columns 3, 5, 7). Adjust the positions to the columns
# you want to combine.
m1 <- as.matrix(dt[, c(2, 4, 6)])
m2 <- as.matrix(dt[, c(3, 5, 7)])
# paste() combines the matrices element by element but returns a plain
# vector, so give it the shape and column names of m1 again before
# converting the result back to a data.table.
as.data.table(structure(
  paste(m1, m2, sep = "-"),
  dim = dim(m1),
  dimnames = dimnames(m1)
))
应该运行得相当快,而且可读性强、易于修改。
输出
# x1 y1 z1
# 1: A-1 D-4 G-7
# 2: B-2 E-5 H-8
# 3: C-3 F-6 I-9
基准测试
# Time the four approaches on the example table, 10 runs each.
# The regex backslashes are doubled ("\\d"): a single "\d" is not a valid
# escape in an R string literal and prevents the whole call from parsing.
microbenchmark::microbenchmark(
  wimpel = {
    # create matrices
    m1 <- as.matrix(dt[, c(2, 4, 6)])
    m2 <- as.matrix(dt[, c(3, 5, 7)])
    # paste the matrices element-by-element, and convert to data.table
    as.data.table(matrix(paste(m1, m2, sep = "-"), nrow = nrow(m1), dimnames = dimnames(m1)))
  },
  akrun_df = {
    data.frame(lapply(
      split.default(dt[, -1, with = FALSE], sub("\\d+$", "", names(dt)[-1])),
      function(x) do.call(paste, c(x, sep = "-"))
    ))
  },
  akrun_map = {
    # No `by` here, so .SD still contains the ids column and the pairs
    # start at column 2.
    i1 <- seq(2, length(dt), 2)
    i2 <- seq(3, length(dt), 2)
    dt[, Map(paste, .SD[, i1, with = FALSE], .SD[, i2, with = FALSE], MoreArgs = list(sep = "-"))]
  },
  akrun_dcast = {
    dcast(
      melt(dt, id.vars = "ids")[, paste(value, collapse = "-"), .(grp = sub("\\d+", "", variable), ids)],
      ids ~ grp,
      value.var = "V1"
    )
  },
  times = 10
)
# Unit: microseconds
# expr min lq mean median uq max neval
# wimpel 303.072 315.122 341.2417 319.1895 327.775 531.429 10
# akrun_df 1022.790 1028.515 1251.7812 1069.1850 1172.519 2779.460 10
# akrun_map 742.013 751.051 785.6059 778.1650 799.855 884.812 10
# akrun_dcast 4104.719 4175.215 4414.6596 4348.7430 4650.911 4939.221 10