在 R / Rcpp 中转置列表的最快方法
Fastest way to transpose a list in R / Rcpp
我有一个列表:
ls <- list(c("a", "b", "c"), c("1", "2", "3"), c("foo", "bar", "baz"))
ls
#> [[1]]
#> [1] "a" "b" "c"
#> [[2]]
#> [1] "1" "2" "3"
#> [[3]]
#> [1] "foo" "bar" "baz"
我希望"transpose"给予:
resulting_ls
#> [[1]]
#> [1] "a" "1" "foo"
#> [[2]]
#> [1] "b" "2" "bar"
#> [[3]]
#> [1] "c" "3" "baz"
我可以通过以下方式实现:
mat <- matrix(unlist(ls), ncol = 3, byrow = TRUE)
resulting_ls <- lapply(1:ncol(mat), function(i) mat[, i])
但是我的真实数据非常慢...(我需要对许多列表执行此操作,每个列表都比上面的示例大得多)
我的问题:
对于大型列表,执行此操作的最快方法是什么 length(ls)
and/or length(ls[[i]])
?
- in
R
(如果情况并非如此)
- 和
Rcpp
在 data.table
包中,有一个 transpose()
函数可以做到这一点。它在 C
中实现以提高速度。
require(data.table) # v1.9.6+
transpose(ls)
# [[1]]
# [1] "a" "1" "foo"
# [[2]]
# [1] "b" "2" "bar"
# [[3]]
# [1] "c" "3" "baz"
如果列表元素的长度不相等,它还会自动填充 NA
,并且还会自动强制转换为最高的 SEXPTYPE。如有必要,您可以为 fill
参数提供不同的值。检查 ?transpose
.
"list"s 是没有 C 等效项的 R 对象,因此在 C 中操作它们只会在周围计算方面获得效率,因为实际的转置需要在 R 对象之间来回。
Arun 的 transpose
是解决此问题的简洁方法,而且似乎再好不过了。我将提供一些其他的替代方案,只是为了表明转置 "list" 可能会很古怪,也许采用不同的方法来实现最终目标可能会更好。
map = function(x) .mapply(c, x, NULL)
lap = function(x) lapply(seq_along(x[[1]]), function(i) unlist(lapply(x, "[[", i)))
library(data.table)
DT = function(x) transpose(x)
# very simple C loop that proves that `data.table::transpose` is as good as it gets
loopC = inline::cfunction(sig = c(R_ls = "list"), body = '
SEXPTYPE tp = 0;
SEXP ans, tmp;
PROTECT(ans = allocVector(VECSXP, LENGTH(VECTOR_ELT(R_ls, 0))));
for(int i = 0; i < LENGTH(R_ls); i++) {
tmp = VECTOR_ELT(R_ls, i);
if(TYPEOF(tmp) > tp) tp = TYPEOF(tmp);
}
for(int i = 0; i < LENGTH(ans); i++) SET_VECTOR_ELT(ans, i, allocVector(tp, LENGTH(R_ls)));
switch(tp) {
case LGLSXP:
case INTSXP: {
for(int i = 0; i < LENGTH(R_ls); i++) {
PROTECT(tmp = coerceVector(VECTOR_ELT(R_ls, i), tp));
int *ptmp = INTEGER(tmp);
for(int j = 0; j < LENGTH(ans); j++) INTEGER(VECTOR_ELT(ans, j))[i] = ptmp[j];
UNPROTECT(1);
}
break;
}
case REALSXP: {
for(int i = 0; i < LENGTH(R_ls); i++) {
PROTECT(tmp = coerceVector(VECTOR_ELT(R_ls, i), tp));
double *ptmp = REAL(tmp);
for(int j = 0; j < LENGTH(ans); j++) REAL(VECTOR_ELT(ans, j))[i] = ptmp[j];
UNPROTECT(1);
}
break;
}
case STRSXP: {
for(int i = 0; i < LENGTH(R_ls); i++) {
PROTECT(tmp = coerceVector(VECTOR_ELT(R_ls, i), tp));
for(int j = 0; j < LENGTH(ans); j++) SET_STRING_ELT(VECTOR_ELT(ans, j), i, STRING_ELT(tmp, j));
UNPROTECT(1);
}
break;
}
}
UNPROTECT(1);
return(ans);
')
spl = function(x) split(unlist(x), rep(seq_along(x[[1]]), length(x)))
map(ls)
#[[1]]
#[1] "a" "1" "foo"
#
#[[2]]
#[1] "b" "2" "bar"
#
#[[3]]
#[1] "c" "3" "baz"
#
lap(ls)
#[[1]]
#[1] "a" "1" "foo"
#
#[[2]]
#[1] "b" "2" "bar"
#
#[[3]]
#[1] "c" "3" "baz"
#
DT(ls)
#[[1]]
#[1] "a" "1" "foo"
#
#[[2]]
#[1] "b" "2" "bar"
#
#[[3]]
#[1] "c" "3" "baz"
#
loopC(ls)
#[[1]]
#[1] "a" "1" "foo"
#
#[[2]]
#[1] "b" "2" "bar"
#
#[[3]]
#[1] "c" "3" "baz"
#
spl(ls)
#$`1`
#[1] "a" "1" "foo"
#
#$`2`
#[1] "b" "2" "bar"
#
#$`3`
#[1] "c" "3" "baz"
还有一个基准:
myls1 = rep_len(list(sample(1e3), runif(1e3), sample(letters, 1e3, T)), 1e3) #1e3 x 1e3
myls2 = rep_len(list(sample(1e5), runif(1e5), sample(letters, 1e5, T)), 1e1) #10 x 1e5
myls3 = rep_len(list(sample(1e1), runif(1e1), sample(letters, 1e1, T)), 1e5) #1e5 x 10
identical(map(myls1), lap(myls1))
#[1] TRUE
identical(map(myls1), DT(myls1))
#[1] TRUE
identical(map(myls1), loopC(myls1))
#[1] TRUE
identical(map(myls1), unname(spl(myls1)))
#[1] TRUE
microbenchmark::microbenchmark(map(myls1), lap(myls1), DT(myls1), loopC(myls1), spl(myls1),
map(myls2), lap(myls2), DT(myls2), loopC(myls2), spl(myls2),
map(myls3), lap(myls3), DT(myls3), loopC(myls3), spl(myls3),
times = 10)
#Unit: milliseconds
# expr min lq median uq max neval
# map(myls1) 1141.9477 1187.8107 1281.4314 1331.4490 1961.8452 10
# lap(myls1) 1082.7023 1104.6467 1182.8303 1219.5397 1695.6164 10
# DT(myls1) 378.0574 399.7339 433.4307 459.0293 495.2200 10
# loopC(myls1) 390.0305 392.5139 405.6461 480.7480 638.9145 10
# spl(myls1) 676.2639 756.1798 786.8639 821.7699 869.0219 10
# map(myls2) 1241.1010 1304.2250 1386.1915 1439.5182 1546.3835 10
# lap(myls2) 1823.2029 1922.1878 1965.6653 2006.6102 2161.9819 10
# DT(myls2) 471.5797 521.7380 554.2221 578.3043 887.1452 10
# loopC(myls2) 472.5713 494.9302 524.2538 591.0493 657.6087 10
# spl(myls2) 1108.1530 1117.7448 1212.0051 1297.8838 1336.8266 10
# map(myls3) 2005.1325 2178.3739 2214.1824 2451.7050 2539.5152 10
# lap(myls3) 1172.3033 1215.1297 1242.0294 1292.7345 1434.1707 10
# DT(myls3) 388.6679 393.5446 416.5494 479.1473 721.0758 10
# loopC(myls3) 389.4098 396.6768 404.9609 432.4390 451.8912 10
# spl(myls3) 675.7749 704.3328 767.0548 817.7189 937.1469 10
还有collapse::t_list
。在对@alexis_laz的数据进行计时时,似乎只比data.table::transpose
:
慢了一点点
microbenchmark::microbenchmark(transpose(myls1), t_list(myls1),
transpose(myls2), t_list(myls2),
transpose(myls3), t_list(myls3), times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
transpose(myls1) 436.6397 441.3445 451.1324 448.5867 458.2563 475.3357 10
t_list(myls1) 450.9826 455.6178 463.0933 457.0395 472.9860 487.6187 10
transpose(myls2) 519.2504 522.1071 541.8283 538.9593 551.2684 587.8375 10
t_list(myls2) 570.2164 593.7727 615.3212 609.3971 637.4703 667.4535 10
transpose(myls3) 424.6462 426.9211 448.7534 436.2498 458.3884 511.8618 10
t_list(myls3) 458.9503 463.6034 497.4220 496.6448 512.9335 575.0476 10
这里有两个基本的 R 选项
f_asplit <- function(x) asplit(do.call(rbind, x), 2)
f_split <- function(x) split(unlist(x), sequence(lengths(x)))
当运行
f_DT <- function(x) transpose(x)
f_asplit <- function(x) asplit(do.call(rbind, x), 2)
f_split <- function(x) split(unlist(x), sequence(lengths(x)))
microbenchmark(
f_DT(myls1),
f_asplit(myls1),
f_split(myls1),
f_DT(myls2),
f_asplit(myls2),
f_split(myls2),
f_DT(myls3),
f_asplit(myls3),
f_split(myls3),
times = 10
)
你会看到
Unit: milliseconds
expr min lq mean median uq max neval
f_DT(myls1) 484.3706 496.1017 527.5104 504.3836 538.3450 623.4342 10
f_asplit(myls1) 502.1010 525.0664 541.2588 545.2604 553.6868 576.6521 10
f_split(myls1) 506.6380 526.0220 563.3163 544.7467 567.0422 685.8991 10
f_DT(myls2) 605.1234 632.6032 679.1095 647.2061 690.4973 873.7956 10
f_asplit(myls2) 827.3669 844.2319 870.9969 874.4586 889.0711 936.3875 10
f_split(myls2) 839.3327 854.0782 883.0190 875.9137 906.0532 957.9787 10
f_DT(myls3) 446.2920 455.3569 482.9587 467.3739 491.2713 593.4374 10
f_asplit(myls3) 506.3268 523.0180 577.8359 573.9360 618.5669 662.0888 10
f_split(myls3) 478.5380 496.5128 519.4187 507.3265 546.4502 592.4853 10
我有一个列表:
ls <- list(c("a", "b", "c"), c("1", "2", "3"), c("foo", "bar", "baz"))
ls
#> [[1]]
#> [1] "a" "b" "c"
#> [[2]]
#> [1] "1" "2" "3"
#> [[3]]
#> [1] "foo" "bar" "baz"
我希望"transpose"给予:
resulting_ls
#> [[1]]
#> [1] "a" "1" "foo"
#> [[2]]
#> [1] "b" "2" "bar"
#> [[3]]
#> [1] "c" "3" "baz"
我可以通过以下方式实现:
mat <- matrix(unlist(ls), ncol = 3, byrow = TRUE)
resulting_ls <- lapply(1:ncol(mat), function(i) mat[, i])
但是我的真实数据非常慢...(我需要对许多列表执行此操作,每个列表都比上面的示例大得多)
我的问题:
对于大型列表,执行此操作的最快方法是什么 length(ls)
and/or length(ls[[i]])
?
- in
R
(如果情况并非如此) - 和
Rcpp
在 data.table
包中,有一个 transpose()
函数可以做到这一点。它在 C
中实现以提高速度。
require(data.table) # v1.9.6+
transpose(ls)
# [[1]]
# [1] "a" "1" "foo"
# [[2]]
# [1] "b" "2" "bar"
# [[3]]
# [1] "c" "3" "baz"
如果列表元素的长度不相等,它还会自动填充 NA
,并且还会自动强制转换为最高的 SEXPTYPE。如有必要,您可以为 fill
参数提供不同的值。检查 ?transpose
.
"list"s 是没有 C 等效项的 R 对象,因此在 C 中操作它们只会在周围计算方面获得效率,因为实际的转置需要在 R 对象之间来回。
Arun 的 transpose
是解决此问题的简洁方法,而且似乎再好不过了。我将提供一些其他的替代方案,只是为了表明转置 "list" 可能会很古怪,也许采用不同的方法来实现最终目标可能会更好。
map = function(x) .mapply(c, x, NULL)
lap = function(x) lapply(seq_along(x[[1]]), function(i) unlist(lapply(x, "[[", i)))
library(data.table)
DT = function(x) transpose(x)
# very simple C loop that proves that `data.table::transpose` is as good as it gets
loopC = inline::cfunction(sig = c(R_ls = "list"), body = '
SEXPTYPE tp = 0;
SEXP ans, tmp;
PROTECT(ans = allocVector(VECSXP, LENGTH(VECTOR_ELT(R_ls, 0))));
for(int i = 0; i < LENGTH(R_ls); i++) {
tmp = VECTOR_ELT(R_ls, i);
if(TYPEOF(tmp) > tp) tp = TYPEOF(tmp);
}
for(int i = 0; i < LENGTH(ans); i++) SET_VECTOR_ELT(ans, i, allocVector(tp, LENGTH(R_ls)));
switch(tp) {
case LGLSXP:
case INTSXP: {
for(int i = 0; i < LENGTH(R_ls); i++) {
PROTECT(tmp = coerceVector(VECTOR_ELT(R_ls, i), tp));
int *ptmp = INTEGER(tmp);
for(int j = 0; j < LENGTH(ans); j++) INTEGER(VECTOR_ELT(ans, j))[i] = ptmp[j];
UNPROTECT(1);
}
break;
}
case REALSXP: {
for(int i = 0; i < LENGTH(R_ls); i++) {
PROTECT(tmp = coerceVector(VECTOR_ELT(R_ls, i), tp));
double *ptmp = REAL(tmp);
for(int j = 0; j < LENGTH(ans); j++) REAL(VECTOR_ELT(ans, j))[i] = ptmp[j];
UNPROTECT(1);
}
break;
}
case STRSXP: {
for(int i = 0; i < LENGTH(R_ls); i++) {
PROTECT(tmp = coerceVector(VECTOR_ELT(R_ls, i), tp));
for(int j = 0; j < LENGTH(ans); j++) SET_STRING_ELT(VECTOR_ELT(ans, j), i, STRING_ELT(tmp, j));
UNPROTECT(1);
}
break;
}
}
UNPROTECT(1);
return(ans);
')
spl = function(x) split(unlist(x), rep(seq_along(x[[1]]), length(x)))
map(ls)
#[[1]]
#[1] "a" "1" "foo"
#
#[[2]]
#[1] "b" "2" "bar"
#
#[[3]]
#[1] "c" "3" "baz"
#
lap(ls)
#[[1]]
#[1] "a" "1" "foo"
#
#[[2]]
#[1] "b" "2" "bar"
#
#[[3]]
#[1] "c" "3" "baz"
#
DT(ls)
#[[1]]
#[1] "a" "1" "foo"
#
#[[2]]
#[1] "b" "2" "bar"
#
#[[3]]
#[1] "c" "3" "baz"
#
loopC(ls)
#[[1]]
#[1] "a" "1" "foo"
#
#[[2]]
#[1] "b" "2" "bar"
#
#[[3]]
#[1] "c" "3" "baz"
#
spl(ls)
#$`1`
#[1] "a" "1" "foo"
#
#$`2`
#[1] "b" "2" "bar"
#
#$`3`
#[1] "c" "3" "baz"
还有一个基准:
myls1 = rep_len(list(sample(1e3), runif(1e3), sample(letters, 1e3, T)), 1e3) #1e3 x 1e3
myls2 = rep_len(list(sample(1e5), runif(1e5), sample(letters, 1e5, T)), 1e1) #10 x 1e5
myls3 = rep_len(list(sample(1e1), runif(1e1), sample(letters, 1e1, T)), 1e5) #1e5 x 10
identical(map(myls1), lap(myls1))
#[1] TRUE
identical(map(myls1), DT(myls1))
#[1] TRUE
identical(map(myls1), loopC(myls1))
#[1] TRUE
identical(map(myls1), unname(spl(myls1)))
#[1] TRUE
microbenchmark::microbenchmark(map(myls1), lap(myls1), DT(myls1), loopC(myls1), spl(myls1),
map(myls2), lap(myls2), DT(myls2), loopC(myls2), spl(myls2),
map(myls3), lap(myls3), DT(myls3), loopC(myls3), spl(myls3),
times = 10)
#Unit: milliseconds
# expr min lq median uq max neval
# map(myls1) 1141.9477 1187.8107 1281.4314 1331.4490 1961.8452 10
# lap(myls1) 1082.7023 1104.6467 1182.8303 1219.5397 1695.6164 10
# DT(myls1) 378.0574 399.7339 433.4307 459.0293 495.2200 10
# loopC(myls1) 390.0305 392.5139 405.6461 480.7480 638.9145 10
# spl(myls1) 676.2639 756.1798 786.8639 821.7699 869.0219 10
# map(myls2) 1241.1010 1304.2250 1386.1915 1439.5182 1546.3835 10
# lap(myls2) 1823.2029 1922.1878 1965.6653 2006.6102 2161.9819 10
# DT(myls2) 471.5797 521.7380 554.2221 578.3043 887.1452 10
# loopC(myls2) 472.5713 494.9302 524.2538 591.0493 657.6087 10
# spl(myls2) 1108.1530 1117.7448 1212.0051 1297.8838 1336.8266 10
# map(myls3) 2005.1325 2178.3739 2214.1824 2451.7050 2539.5152 10
# lap(myls3) 1172.3033 1215.1297 1242.0294 1292.7345 1434.1707 10
# DT(myls3) 388.6679 393.5446 416.5494 479.1473 721.0758 10
# loopC(myls3) 389.4098 396.6768 404.9609 432.4390 451.8912 10
# spl(myls3) 675.7749 704.3328 767.0548 817.7189 937.1469 10
还有collapse::t_list
。在对@alexis_laz的数据进行计时时,似乎只比data.table::transpose
:
microbenchmark::microbenchmark(transpose(myls1), t_list(myls1),
transpose(myls2), t_list(myls2),
transpose(myls3), t_list(myls3), times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
transpose(myls1) 436.6397 441.3445 451.1324 448.5867 458.2563 475.3357 10
t_list(myls1) 450.9826 455.6178 463.0933 457.0395 472.9860 487.6187 10
transpose(myls2) 519.2504 522.1071 541.8283 538.9593 551.2684 587.8375 10
t_list(myls2) 570.2164 593.7727 615.3212 609.3971 637.4703 667.4535 10
transpose(myls3) 424.6462 426.9211 448.7534 436.2498 458.3884 511.8618 10
t_list(myls3) 458.9503 463.6034 497.4220 496.6448 512.9335 575.0476 10
这里有两个基本的 R 选项
f_asplit <- function(x) asplit(do.call(rbind, x), 2)
f_split <- function(x) split(unlist(x), sequence(lengths(x)))
当运行
f_DT <- function(x) transpose(x)
f_asplit <- function(x) asplit(do.call(rbind, x), 2)
f_split <- function(x) split(unlist(x), sequence(lengths(x)))
microbenchmark(
f_DT(myls1),
f_asplit(myls1),
f_split(myls1),
f_DT(myls2),
f_asplit(myls2),
f_split(myls2),
f_DT(myls3),
f_asplit(myls3),
f_split(myls3),
times = 10
)
你会看到
Unit: milliseconds
expr min lq mean median uq max neval
f_DT(myls1) 484.3706 496.1017 527.5104 504.3836 538.3450 623.4342 10
f_asplit(myls1) 502.1010 525.0664 541.2588 545.2604 553.6868 576.6521 10
f_split(myls1) 506.6380 526.0220 563.3163 544.7467 567.0422 685.8991 10
f_DT(myls2) 605.1234 632.6032 679.1095 647.2061 690.4973 873.7956 10
f_asplit(myls2) 827.3669 844.2319 870.9969 874.4586 889.0711 936.3875 10
f_split(myls2) 839.3327 854.0782 883.0190 875.9137 906.0532 957.9787 10
f_DT(myls3) 446.2920 455.3569 482.9587 467.3739 491.2713 593.4374 10
f_asplit(myls3) 506.3268 523.0180 577.8359 573.9360 618.5669 662.0888 10
f_split(myls3) 478.5380 496.5128 519.4187 507.3265 546.4502 592.4853 10