重新启动的累积总和,R 中的优化
Cumulative sum with restart, optimization in R
我有 34 个栅格(nrow:17735,ncol:11328,ncell:200902080),值为 0 和 1,每个 4Mb。我想要这些值的累加和归零。
我尝试了几种替代方案,基于:
library(microbenchmark)
library(compiler)
library(dplyr)
library(data.table)
x <- c(0,0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0)
fun = function(x)
{ cs = cumsum(x)
cs - cummax((x == 0) * cs)
}
funC <- cmpfun(fun)
microbenchmark(
funcioEx = fun(x),
funComEx = funC(x),
lapplyEx = unname(unlist(lapply(split(x,cumsum(c(0,diff(x) != 0))), cumsum))),
dataTaEx = data.table(x)[, whatiwant := cumsum(x), by = rleid(x==0L)],
reduceEx = Reduce(function(x, y) if (y == 0) 0 else x+y, x, accumulate=TRUE)
)
我想针对我的数据优化此过程,因为使用第二个选项(funComEx,最快)大约需要 3 小时。
Rcpp
可能有点帮助
library(Rcpp)
cppFunction(
"IntegerVector foo(NumericVector vect){
int N = vect.size();
IntegerVector ans(N);
ans[0] = vect[0];
for (int i = 1; i < N; i++){
if(vect[i] > 0){
ans[i] = ans[i-1] + vect[i];
} else {
ans[i] = 0;
}
}
return(ans);
}"
)
set.seed(42)
x = sample(0:1, 1e4, TRUE)
identical(foo(x), fun(x))
#[1] TRUE
microbenchmark(
funcioEx = fun(x),
funComEx = funC(x),
lapplyEx = unname(unlist(lapply(split(x,cumsum(c(0,diff(x) != 0))), cumsum))),
dataTaEx = data.table(x)[, whatiwant := cumsum(x), by = rleid(x==0L)],
reduceEx = Reduce(function(x, y) if (y == 0) 0 else x+y, x, accumulate=TRUE),
foo_RCPP = foo(x)
)
#Unit: microseconds
# expr min lq mean median uq max neval
# funcioEx 98.238 104.2495 118.81500 113.1935 121.1110 280.637 100
# funComEx 97.358 103.2230 113.81515 112.1670 118.1785 220.522 100
# lapplyEx 17810.638 18888.9055 20130.20765 19399.7415 20641.0550 28073.981 100
# dataTaEx 3435.387 3832.0025 4468.77932 4023.6395 4347.3840 17053.181 100
# reduceEx 7472.515 8174.4020 9614.23122 8634.7985 10177.1305 15719.788 100
# foo_RCPP 52.491 62.6085 80.65777 66.5670 72.4320 1102.315 100
我有 34 个栅格(nrow:17735,ncol:11328,ncell:200902080),值为 0 和 1,每个 4Mb。我想要这些值的累加和归零。
我尝试了几种替代方案,基于:
library(microbenchmark)
library(compiler)
library(dplyr)
library(data.table)
x <- c(0,0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0)
fun = function(x)
{ cs = cumsum(x)
cs - cummax((x == 0) * cs)
}
funC <- cmpfun(fun)
microbenchmark(
funcioEx = fun(x),
funComEx = funC(x),
lapplyEx = unname(unlist(lapply(split(x,cumsum(c(0,diff(x) != 0))), cumsum))),
dataTaEx = data.table(x)[, whatiwant := cumsum(x), by = rleid(x==0L)],
reduceEx = Reduce(function(x, y) if (y == 0) 0 else x+y, x, accumulate=TRUE)
)
我想针对我的数据优化此过程,因为使用第二个选项(funComEx,最快)大约需要 3 小时。
Rcpp
可能有点帮助
library(Rcpp)
cppFunction(
"IntegerVector foo(NumericVector vect){
int N = vect.size();
IntegerVector ans(N);
ans[0] = vect[0];
for (int i = 1; i < N; i++){
if(vect[i] > 0){
ans[i] = ans[i-1] + vect[i];
} else {
ans[i] = 0;
}
}
return(ans);
}"
)
set.seed(42)
x = sample(0:1, 1e4, TRUE)
identical(foo(x), fun(x))
#[1] TRUE
microbenchmark(
funcioEx = fun(x),
funComEx = funC(x),
lapplyEx = unname(unlist(lapply(split(x,cumsum(c(0,diff(x) != 0))), cumsum))),
dataTaEx = data.table(x)[, whatiwant := cumsum(x), by = rleid(x==0L)],
reduceEx = Reduce(function(x, y) if (y == 0) 0 else x+y, x, accumulate=TRUE),
foo_RCPP = foo(x)
)
#Unit: microseconds
# expr min lq mean median uq max neval
# funcioEx 98.238 104.2495 118.81500 113.1935 121.1110 280.637 100
# funComEx 97.358 103.2230 113.81515 112.1670 118.1785 220.522 100
# lapplyEx 17810.638 18888.9055 20130.20765 19399.7415 20641.0550 28073.981 100
# dataTaEx 3435.387 3832.0025 4468.77932 4023.6395 4347.3840 17053.181 100
# reduceEx 7472.515 8174.4020 9614.23122 8634.7985 10177.1305 15719.788 100
# foo_RCPP 52.491 62.6085 80.65777 66.5670 72.4320 1102.315 100