在 data.table 中计算滚动累积和
Rolling cumsum in data.table
我想在 data.table 中按组计算(反向)移动窗口内的累积和。例如,对于下面的数据,我希望得到“roll_cumsum”列中的那些值:
# Example data: `a` runs from 1 to 10, `group` splits the rows into two blocks
# of five, and `roll_cumsum` holds the expected per-group reverse cumulative
# sum that the answers below try to reproduce.
dt <- data.table(
  a = seq(1, 10, 1),
  group = rep(1:2, each = 5),
  roll_cumsum = c(15, 14, 12, 9, 5, 40, 34, 27, 19, 10)
)
我用下面的代码得到了我想要的结果,但是对于大型数据集来说它很慢:
# Reverse (right-to-left) cumulative sum of `x`.
#
# Element i of the result equals sum(x[i:length(x)]): the grand total minus
# the running total up to i, plus x[i] so that position i itself is included.
#
# x: numeric vector (one group's values when used in a grouped data.table
#    `j` expression).
# Returns a vector of the same length as `x`; a zero-length `x` yields a
# zero-length result.
partial_sum <- function(x) {
  # Compute the running total once and reuse it. The earlier version called
  # cumsum(x) twice and re-indexed both vectors with seq_along(x), which
  # selects every element and therefore changes nothing.
  running <- cumsum(x)
  running[length(x)] - running + x
}
dt[, partial_sum(a), by = group]
有什么可以加快计算速度的建议吗?非常感谢!
spatstat.utils 包中有一个 revcumsum 函数:
library(spatstat.utils)
dt[, roll_cumsum2 := revcumsum(a), group]
-输出
dt
# a group roll_cumsum roll_cumsum2
# 1: 1 1 15 15
# 2: 2 1 14 14
# 3: 3 1 12 12
# 4: 4 1 9 9
# 5: 5 1 5 5
# 6: 6 2 40 40
# 7: 7 2 34 34
# 8: 8 2 27 27
# 9: 9 2 19 19
#10: 10 2 10 10
或者直接用 rev 做反转(reverse):
dt[, roll_cumsum2 := rev(cumsum(rev(a))), group]
-输出
dt
# a group roll_cumsum roll_cumsum2
# 1: 1 1 15 15
# 2: 2 1 14 14
# 3: 3 1 12 12
# 4: 4 1 9 9
# 5: 5 1 5 5
# 6: 6 2 40 40
# 7: 7 2 34 34
# 8: 8 2 27 27
# 9: 9 2 19 19
#10: 10 2 10 10
或者另一种方式是
dt[, roll_cumsum2 := cumsum(a[.N:1])[.N:1], group]
注意:这两种写法都很简洁
基准测试
# Benchmark data: 1e7 rows, with group ids 1..1e6 recycled to that length
# (10 rows per group). `length.out` alone determines the length; the extra
# positional `10` used previously matched `times`, which rep() ignores when
# `length.out` is supplied, so it is dropped here.
dt1 <- data.table(a = 1:1e7, group = rep(1:1e6, length.out = 1e7))
# The timings in the comments are the figures recorded with the original post.
# 1) partial_sum() from the question.
system.time(dt1[, roll_cumsum := partial_sum(a), by = group])
#user system elapsed
# 2.073 0.037 2.094
# 2) revcumsum() from spatstat.utils.
system.time(dt1[, roll_cumsum2 := revcumsum(a), by = group])
#user system elapsed
# 2.623 0.029 2.637
# 3) Reverse, cumulate, reverse back using rev().
system.time(dt1[, roll_cumsum3 := rev(cumsum(rev(a))), by = group])
#user system elapsed
# 4.275 0.051 4.276
# 4) Same idea as 3) but reversing by indexing with .N:1.
system.time(dt1[, roll_cumsum4 := cumsum(a[.N:1])[.N:1], by = group])
#user system elapsed
# 1.703 0.028 1.722
# 5) Group total minus the cumulative sum of the lagged values.
system.time(dt1[, roll_cumsum5 := sum(a) - cumsum(shift(a, fill = 0)), by = group])
# user system elapsed
# 10.095 0.041 10.129
可以用每组的 sum(a) 减去 a 滞后一位(首位补 0)后的累计和:
library(data.table)
dt[, roll_cumsum1 := sum(a) - cumsum(shift(a, fill = 0)), group]
dt
# a group roll_cumsum roll_cumsum1
# 1: 1 1 15 15
# 2: 2 1 14 14
# 3: 3 1 12 12
# 4: 4 1 9 9
# 5: 5 1 5 5
# 6: 6 2 40 40
# 7: 7 2 34 34
# 8: 8 2 27 27
# 9: 9 2 19 19
#10: 10 2 10 10
我想在 data.table 中按组计算(反向)移动窗口内的累积和。例如,对于下面的数据,我希望得到“roll_cumsum”列中的那些值:
# Example data: `a` runs from 1 to 10, `group` splits the rows into two blocks
# of five, and `roll_cumsum` holds the expected per-group reverse cumulative
# sum that the answers below try to reproduce.
dt <- data.table(
  a = seq(1, 10, 1),
  group = rep(1:2, each = 5),
  roll_cumsum = c(15, 14, 12, 9, 5, 40, 34, 27, 19, 10)
)
我用下面的代码得到了我想要的结果,但是对于大型数据集来说它很慢:
# Reverse (right-to-left) cumulative sum of `x`.
#
# Element i of the result equals sum(x[i:length(x)]): the grand total minus
# the running total up to i, plus x[i] so that position i itself is included.
#
# x: numeric vector (one group's values when used in a grouped data.table
#    `j` expression).
# Returns a vector of the same length as `x`; a zero-length `x` yields a
# zero-length result.
partial_sum <- function(x) {
  # Compute the running total once and reuse it. The earlier version called
  # cumsum(x) twice and re-indexed both vectors with seq_along(x), which
  # selects every element and therefore changes nothing.
  running <- cumsum(x)
  running[length(x)] - running + x
}
dt[, partial_sum(a), by = group]
有什么可以加快计算速度的建议吗?非常感谢!
spatstat.utils 包中有一个 revcumsum 函数:
library(spatstat.utils)
dt[, roll_cumsum2 := revcumsum(a), group]
-输出
dt
# a group roll_cumsum roll_cumsum2
# 1: 1 1 15 15
# 2: 2 1 14 14
# 3: 3 1 12 12
# 4: 4 1 9 9
# 5: 5 1 5 5
# 6: 6 2 40 40
# 7: 7 2 34 34
# 8: 8 2 27 27
# 9: 9 2 19 19
#10: 10 2 10 10
或者直接用 rev 做反转(reverse):
dt[, roll_cumsum2 := rev(cumsum(rev(a))), group]
-输出
dt
# a group roll_cumsum roll_cumsum2
# 1: 1 1 15 15
# 2: 2 1 14 14
# 3: 3 1 12 12
# 4: 4 1 9 9
# 5: 5 1 5 5
# 6: 6 2 40 40
# 7: 7 2 34 34
# 8: 8 2 27 27
# 9: 9 2 19 19
#10: 10 2 10 10
或者另一种方式是
dt[, roll_cumsum2 := cumsum(a[.N:1])[.N:1], group]
注意:这两种写法都很简洁
基准测试
# Benchmark data: 1e7 rows, with group ids 1..1e6 recycled to that length
# (10 rows per group). `length.out` alone determines the length; the extra
# positional `10` used previously matched `times`, which rep() ignores when
# `length.out` is supplied, so it is dropped here.
dt1 <- data.table(a = 1:1e7, group = rep(1:1e6, length.out = 1e7))
# The timings in the comments are the figures recorded with the original post.
# 1) partial_sum() from the question.
system.time(dt1[, roll_cumsum := partial_sum(a), by = group])
#user system elapsed
# 2.073 0.037 2.094
# 2) revcumsum() from spatstat.utils.
system.time(dt1[, roll_cumsum2 := revcumsum(a), by = group])
#user system elapsed
# 2.623 0.029 2.637
# 3) Reverse, cumulate, reverse back using rev().
system.time(dt1[, roll_cumsum3 := rev(cumsum(rev(a))), by = group])
#user system elapsed
# 4.275 0.051 4.276
# 4) Same idea as 3) but reversing by indexing with .N:1.
system.time(dt1[, roll_cumsum4 := cumsum(a[.N:1])[.N:1], by = group])
#user system elapsed
# 1.703 0.028 1.722
# 5) Group total minus the cumulative sum of the lagged values.
system.time(dt1[, roll_cumsum5 := sum(a) - cumsum(shift(a, fill = 0)), by = group])
# user system elapsed
# 10.095 0.041 10.129
可以用每组的 sum(a) 减去 a 滞后一位(首位补 0)后的累计和:
library(data.table)
dt[, roll_cumsum1 := sum(a) - cumsum(shift(a, fill = 0)), group]
dt
# a group roll_cumsum roll_cumsum1
# 1: 1 1 15 15
# 2: 2 1 14 14
# 3: 3 1 12 12
# 4: 4 1 9 9
# 5: 5 1 5 5
# 6: 6 2 40 40
# 7: 7 2 34 34
# 8: 8 2 27 27
# 9: 9 2 19 19
#10: 10 2 10 10