具有条件和重置的 R 累积和
R Cumulative Sum with a condition and a reset
我有一个由 -1s 和 1s 组成的信号位置指示符向量。此外,我有体积数据,我想根据 Signal 的值求和。基本数据table是这样的:
df <- cbind(Signal, Volume)
head(df, 20)
Signal Volume
2016-01-04 NA 37912403
2016-01-05 -1 23258238
2016-01-06 -1 25096183
2016-01-07 -1 45172906
2016-01-08 -1 35402298
2016-01-11 -1 29932385
2016-01-12 -1 28395390
2016-01-13 -1 33410553
2016-01-14 -1 48658623
2016-01-15 1 46132781
2016-01-19 1 30998256
2016-01-20 -1 59051429
2016-01-21 1 30518939
2016-01-22 1 30495387
2016-01-25 1 32482015
2016-01-26 -1 26877080
2016-01-27 -1 58699359
2016-01-28 1 107475327
2016-01-29 1 62739548
2016-02-01 1 46132726
我想要实现的是(不使用 for 循环)生成一个 cum Volume 向量,每次信号改变时都会重置它。此外,音量值应乘以信号值,即当信号为 -1 时,应将 -Volume 添加到当前 cum Volume。
基于 SO 上的类似问题,我尝试了
ave(df$a, cumsum(c(F, diff(sign(diff(df$a))) != 0)*df$Volume), FUN=seq_along)
产生正确的信号分组,但由于某种原因不包括音量。没有重置,解决方案相当简单(发布在 SO 上)
require(data.table)
DT <- data.table(dt)
DT[, Cum.Sum := cumsum(Volume), by=Signal]
有谁知道 dplyr 或 data.table 种用于重置和调节 cum sum 的解决方案?谢谢
这可以通过以下方式实现:
library(tidyverse)
library(data.table)
z %>%
group_by(rleid(Signal)) %>% #advance value every time Signal changes and group by that
mutate(cum = Signal*cumsum(Volume)) %>% #cumsum in each group
ungroup() %>% #ungroup so you could remove the grouping column
select(-4) #remove grouping column
或不使用 data.table
使用 rle
:
z %>%
mutate(rl = rep(1:length(rle(Signal)$length), times = rle(Signal)$length)) %>%
group_by(rl) %>%
mutate(cum = Signal*cumsum(Volume)) %>%
ungroup() %>%
select(-4)
#output
date Signal Volume cum
<fct> <int> <int> <int>
1 2016-01-04 NA 37912403 NA
2 2016-01-05 - 1 23258238 - 23258238
3 2016-01-06 - 1 25096183 - 48354421
4 2016-01-07 - 1 45172906 - 93527327
5 2016-01-08 - 1 35402298 -128929625
6 2016-01-11 - 1 29932385 -158862010
7 2016-01-12 - 1 28395390 -187257400
8 2016-01-13 - 1 33410553 -220667953
9 2016-01-14 - 1 48658623 -269326576
10 2016-01-15 1 46132781 46132781
11 2016-01-19 1 30998256 77131037
12 2016-01-20 - 1 59051429 - 59051429
13 2016-01-21 1 30518939 30518939
14 2016-01-22 1 30495387 61014326
15 2016-01-25 1 32482015 93496341
16 2016-01-26 - 1 26877080 - 26877080
17 2016-01-27 - 1 58699359 - 85576439
18 2016-01-28 1 107475327 107475327
19 2016-01-29 1 62739548 170214875
20 2016-02-01 1 46132726 216347601
数据:
z <- read.table(text = "date Signal Volume
2016-01-04 NA 37912403
2016-01-05 -1 23258238
2016-01-06 -1 25096183
2016-01-07 -1 45172906
2016-01-08 -1 35402298
2016-01-11 -1 29932385
2016-01-12 -1 28395390
2016-01-13 -1 33410553
2016-01-14 -1 48658623
2016-01-15 1 46132781
2016-01-19 1 30998256
2016-01-20 -1 59051429
2016-01-21 1 30518939
2016-01-22 1 30495387
2016-01-25 1 32482015
2016-01-26 -1 26877080
2016-01-27 -1 58699359
2016-01-28 1 107475327
2016-01-29 1 62739548
2016-02-01 1 46132726", header = T)
一个纯粹的dplyr
方式是:
df %>%
na.omit() %>% # omit NA to not multiply by NA
mutate(isStep = (Signal - lag(Signal, 1)) != 0) %>% # Create a dummy variable for steps
mutate(isStep = ifelse(is.na(isStep), FALSE, isStep)) %>%
mutate(grp = cumsum(isStep)) %>% # create new ID based on steps
group_by(grp) %>% # group by before created steps
mutate(res = cumsum(Signal * Volume)) %>% # calculate value
select(x, Signal, Volume, res)
# # A tibble: 19 x 5
# # Groups: grp [6]
# grp x Signal Volume res
# <int> <fctr> <int> <int> <int>
# 1 0 2016-01-05 -1 23258238 -23258238
# 2 0 2016-01-06 -1 25096183 -48354421
# 3 0 2016-01-07 -1 45172906 -93527327
# 4 0 2016-01-08 -1 35402298 -128929625
# 5 0 2016-01-11 -1 29932385 -158862010
# 6 0 2016-01-12 -1 28395390 -187257400
# 7 0 2016-01-13 -1 33410553 -220667953
# 8 0 2016-01-14 -1 48658623 -269326576
# 9 1 2016-01-15 1 46132781 46132781
# 10 1 2016-01-19 1 30998256 77131037
# 11 2 2016-01-20 -1 59051429 -59051429
# 12 3 2016-01-21 1 30518939 30518939
# 13 3 2016-01-22 1 30495387 61014326
# 14 3 2016-01-25 1 32482015 93496341
# 15 4 2016-01-26 -1 26877080 -26877080
# 16 4 2016-01-27 -1 58699359 -85576439
# 17 5 2016-01-28 1 107475327 107475327
# 18 5 2016-01-29 1 62739548 170214875
# 19 5 2016-02-01 1 46132726 216347601
根据@docendo 的建议,这应该有效:
df[,cum := cumsum(Volume)*Signal,.(rleid(Signal))]
date Signal Volume cum
1: 2016-01-04 NA 37912403 NA
2: 2016-01-05 -1 23258238 -23258238
3: 2016-01-06 -1 25096183 -48354421
4: 2016-01-07 -1 45172906 -93527327
5: 2016-01-08 -1 35402298 -128929625
6: 2016-01-11 -1 29932385 -158862010
7: 2016-01-12 -1 28395390 -187257400
8: 2016-01-13 -1 33410553 -220667953
9: 2016-01-14 -1 48658623 -269326576
10: 2016-01-15 1 46132781 46132781
11: 2016-01-19 1 30998256 77131037
我有一个由 -1s 和 1s 组成的信号位置指示符向量。此外,我有体积数据,我想根据 Signal 的值求和。基本数据table是这样的:
df <- cbind(Signal, Volume)
head(df, 20)
Signal Volume
2016-01-04 NA 37912403
2016-01-05 -1 23258238
2016-01-06 -1 25096183
2016-01-07 -1 45172906
2016-01-08 -1 35402298
2016-01-11 -1 29932385
2016-01-12 -1 28395390
2016-01-13 -1 33410553
2016-01-14 -1 48658623
2016-01-15 1 46132781
2016-01-19 1 30998256
2016-01-20 -1 59051429
2016-01-21 1 30518939
2016-01-22 1 30495387
2016-01-25 1 32482015
2016-01-26 -1 26877080
2016-01-27 -1 58699359
2016-01-28 1 107475327
2016-01-29 1 62739548
2016-02-01 1 46132726
我想要实现的是(不使用 for 循环)生成一个 cum Volume 向量,每次信号改变时都会重置它。此外,音量值应乘以信号值,即当信号为 -1 时,应将 -Volume 添加到当前 cum Volume。 基于 SO 上的类似问题,我尝试了
ave(df$a, cumsum(c(F, diff(sign(diff(df$a))) != 0)*df$Volume), FUN=seq_along)
产生正确的信号分组,但由于某种原因不包括音量。没有重置,解决方案相当简单(发布在 SO 上)
require(data.table)
DT <- data.table(dt)
DT[, Cum.Sum := cumsum(Volume), by=Signal]
有谁知道 dplyr 或 data.table 种用于重置和调节 cum sum 的解决方案?谢谢
这可以通过以下方式实现:
library(tidyverse)
library(data.table)
z %>%
group_by(rleid(Signal)) %>% #advance value every time Signal changes and group by that
mutate(cum = Signal*cumsum(Volume)) %>% #cumsum in each group
ungroup() %>% #ungroup so you could remove the grouping column
select(-4) #remove grouping column
或不使用 data.table
使用 rle
:
z %>%
mutate(rl = rep(1:length(rle(Signal)$length), times = rle(Signal)$length)) %>%
group_by(rl) %>%
mutate(cum = Signal*cumsum(Volume)) %>%
ungroup() %>%
select(-4)
#output
date Signal Volume cum
<fct> <int> <int> <int>
1 2016-01-04 NA 37912403 NA
2 2016-01-05 - 1 23258238 - 23258238
3 2016-01-06 - 1 25096183 - 48354421
4 2016-01-07 - 1 45172906 - 93527327
5 2016-01-08 - 1 35402298 -128929625
6 2016-01-11 - 1 29932385 -158862010
7 2016-01-12 - 1 28395390 -187257400
8 2016-01-13 - 1 33410553 -220667953
9 2016-01-14 - 1 48658623 -269326576
10 2016-01-15 1 46132781 46132781
11 2016-01-19 1 30998256 77131037
12 2016-01-20 - 1 59051429 - 59051429
13 2016-01-21 1 30518939 30518939
14 2016-01-22 1 30495387 61014326
15 2016-01-25 1 32482015 93496341
16 2016-01-26 - 1 26877080 - 26877080
17 2016-01-27 - 1 58699359 - 85576439
18 2016-01-28 1 107475327 107475327
19 2016-01-29 1 62739548 170214875
20 2016-02-01 1 46132726 216347601
数据:
z <- read.table(text = "date Signal Volume
2016-01-04 NA 37912403
2016-01-05 -1 23258238
2016-01-06 -1 25096183
2016-01-07 -1 45172906
2016-01-08 -1 35402298
2016-01-11 -1 29932385
2016-01-12 -1 28395390
2016-01-13 -1 33410553
2016-01-14 -1 48658623
2016-01-15 1 46132781
2016-01-19 1 30998256
2016-01-20 -1 59051429
2016-01-21 1 30518939
2016-01-22 1 30495387
2016-01-25 1 32482015
2016-01-26 -1 26877080
2016-01-27 -1 58699359
2016-01-28 1 107475327
2016-01-29 1 62739548
2016-02-01 1 46132726", header = T)
一个纯粹的dplyr
方式是:
df %>%
na.omit() %>% # omit NA to not multiply by NA
mutate(isStep = (Signal - lag(Signal, 1)) != 0) %>% # Create a dummy variable for steps
mutate(isStep = ifelse(is.na(isStep), FALSE, isStep)) %>%
mutate(grp = cumsum(isStep)) %>% # create new ID based on steps
group_by(grp) %>% # group by before created steps
mutate(res = cumsum(Signal * Volume)) %>% # calculate value
select(x, Signal, Volume, res)
# # A tibble: 19 x 5
# # Groups: grp [6]
# grp x Signal Volume res
# <int> <fctr> <int> <int> <int>
# 1 0 2016-01-05 -1 23258238 -23258238
# 2 0 2016-01-06 -1 25096183 -48354421
# 3 0 2016-01-07 -1 45172906 -93527327
# 4 0 2016-01-08 -1 35402298 -128929625
# 5 0 2016-01-11 -1 29932385 -158862010
# 6 0 2016-01-12 -1 28395390 -187257400
# 7 0 2016-01-13 -1 33410553 -220667953
# 8 0 2016-01-14 -1 48658623 -269326576
# 9 1 2016-01-15 1 46132781 46132781
# 10 1 2016-01-19 1 30998256 77131037
# 11 2 2016-01-20 -1 59051429 -59051429
# 12 3 2016-01-21 1 30518939 30518939
# 13 3 2016-01-22 1 30495387 61014326
# 14 3 2016-01-25 1 32482015 93496341
# 15 4 2016-01-26 -1 26877080 -26877080
# 16 4 2016-01-27 -1 58699359 -85576439
# 17 5 2016-01-28 1 107475327 107475327
# 18 5 2016-01-29 1 62739548 170214875
# 19 5 2016-02-01 1 46132726 216347601
根据@docendo 的建议,这应该有效:
df[,cum := cumsum(Volume)*Signal,.(rleid(Signal))]
date Signal Volume cum
1: 2016-01-04 NA 37912403 NA
2: 2016-01-05 -1 23258238 -23258238
3: 2016-01-06 -1 25096183 -48354421
4: 2016-01-07 -1 45172906 -93527327
5: 2016-01-08 -1 35402298 -128929625
6: 2016-01-11 -1 29932385 -158862010
7: 2016-01-12 -1 28395390 -187257400
8: 2016-01-13 -1 33410553 -220667953
9: 2016-01-14 -1 48658623 -269326576
10: 2016-01-15 1 46132781 46132781
11: 2016-01-19 1 30998256 77131037