对组中的行求和,从出现特定值时开始

Sum rows in a group, starting when a specific value occurs

我想对某一列的值做累加,一直加到该组结束,但要等到另一列中出现特定值时才开始累加。我只关心该特定值在组内的第一次出现;因此,如果该值在组内再次出现,累加列应当继续累加,而不是重新开始。我知道这个问题听起来有些奇怪,希望下面的示例表格能说明清楚。

下面的数据框是我现在的:

> df = data.frame(group = c(1,1,1,1,2,2,2,2,2,3,3,3,4,4,4),numToAdd = c(1,1,3,2,4,2,1,3,2,1,2,1,2,3,2),occurs = c(0,0,1,0,0,1,0,0,0,0,1,1,0,0,0))

> df
   group numToAdd occurs
1      1        1      0
2      1        1      0
3      1        3      1
4      1        2      0
5      2        4      0
6      2        2      1
7      2        1      0
8      2        3      0
9      2        2      0
10     3        1      0
11     3        2      1
12     3        1      1
13     4        2      0
14     4        3      0
15     4        2      0

因此,每当组中出现 1 时,我想要 numToAdd 列中值的累加和,直到新组开始。这看起来像下面这样:

> finalDF = data.frame(group = c(1,1,1,1,2,2,2,2,2,3,3,3,4,4,4),numToAdd =    c(1,1,3,2,4,2,1,3,2,1,2,1,2,3,2),occurs = c(0,0,1,0,0,1,0,0,0,0,1,1,0,0,0),added = c(0,0,3,5,0,2,3,6,8,0,2,3,0,0,0))

> finalDF
   group numToAdd occurs added
1      1        1      0     0
2      1        1      0     0
3      1        3      1     3
4      1        2      0     5
5      2        4      0     0
6      2        2      1     2
7      2        1      0     3
8      2        3      0     6
9      2        2      0     8
10     3        1      0     0
11     3        2      1     2
12     3        1      1     3
13     4        2      0     0
14     4        3      0     0
15     4        2      0     0

因此,added 列在组内出现 1 之前一直为 0,之后累加 numToAdd 的值,直到进入新的组时 added 列重新变回 0。在第 3 组中,值 1 第二次出现,但累计和仍继续累加。此外,在第 4 组中从未出现值 1,因此 added 列的值始终为 0。

我尝试过 dplyr,但没能得到想要的结果。下面的写法只输出总和,而不是逐行递增的累计值:

library(dplyr)
# Attempt from the question; it does not produce the per-row running total.
# mutate() runs before group_by(), so cumsum(numToAdd) is taken over the whole
# data frame rather than per group, and ifelse() keeps that value only on rows
# where occurs == 1 (every other row gets 0).
df = 
  df  %>%
  mutate(added=ifelse(occurs == 1,cumsum(numToAdd),0)) %>%
  group_by(group) 

尝试

 # Running total of numToAdd within each group, counted only from the first
 # row where occurs is 1. With 0/1 values, cummax(occurs) is 0 until the first
 # 1 and stays 1 afterwards, so earlier rows contribute 0 and a repeated 1 does
 # not restart the sum.
 df %>%
   group_by(group) %>%
   mutate(added = cumsum(numToAdd * cummax(occurs))) %>%
   # Drop the grouping so later verbs are not silently applied per group.
   ungroup()
 #      group numToAdd occurs added
 # 1      1        1      0     0
 # 2      1        1      0     0
 # 3      1        3      1     3
 # 4      1        2      0     5
 # 5      2        4      0     0
 # 6      2        2      1     2
 # 7      2        1      0     3
 # 8      2        3      0     6
 # 9      2        2      0     8
 # 10     3        1      0     0
 # 11     3        2      1     2
 # 12     3        1      1     3
 # 13     4        2      0     0
 # 14     4        3      0     0
 # 15     4        2      0     0

或使用data.table

 library(data.table)#v1.9.5+
 # Convert df to a data.table by reference, then collect the global row numbers
 # (.I) of the rows to accumulate in each group: rows in the first run of
 # `occurs` qualify only when that run is positive; rows in any later run
 # always qualify.
 setDT(df)
 started_rows <- df[, .I[rleid(occurs) + (occurs > 0) > 1], by = group]$V1
 # Default to 0 everywhere, then fill the running total on the selected rows.
 df[, added := 0]
 df[started_rows, added := cumsum(numToAdd), by = group]

或者使用与 dplyr 方案类似的写法:
 # Convert df to a data.table by reference, then add `added` in place.
 # With 0/1 values, cummax(occurs) switches to 1 at the first 1 in each group,
 # so the cumulative sum only picks up numToAdd from that row onward.
 setDT(df)
 df[, added := cumsum(numToAdd * cummax(occurs)), by = group]

您可以在 base R 中使用类似下面的 split-apply-combine 方法:

# Per-group running total of numToAdd, starting at the first row where the
# cumulative sum of occurs becomes positive; earlier rows stay 0.
# unsplit() writes each group's result back to that group's original row
# positions, so `added` lines up with df even when rows are not sorted by group
# (unlist() would return the values in group order instead).
df$added <- unsplit(lapply(split(df, df$group), function(x) {
  added <- numeric(nrow(x))
  started <- cumsum(x$occurs) > 0
  added[started] <- cumsum(x$numToAdd[started])
  added
}), df$group)
df
#    group numToAdd occurs added
# 1      1        1      0     0
# 2      1        1      0     0
# 3      1        3      1     3
# 4      1        2      0     5
# 5      2        4      0     0
# 6      2        2      1     2
# 7      2        1      0     3
# 8      2        3      0     6
# 9      2        2      0     8
# 10     3        1      0     0
# 11     3        2      1     2
# 12     3        1      1     3
# 13     4        2      0     0
# 14     4        3      0     0
# 15     4        2      0     0

再补充一种 base R 方法:

# For each group, concatenate the leading rows where nothing has occurred yet
# (their `occurs` values, i.e. zeros for 0/1 data) with the running total of
# numToAdd over the remaining rows.
# `[[` always yields the column as a vector, whereas x[, 'occurs'] returns a
# one-column table when df is a tibble or data.table.
# unsplit() puts each group's values back at that group's original row
# positions, so the result stays aligned with df when rows are not sorted by
# group.
df$added <- unsplit(lapply(split(df, df$group), function(x) {
  started <- cumsum(x[["occurs"]]) != 0L
  c(x[["occurs"]][!started], cumsum(x[["numToAdd"]][started]))
}), df$group)
#    group numToAdd occurs added
# 1      1        1      0     0
# 2      1        1      0     0
# 3      1        3      1     3
# 4      1        2      0     5
# 5      2        4      0     0
# 6      2        2      1     2
# 7      2        1      0     3
# 8      2        3      0     6
# 9      2        2      0     8
# 10     3        1      0     0
# 11     3        2      1     2
# 12     3        1      1     3
# 13     4        2      0     0
# 14     4        3      0     0
# 15     4        2      0     0

另一种 base R 方法:

# Per group: (cumsum(occurs) > 0) is FALSE before the first occurrence and TRUE
# from it onward; multiplying by numToAdd zeroes the earlier rows, and the
# outer cumsum() then gives the running total.
# unsplit() returns each group's values to that group's original row
# positions, so `added` matches df row-for-row even if groups are interleaved.
df$added <- unsplit(lapply(split(df, df$group), function(x) {
  cumsum((cumsum(x$occurs) > 0) * x$numToAdd)
}), df$group)