创建组 ID 序列但省略特定行

Create group id sequence but omit specific rows

我有这个数据框:

# Sample data: a one-column tibble-classed data frame with 39 rows. Every
# value is "+" except rows 4, 6 and 11, which are "-".
df <- structure(
  list(value = replace(rep("+", 39L), c(4L, 6L, 11L), "-")),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -39L)
)

   value
   <chr>
 1 +    
 2 +    
 3 +    
 4 -    
 5 +    
 6 -    
 7 +    
 8 +    
 9 +    
10 +    
# ... with 29 more rows

我想为每 4 行添加一个分组 ID,例如:

# Label the rows in consecutive blocks of four: 1, 1, 1, 1, 2, 2, 2, 2, ...
# length.out trims the repeated sequence back to the number of rows.
df %>%
  mutate(id = rep(seq_len(n()), each = 4, length.out = n()))

 value id
1      +  1
2      +  1
3      +  1
4      -  1
5      +  2
6      -  2
7      +  2
8      +  2
9      +  3
10     +  3
... and so on

但我希望在分组计数时跳过带有 - 的行(即每组包含 4 个 +,带 - 的行不计入这 4 行),例如:

df_expected
   value id expected_id
1      +  1           1
2      +  1           1
3      +  1           1
4      -  1           1
5      +  2           1
6      -  2           2
7      +  2           2
8      +  2           2
9      +  3           2
10     +  3           2
11     -  3           3
12     +  3           3
13     +  4           3
14     +  4           3
15     +  4           3
16     +  4           4
17     +  5           4
18     +  5           4
19     +  5           4
20     +  5           5
21     +  6           5
22     +  6           5
23     +  6           5
24     +  6           6
25     +  7           6
26     +  7           6
27     +  7           6
28     +  7           7
29     +  8           7
30     +  8           7
31     +  8           7
32     +  8           8
33     +  9           8
34     +  9           8
35     +  9           8
36     +  9           9
37     + 10           9
38     + 10           9
39     + 10           9

一种做法是:先取出 'value' 不为 - 的行,用与提问者帖子中相同的代码为这些行创建 'id' 列,然后用 fill 填补其余行的 NA

library(data.table)
library(dplyr)
library(tidyr)
# Convert df to a data.table by reference, then number only the non-"-" rows
# in blocks of four; the "-" rows are left with id = NA.
setDT(df)
df[value != "-", id := rep(seq_len(.N), each = 4, length.out = .N)]
# Display the table with the NA ids filled in from the neighbouring rows.
fill(df, id, .direction = "updown")

输出:

    value    id
    <char> <int>
 1:      +     1
 2:      +     1
 3:      +     1
 4:      -     1
 5:      +     1
 6:      -     2
 7:      +     2
 8:      +     2
 9:      +     2
10:      +     2
11:      -     3
12:      +     3
13:      +     3
14:      +     3
15:      +     3
16:      +     4
17:      +     4
18:      +     4
19:      +     4
20:      +     5
21:      +     5
22:      +     5
23:      +     5
24:      +     6
25:      +     6
26:      +     6
27:      +     6
28:      +     7
29:      +     7
30:      +     7
31:      +     7
32:      +     8
33:      +     8
34:      +     8
35:      +     8
36:      +     9
37:      +     9
38:      +     9
39:      +     9

或者使用 dplyr,并结合 tidyr 中的 fill:不在整个序列上创建 rep,而是只为 'value' 不等于 '-' 的那些行生成编号,并使用 replace 仅把 rep 的输出赋给这些元素
# Give every non-"-" row a consecutive block id (four such rows per block),
# leave the "-" rows as NA, then let fill() copy a neighbouring id into them.
df %>%
  mutate(
    id = replace(
      rep(NA_integer_, n()),
      value != "-",
      # Number the kept rows 1..k before repeating. Repeating their original
      # row positions instead would skip an id wherever a "-" row was left
      # out (1, 2, 3, 5, 7, ...), which does not match the expected 1..9.
      rep(
        seq_len(sum(value != "-")),
        each = 4,
        length.out = sum(value != "-")
      )
    )
  ) %>%
  # "updown" fills upwards first, so a "-" row takes the id of the row
  # that follows it.
  fill(id, .direction = "updown")

这是另一种有趣的方法:

# Number the non-"-" rows in blocks of four. seq_len(.N) with length.out = .N
# always yields exactly .N ids, so the assignment also works when the number
# of non-"-" rows is not a multiple of 4 (the last block is just shorter).
setDT(df)
df[
  value != "-",
  id := rep(seq_len(.N), each = 4L, length.out = .N)
]
# Rows skipped above still have id = NA; give them a position-based id.
# as.integer() keeps both fifelse() branches the same type as `id`.
# NOTE(review): ceiling(.I / 4) reproduces the expected ids for this sample
# data, but it depends on where the "-" rows sit -- confirm before reusing.
df[, id := fifelse(is.na(id), as.integer(ceiling(.I / 4)), id)]

输出:

    value id
 1:     +  1
 2:     +  1
 3:     +  1
 4:     -  1
 5:     +  1
 6:     -  2
 7:     +  2
 8:     +  2
 9:     +  2
10:     +  2
11:     -  3
12:     +  3
13:     +  3
14:     +  3
15:     +  3
16:     +  4
17:     +  4
18:     +  4
19:     +  4
20:     +  5
21:     +  5
22:     +  5
23:     +  5
24:     +  6
25:     +  6
26:     +  6
27:     +  6
28:     +  7
29:     +  7
30:     +  7
31:     +  7
32:     +  8
33:     +  8
34:     +  8
35:     +  8
36:     +  9
37:     +  9
38:     +  9
39:     +  9

使用 MESS::cumsumbinning 的一行简洁写法:

# Bin the rows by the running count of "+" values. With threshold 3 and
# cutwhenpassed = TRUE a bin is closed on the row where the count first
# exceeds 3, i.e. on its fourth "+"; "-" rows add nothing to the count.
# `MESS::` (double colon) calls the function without attaching the package.
df$id <- MESS::cumsumbinning(df$value == "+", 3, cutwhenpassed = TRUE)

# Compare against the expected ids from the question.
all.equal(df_expected$expected_id, df$id)
# [1] TRUE