创建组 ID 序列但省略特定行
Create group id sequence but omit specific rows
我有这个数据框:
# Example data: a 39-row data frame carrying the tibble classes, with a single
# character column `value`. Rows 4, 6 and 11 hold "-"; every other row is "+".
df <- structure(
  list(
    value = c(
      rep("+", 3), "-", "+", "-",
      rep("+", 4), "-",
      rep("+", 28)
    )
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -39L)
)
value
<chr>
1 +
2 +
3 +
4 -
5 +
6 -
7 +
8 +
9 +
10 +
# ... with 29 more rows
我想为每 4 行添加一个分组 ID,例如:
# Naive grouping: every consecutive run of 4 rows shares one id, regardless of
# what `value` contains.
mutate(df, id = rep(seq_len(n()), each = 4L, length.out = n()))
value id
1 + 1
2 + 1
3 + 1
4 - 1
5 + 2
6 - 2
7 + 2
8 + 2
9 + 3
10 + 3
... and so on
但我希望在分组计数时忽略带有 `-` 的行,例如:
df_expected
value id expected_id
1 + 1 1
2 + 1 1
3 + 1 1
4 - 1 1
5 + 2 1
6 - 2 2
7 + 2 2
8 + 2 2
9 + 3 2
10 + 3 2
11 - 3 3
12 + 3 3
13 + 4 3
14 + 4 3
15 + 4 3
16 + 4 4
17 + 5 4
18 + 5 4
19 + 5 4
20 + 5 5
21 + 6 5
22 + 6 5
23 + 6 5
24 + 6 6
25 + 7 6
26 + 7 6
27 + 7 6
28 + 7 7
29 + 8 7
30 + 8 7
31 + 8 7
32 + 8 8
33 + 9 8
34 + 9 8
35 + 9 8
36 + 9 9
37 + 10 9
38 + 10 9
39 + 10 9
一个选项是先取出 'value' 不是 `-` 的行,使用与 OP 帖子中相同的代码创建列 'id',然后用 `fill` 填充其余行的 NA:
library(data.table)
library(dplyr)
library(tidyr)
# Convert `df` to a data.table by reference, then number only the non-"-" rows
# in runs of 4; the "-" rows are left as NA.
setDT(df)
df[value != "-", id := rep(seq_len(.N), each = 4L, length.out = .N)]
# "updown": each NA takes the id of the next labelled row, or of the previous
# one when no labelled row follows.
fill(df, id, .direction = "updown")
输出:
value id
<char> <int>
1: + 1
2: + 1
3: + 1
4: - 1
5: + 1
6: - 2
7: + 2
8: + 2
9: + 2
10: + 2
11: - 3
12: + 3
13: + 3
14: + 3
15: + 3
16: + 4
17: + 4
18: + 4
19: + 4
20: + 5
21: + 5
22: + 5
23: + 5
24: + 6
25: + 6
26: + 6
27: + 6
28: + 7
29: + 7
30: + 7
31: + 7
32: + 8
33: + 8
34: + 8
35: + 8
36: + 9
37: + 9
38: + 9
39: + 9
或者只使用 dplyr 和 tidyr 中的 `fill`:不是在整个序列上创建 `rep`,而是取 `row_number()` 中 'value' 不等于 '-' 的子集,并使用 `replace` 仅把 `rep` 的输出赋给这些元素。
# Pure dplyr/tidyr variant. Only the non-"-" rows receive an id: they are
# numbered consecutively in runs of 4 (1, 1, 1, 1, 2, ...), while the "-" rows
# start out as NA. Wrapping the subset in seq_along() matters: repeating the
# raw row numbers row_number()[value != "-"] would reuse the original row
# positions as labels and give ids with gaps (1, 2, 3, 5, 7, ...) instead of
# 1, 2, 3, 4, 5, ...
df %>%
  mutate(id = replace(rep(NA_integer_, n()), value != "-",
                      rep(seq_along(row_number()[value != "-"]), each = 4,
                          length.out = sum(value != "-")))) %>%
  # "updown": each NA takes the id of the next labelled row, or of the
  # previous one when no labelled row follows.
  fill(id, .direction = "updown")
这是另一种有趣的方法:
# data.table-only variant.
# Step 1 labels the non-"-" rows in runs of 4. Using seq_len(.N) together with
# length.out = .N also works when the number of such rows is not a multiple of
# 4; rep(1:(.N/4), each = 4) would then have the wrong length and := would fail.
# Step 2 fills the NA left on each "-" row from the next labelled row ("nocb")
# and, for any trailing "-" rows, from the previous one ("locf"). Deriving the
# value from the row position (ceiling(.I/4)) ignores the neighbouring groups
# and is only correct by coincidence for this particular data.
setDT(df)[value != "-", id := rep(seq_len(.N), each = 4L, length.out = .N)][
  , id := nafill(nafill(id, type = "nocb"), type = "locf")]
输出:
value id
1: + 1
2: + 1
3: + 1
4: - 1
5: + 1
6: - 2
7: + 2
8: + 2
9: + 2
10: + 2
11: - 3
12: + 3
13: + 3
14: + 3
15: + 3
16: + 4
17: + 4
18: + 4
19: + 4
20: + 5
21: + 5
22: + 5
23: + 5
24: + 6
25: + 6
26: + 6
27: + 6
28: + 7
29: + 7
30: + 7
31: + 7
32: + 8
33: + 8
34: + 8
35: + 8
36: + 9
37: + 9
38: + 9
39: + 9
使用 `MESS::cumsumbinning` 的简单单行写法:
# One-liner with MESS::cumsumbinning(): each "+" counts as 1 and each "-" as 0.
# With threshold 3 and cutwhenpassed = TRUE the bin is closed on the row where
# the running count passes 3, i.e. on the 4th "+", so every bin holds four "+"
# rows plus any "-" rows met on the way.
# Note the `::` namespace operator (a single `:` is the sequence operator) and
# the explicit `df$value`, since no data-masking verb is in play here.
df$id <- MESS::cumsumbinning(df$value == "+", 3, cutwhenpassed = TRUE)
# Compare with the hand-written target column shown in the question.
all.equal(df_expected$expected_id, df$id)
# [1] TRUE
我有这个数据框:
# Example data: a 39-row data frame carrying the tibble classes, with a single
# character column `value`. Rows 4, 6 and 11 hold "-"; every other row is "+".
df <- structure(
  list(
    value = c(
      rep("+", 3), "-", "+", "-",
      rep("+", 4), "-",
      rep("+", 28)
    )
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -39L)
)
value
<chr>
1 +
2 +
3 +
4 -
5 +
6 -
7 +
8 +
9 +
10 +
# ... with 29 more rows
我想为每 4 行添加一个分组 ID,例如:
# Naive grouping: every consecutive run of 4 rows shares one id, regardless of
# what `value` contains.
mutate(df, id = rep(seq_len(n()), each = 4L, length.out = n()))
value id
1 + 1
2 + 1
3 + 1
4 - 1
5 + 2
6 - 2
7 + 2
8 + 2
9 + 3
10 + 3
... and so on
但我希望在分组计数时忽略带有 `-` 的行,例如:
df_expected
value id expected_id
1 + 1 1
2 + 1 1
3 + 1 1
4 - 1 1
5 + 2 1
6 - 2 2
7 + 2 2
8 + 2 2
9 + 3 2
10 + 3 2
11 - 3 3
12 + 3 3
13 + 4 3
14 + 4 3
15 + 4 3
16 + 4 4
17 + 5 4
18 + 5 4
19 + 5 4
20 + 5 5
21 + 6 5
22 + 6 5
23 + 6 5
24 + 6 6
25 + 7 6
26 + 7 6
27 + 7 6
28 + 7 7
29 + 8 7
30 + 8 7
31 + 8 7
32 + 8 8
33 + 9 8
34 + 9 8
35 + 9 8
36 + 9 9
37 + 10 9
38 + 10 9
39 + 10 9
一个选项是先取出 'value' 不是 `-` 的行,使用与 OP 帖子中相同的代码创建列 'id',然后用 `fill` 填充其余行的 NA:
library(data.table)
library(dplyr)
library(tidyr)
# Convert `df` to a data.table by reference, then number only the non-"-" rows
# in runs of 4; the "-" rows are left as NA.
setDT(df)
df[value != "-", id := rep(seq_len(.N), each = 4L, length.out = .N)]
# "updown": each NA takes the id of the next labelled row, or of the previous
# one when no labelled row follows.
fill(df, id, .direction = "updown")
输出:
value id
<char> <int>
1: + 1
2: + 1
3: + 1
4: - 1
5: + 1
6: - 2
7: + 2
8: + 2
9: + 2
10: + 2
11: - 3
12: + 3
13: + 3
14: + 3
15: + 3
16: + 4
17: + 4
18: + 4
19: + 4
20: + 5
21: + 5
22: + 5
23: + 5
24: + 6
25: + 6
26: + 6
27: + 6
28: + 7
29: + 7
30: + 7
31: + 7
32: + 8
33: + 8
34: + 8
35: + 8
36: + 9
37: + 9
38: + 9
39: + 9
或者只使用 dplyr 和 tidyr 中的 `fill`:不是在整个序列上创建 `rep`,而是取 `row_number()` 中 'value' 不等于 '-' 的子集,并使用 `replace` 仅把 `rep` 的输出赋给这些元素。
# Pure dplyr/tidyr variant. Only the non-"-" rows receive an id: they are
# numbered consecutively in runs of 4 (1, 1, 1, 1, 2, ...), while the "-" rows
# start out as NA. Wrapping the subset in seq_along() matters: repeating the
# raw row numbers row_number()[value != "-"] would reuse the original row
# positions as labels and give ids with gaps (1, 2, 3, 5, 7, ...) instead of
# 1, 2, 3, 4, 5, ...
df %>%
  mutate(id = replace(rep(NA_integer_, n()), value != "-",
                      rep(seq_along(row_number()[value != "-"]), each = 4,
                          length.out = sum(value != "-")))) %>%
  # "updown": each NA takes the id of the next labelled row, or of the
  # previous one when no labelled row follows.
  fill(id, .direction = "updown")
这是另一种有趣的方法:
# data.table-only variant.
# Step 1 labels the non-"-" rows in runs of 4. Using seq_len(.N) together with
# length.out = .N also works when the number of such rows is not a multiple of
# 4; rep(1:(.N/4), each = 4) would then have the wrong length and := would fail.
# Step 2 fills the NA left on each "-" row from the next labelled row ("nocb")
# and, for any trailing "-" rows, from the previous one ("locf"). Deriving the
# value from the row position (ceiling(.I/4)) ignores the neighbouring groups
# and is only correct by coincidence for this particular data.
setDT(df)[value != "-", id := rep(seq_len(.N), each = 4L, length.out = .N)][
  , id := nafill(nafill(id, type = "nocb"), type = "locf")]
输出:
value id
1: + 1
2: + 1
3: + 1
4: - 1
5: + 1
6: - 2
7: + 2
8: + 2
9: + 2
10: + 2
11: - 3
12: + 3
13: + 3
14: + 3
15: + 3
16: + 4
17: + 4
18: + 4
19: + 4
20: + 5
21: + 5
22: + 5
23: + 5
24: + 6
25: + 6
26: + 6
27: + 6
28: + 7
29: + 7
30: + 7
31: + 7
32: + 8
33: + 8
34: + 8
35: + 8
36: + 9
37: + 9
38: + 9
39: + 9
使用 `MESS::cumsumbinning` 的简单单行写法:
# One-liner with MESS::cumsumbinning(): each "+" counts as 1 and each "-" as 0.
# With threshold 3 and cutwhenpassed = TRUE the bin is closed on the row where
# the running count passes 3, i.e. on the 4th "+", so every bin holds four "+"
# rows plus any "-" rows met on the way.
# Note the `::` namespace operator (a single `:` is the sequence operator) and
# the explicit `df$value`, since no data-masking verb is in play here.
df$id <- MESS::cumsumbinning(df$value == "+", 3, cutwhenpassed = TRUE)
# Compare with the hand-written target column shown in the question.
all.equal(df_expected$expected_id, df$id)
# [1] TRUE