使用连续计数更新分组变量
Update a grouping variable with continuous counts
我有一个分组变量 Seq
需要更新:
df
Line Seq Q
21 649 9 q_misc
22 650 9 <NA>
23 651 9 <NA>
24 670 10 q_misc
25 682 0 q_misc
26 898 0 q_misc
27 899 0 <NA>
28 900 0 <NA>
29 901 0 <NA>
30 1009 15 q_misc
31 1010 15 <NA>
32 1011 15 <NA>
33 1283 22 q_misc
34 1467 0 q_misc
35 1468 0 <NA>
36 1649 0 q_misc
37 1650 0 <NA>
38 1651 0 <NA>
39 1652 0 <NA>
40 1653 0 <NA>
我需要 Seq
从 1
开始,并在 Q
列中有新的 q_misc
标记时分配下一个连续数字。
期望的输出:
df
Line Seq Q
21 649 1 q_misc
22 650 1 <NA>
23 651 1 <NA>
24 670 2 q_misc
25 682 3 q_misc
26 898 4 q_misc
27 899 4 <NA>
28 900 4 <NA>
29 901 4 <NA>
30 1009 5 q_misc
31 1010 5 <NA>
32 1011 5 <NA>
33 1283 6 q_misc
34 1467 7 q_misc
35 1468 7 <NA>
36 1649 8 q_misc
37 1650 8 <NA>
38 1651 8 <NA>
39 1652 8 <NA>
40 1653 8 <NA>
如能指点如何做到这一点,我将不胜感激。
可重现数据:
df <- structure(list(Line = c(649L, 650L, 651L, 670L, 682L, 898L, 899L,
900L, 901L, 1009L, 1010L, 1011L, 1283L, 1467L, 1468L, 1649L,
1650L, 1651L, 1652L, 1653L), Seq = c(9L, 9L, 9L, 10L, 0L, 0L,
0L, 0L, 0L, 15L, 15L, 15L, 22L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
Q = c("q_misc", NA, NA, "q_misc", "q_misc", "q_misc", NA,
NA, NA, "q_misc", NA, NA, "q_misc", "q_misc", NA, "q_misc",
NA, NA, NA, NA)), row.names = 21:40, class = "data.frame")
您可以在将 NA
替换为空值(或 'q_misc'
以外的任何值)后使用 cumsum
。
transform(df, Seq = cumsum(replace(Q, is.na(Q), '') == 'q_misc'))
# Line Seq Q
#21 649 1 q_misc
#22 650 1 <NA>
#23 651 1 <NA>
#24 670 2 q_misc
#25 682 3 q_misc
#26 898 4 q_misc
#27 899 4 <NA>
#28 900 4 <NA>
#29 901 4 <NA>
#30 1009 5 q_misc
#31 1010 5 <NA>
#32 1011 5 <NA>
#33 1283 6 q_misc
#34 1467 7 q_misc
#35 1468 7 <NA>
#36 1649 8 q_misc
#37 1650 8 <NA>
#38 1651 8 <NA>
#39 1652 8 <NA>
#40 1653 8 <NA>
在dplyr
library(dplyr)
df %>% mutate(Seq = cumsum(replace(Q, is.na(Q), '') == 'q_misc'))
如果 Q
列只有 NA
和 'q_misc'
值,您也可以使用。
df %>% mutate(Seq = cumsum(!is.na(Q)))
使用%in%
library(dplyr)
library(tidyr)
df %>%
mutate(Seq = cumsum(Q %in% 'q_misc'))
-输出
Line Seq Q
21 649 1 q_misc
22 650 1 <NA>
23 651 1 <NA>
24 670 2 q_misc
25 682 3 q_misc
26 898 4 q_misc
27 899 4 <NA>
28 900 4 <NA>
29 901 4 <NA>
30 1009 5 q_misc
31 1010 5 <NA>
32 1011 5 <NA>
33 1283 6 q_misc
34 1467 7 q_misc
35 1468 7 <NA>
36 1649 8 q_misc
37 1650 8 <NA>
38 1651 8 <NA>
39 1652 8 <NA>
40 1653 8 <NA>
我有一个分组变量 Seq
需要更新:
df
Line Seq Q
21 649 9 q_misc
22 650 9 <NA>
23 651 9 <NA>
24 670 10 q_misc
25 682 0 q_misc
26 898 0 q_misc
27 899 0 <NA>
28 900 0 <NA>
29 901 0 <NA>
30 1009 15 q_misc
31 1010 15 <NA>
32 1011 15 <NA>
33 1283 22 q_misc
34 1467 0 q_misc
35 1468 0 <NA>
36 1649 0 q_misc
37 1650 0 <NA>
38 1651 0 <NA>
39 1652 0 <NA>
40 1653 0 <NA>
我需要 Seq
从 1
开始,并在 Q
列中有新的 q_misc
标记时分配下一个连续数字。
期望的输出:
df
Line Seq Q
21 649 1 q_misc
22 650 1 <NA>
23 651 1 <NA>
24 670 2 q_misc
25 682 3 q_misc
26 898 4 q_misc
27 899 4 <NA>
28 900 4 <NA>
29 901 4 <NA>
30 1009 5 q_misc
31 1010 5 <NA>
32 1011 5 <NA>
33 1283 6 q_misc
34 1467 7 q_misc
35 1468 7 <NA>
36 1649 8 q_misc
37 1650 8 <NA>
38 1651 8 <NA>
39 1652 8 <NA>
40 1653 8 <NA>
如能指点如何做到这一点,我将不胜感激。
可重现数据:
df <- structure(list(Line = c(649L, 650L, 651L, 670L, 682L, 898L, 899L,
900L, 901L, 1009L, 1010L, 1011L, 1283L, 1467L, 1468L, 1649L,
1650L, 1651L, 1652L, 1653L), Seq = c(9L, 9L, 9L, 10L, 0L, 0L,
0L, 0L, 0L, 15L, 15L, 15L, 22L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
Q = c("q_misc", NA, NA, "q_misc", "q_misc", "q_misc", NA,
NA, NA, "q_misc", NA, NA, "q_misc", "q_misc", NA, "q_misc",
NA, NA, NA, NA)), row.names = 21:40, class = "data.frame")
您可以在将 NA
替换为空值(或 'q_misc'
以外的任何值)后使用 cumsum
。
transform(df, Seq = cumsum(replace(Q, is.na(Q), '') == 'q_misc'))
# Line Seq Q
#21 649 1 q_misc
#22 650 1 <NA>
#23 651 1 <NA>
#24 670 2 q_misc
#25 682 3 q_misc
#26 898 4 q_misc
#27 899 4 <NA>
#28 900 4 <NA>
#29 901 4 <NA>
#30 1009 5 q_misc
#31 1010 5 <NA>
#32 1011 5 <NA>
#33 1283 6 q_misc
#34 1467 7 q_misc
#35 1468 7 <NA>
#36 1649 8 q_misc
#37 1650 8 <NA>
#38 1651 8 <NA>
#39 1652 8 <NA>
#40 1653 8 <NA>
在dplyr
library(dplyr)
df %>% mutate(Seq = cumsum(replace(Q, is.na(Q), '') == 'q_misc'))
如果 Q
列只有 NA
和 'q_misc'
值,您也可以使用。
df %>% mutate(Seq = cumsum(!is.na(Q)))
使用%in%
library(dplyr)
library(tidyr)
df %>%
mutate(Seq = cumsum(Q %in% 'q_misc'))
-输出
Line Seq Q
21 649 1 q_misc
22 650 1 <NA>
23 651 1 <NA>
24 670 2 q_misc
25 682 3 q_misc
26 898 4 q_misc
27 899 4 <NA>
28 900 4 <NA>
29 901 4 <NA>
30 1009 5 q_misc
31 1010 5 <NA>
32 1011 5 <NA>
33 1283 6 q_misc
34 1467 7 q_misc
35 1468 7 <NA>
36 1649 8 q_misc
37 1650 8 <NA>
38 1651 8 <NA>
39 1652 8 <NA>
40 1653 8 <NA>