如何在R中以字符形式构建条件滚动序列
how to build conditional rolling sequence in character form in R
我有一个包含 4 列的数据框
g
表示数据中的某个组id
x
表示我必须对其执行计算的某些字符类型 id
date
是操作日期,用于其他一些分析。(不过我保留了这一列,以便保持记录的顺序。)
action
是要执行的操作。此列有两个值 'add' 和 'reduce',在每个组 g 中二者的数量相等。
也就是说,如果某个组中有 5 条 'add' 记录,那么同一组中也会有 5 条 'reduce' 记录。该列在数据框中存储为有序因子。
基本上,我必须按照以下规则创建一个新列-
每个组中的第一条记录始终是 'add',因此该行直接使用 x 的确切值。
在每个组的第二条及之后的记录中,如果 'action' 值为 'add',则把 x 的值粘贴(追加)到上一条记录的结果之后。
在每个组的第二条及之后的记录中,如果 'action' 值为 'reduce',则从上一条记录的结果中移除 x 的值。
dput
的示例数据是
# Sample data (dput output): 44 rows with columns g, x, date and action.
# - g: integer group id (values 1..8).
# - x: character id, e.g. "1_".
# - date: numeric day counts carrying class "Date".
# - action: ordered factor with levels "add" < "reduce".
# The `groups` attribute and the "grouped_df" class mean the object is already
# grouped by g (8 groups; `.rows` lists the row indices of each group).
df <- structure(list(g = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L,
8L, 8L), x = c("1_", "1_", "2_", "2_", "2_", "1_", "2_", "1_",
"1_", "1_", "1_", "2_", "2_", "2_", "1_", "2_", "3_", "3_", "3_",
"3_", "4_", "4_", "3_", "3_", "3_", "3_", "3_", "3_", "4_", "5_",
"4_", "5_", "6_", "6_", "6_", "6_", "7_", "7_", "7_", "7_", "8_",
"8_", "8_", "8_"), date = structure(c(18262, 18264, 18265, 18266,
18271, 18275, 18275, 18276, 18277, 18279, 18280, 18283, 18286,
18287, 18288, 18291, 18262, 18264, 18275, 18276, 18277, 18288,
18275, 18283, 18291, 18297, 18301, 18309, 18366, 18374, 18375,
18381, 18309, 18319, 18328, 18347, 18364, 18367, 18303, 18309,
18328, 18341, 18341, 18344), class = "Date"), action = structure(c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("add",
"reduce"), class = c("ordered", "factor"))), row.names = c(NA,
-44L), groups = structure(list(g = 1:8, .rows = structure(list(
1:16, 17:22, 23:28, 29:32, 33:36, 37:38, 39:40, 41:44), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
df
# A tibble: 44 x 4
# Groups: g [8]
g x date action
<int> <chr> <date> <ord>
1 1 1_ 2020-01-01 add
2 1 1_ 2020-01-03 add
3 1 2_ 2020-01-04 add
4 1 2_ 2020-01-05 add
5 1 2_ 2020-01-10 reduce
6 1 1_ 2020-01-14 reduce
7 1 2_ 2020-01-14 reduce
8 1 1_ 2020-01-15 reduce
9 1 1_ 2020-01-16 add
10 1 1_ 2020-01-18 add
# ... with 34 more rows
期望的结果(desired result)大致如下:
head(as_tibble(df3_r),12)
# A tibble: 12 x 6
X g x date action result
<int> <int> <chr> <chr> <chr> <chr>
1 1 1 1_ 01-01-2020 add 1_
2 2 1 1_ 03-01-2020 add 1_1_
3 3 1 2_ 04-01-2020 add 1_1_2_
4 4 1 2_ 05-01-2020 add 1_1_2_2_
5 5 1 2_ 10-01-2020 reduce 1_1_2_
6 6 1 1_ 14-01-2020 reduce 1_2_
7 7 1 2_ 14-01-2020 reduce 1_
8 8 1 1_ 15-01-2020 reduce NA
9 9 1 1_ 16-01-2020 add 1_
10 10 1 1_ 18-01-2020 add 1_1_
11 11 1 1_ 19-01-2020 reduce 1_
12 12 1 2_ 22-01-2020 add 1_2_
提前致谢。
您可以使用 purrr
包中的 accumulate2
。
library(dplyr)
library(purrr)
# Build the running id string for every row. df is already a grouped_df
# (grouped by g), so mutate() evaluates x and action within each group.
# accumulate2() starts from the first x of the group; for each later row the
# callback receives the running string, that row's x, and that row's action
# (hence action[-1L], which drops the first row's action).
df %>%
  mutate(result = unlist(accumulate2(
    x, action[-1L],
    function(acc, id, act) {
      if (act == "add") {
        # "add": append this row's id to the running string.
        paste0(acc, id)
      } else {
        # "reduce": drop the first occurrence of the id. fixed = TRUE makes
        # the id match literally instead of being read as a regex.
        sub(id, "", acc, fixed = TRUE)
      }
    }
  )))
输出
g x date action result
1 1 1_ 2020-01-01 add 1_
2 1 1_ 2020-01-03 add 1_1_
3 1 2_ 2020-01-04 add 1_1_2_
4 1 2_ 2020-01-05 add 1_1_2_2_
5 1 2_ 2020-01-10 reduce 1_1_2_
6 1 1_ 2020-01-14 reduce 1_2_
7 1 2_ 2020-01-14 reduce 1_
8 1 1_ 2020-01-15 reduce
9 1 1_ 2020-01-16 add 1_
10 1 1_ 2020-01-18 add 1_1_
11 1 1_ 2020-01-19 reduce 1_
12 1 2_ 2020-01-22 add 1_2_
13 1 2_ 2020-01-25 reduce 1_
14 1 2_ 2020-01-26 add 1_2_
15 1 1_ 2020-01-27 reduce 2_
16 1 2_ 2020-01-30 reduce
17 2 3_ 2020-01-01 add 3_
18 2 3_ 2020-01-03 add 3_3_
19 2 3_ 2020-01-14 reduce 3_
20 2 3_ 2020-01-15 reduce
21 2 4_ 2020-01-16 add 4_
22 2 4_ 2020-01-27 reduce
23 3 3_ 2020-01-14 add 3_
24 3 3_ 2020-01-22 reduce
25 3 3_ 2020-01-30 add 3_
26 3 3_ 2020-02-05 reduce
27 3 3_ 2020-02-09 add 3_
28 3 3_ 2020-02-17 reduce
29 4 4_ 2020-04-14 add 4_
30 4 5_ 2020-04-22 add 4_5_
31 4 4_ 2020-04-23 reduce 5_
32 4 5_ 2020-04-29 reduce
33 5 6_ 2020-02-17 add 6_
34 5 6_ 2020-02-27 add 6_6_
35 5 6_ 2020-03-07 reduce 6_
36 5 6_ 2020-03-26 reduce
37 6 7_ 2020-04-12 add 7_
38 6 7_ 2020-04-15 reduce
39 7 7_ 2020-02-11 add 7_
40 7 7_ 2020-02-17 reduce
41 8 8_ 2020-03-07 add 8_
42 8 8_ 2020-03-20 reduce
43 8 8_ 2020-03-20 add 8_
44 8 8_ 2020-03-23 reduce
我有一个包含 4 列的数据框
g
表示数据中的某个组id
x
表示我必须对其执行计算的某些字符类型 id
date
是操作日期,用于其他一些分析。(不过我保留了这一列,以便保持记录的顺序。)
action
是要执行的操作。此列有两个值 'add' 和 'reduce',在每个组 g 中二者的数量相等。
也就是说,如果某个组中有 5 条 'add' 记录,那么同一组中也会有 5 条 'reduce' 记录。该列在数据框中存储为有序因子。
基本上,我必须按照以下规则创建一个新列-
每个组中的第一条记录始终是 'add',因此该行直接使用 x 的确切值。
在每个组的第二条及之后的记录中,如果 'action' 值为 'add',则把 x 的值粘贴(追加)到上一条记录的结果之后。
在每个组的第二条及之后的记录中,如果 'action' 值为 'reduce',则从上一条记录的结果中移除 x 的值。
dput
的示例数据是
# Sample data (dput output): 44 rows with columns g, x, date and action.
# - g: integer group id (values 1..8).
# - x: character id, e.g. "1_".
# - date: numeric day counts carrying class "Date".
# - action: ordered factor with levels "add" < "reduce".
# The `groups` attribute and the "grouped_df" class mean the object is already
# grouped by g (8 groups; `.rows` lists the row indices of each group).
df <- structure(list(g = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L,
8L, 8L), x = c("1_", "1_", "2_", "2_", "2_", "1_", "2_", "1_",
"1_", "1_", "1_", "2_", "2_", "2_", "1_", "2_", "3_", "3_", "3_",
"3_", "4_", "4_", "3_", "3_", "3_", "3_", "3_", "3_", "4_", "5_",
"4_", "5_", "6_", "6_", "6_", "6_", "7_", "7_", "7_", "7_", "8_",
"8_", "8_", "8_"), date = structure(c(18262, 18264, 18265, 18266,
18271, 18275, 18275, 18276, 18277, 18279, 18280, 18283, 18286,
18287, 18288, 18291, 18262, 18264, 18275, 18276, 18277, 18288,
18275, 18283, 18291, 18297, 18301, 18309, 18366, 18374, 18375,
18381, 18309, 18319, 18328, 18347, 18364, 18367, 18303, 18309,
18328, 18341, 18341, 18344), class = "Date"), action = structure(c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("add",
"reduce"), class = c("ordered", "factor"))), row.names = c(NA,
-44L), groups = structure(list(g = 1:8, .rows = structure(list(
1:16, 17:22, 23:28, 29:32, 33:36, 37:38, 39:40, 41:44), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
df
# A tibble: 44 x 4
# Groups: g [8]
g x date action
<int> <chr> <date> <ord>
1 1 1_ 2020-01-01 add
2 1 1_ 2020-01-03 add
3 1 2_ 2020-01-04 add
4 1 2_ 2020-01-05 add
5 1 2_ 2020-01-10 reduce
6 1 1_ 2020-01-14 reduce
7 1 2_ 2020-01-14 reduce
8 1 1_ 2020-01-15 reduce
9 1 1_ 2020-01-16 add
10 1 1_ 2020-01-18 add
# ... with 34 more rows
期望的结果(desired result)大致如下:
head(as_tibble(df3_r),12)
# A tibble: 12 x 6
X g x date action result
<int> <int> <chr> <chr> <chr> <chr>
1 1 1 1_ 01-01-2020 add 1_
2 2 1 1_ 03-01-2020 add 1_1_
3 3 1 2_ 04-01-2020 add 1_1_2_
4 4 1 2_ 05-01-2020 add 1_1_2_2_
5 5 1 2_ 10-01-2020 reduce 1_1_2_
6 6 1 1_ 14-01-2020 reduce 1_2_
7 7 1 2_ 14-01-2020 reduce 1_
8 8 1 1_ 15-01-2020 reduce NA
9 9 1 1_ 16-01-2020 add 1_
10 10 1 1_ 18-01-2020 add 1_1_
11 11 1 1_ 19-01-2020 reduce 1_
12 12 1 2_ 22-01-2020 add 1_2_
提前致谢。
您可以使用 purrr
包中的 accumulate2
。
library(dplyr)
library(purrr)
# Build the running id string for every row. df is already a grouped_df
# (grouped by g), so mutate() evaluates x and action within each group.
# accumulate2() starts from the first x of the group; for each later row the
# callback receives the running string, that row's x, and that row's action
# (hence action[-1L], which drops the first row's action).
df %>%
  mutate(result = unlist(accumulate2(
    x, action[-1L],
    function(acc, id, act) {
      if (act == "add") {
        # "add": append this row's id to the running string.
        paste0(acc, id)
      } else {
        # "reduce": drop the first occurrence of the id. fixed = TRUE makes
        # the id match literally instead of being read as a regex.
        sub(id, "", acc, fixed = TRUE)
      }
    }
  )))
输出
g x date action result
1 1 1_ 2020-01-01 add 1_
2 1 1_ 2020-01-03 add 1_1_
3 1 2_ 2020-01-04 add 1_1_2_
4 1 2_ 2020-01-05 add 1_1_2_2_
5 1 2_ 2020-01-10 reduce 1_1_2_
6 1 1_ 2020-01-14 reduce 1_2_
7 1 2_ 2020-01-14 reduce 1_
8 1 1_ 2020-01-15 reduce
9 1 1_ 2020-01-16 add 1_
10 1 1_ 2020-01-18 add 1_1_
11 1 1_ 2020-01-19 reduce 1_
12 1 2_ 2020-01-22 add 1_2_
13 1 2_ 2020-01-25 reduce 1_
14 1 2_ 2020-01-26 add 1_2_
15 1 1_ 2020-01-27 reduce 2_
16 1 2_ 2020-01-30 reduce
17 2 3_ 2020-01-01 add 3_
18 2 3_ 2020-01-03 add 3_3_
19 2 3_ 2020-01-14 reduce 3_
20 2 3_ 2020-01-15 reduce
21 2 4_ 2020-01-16 add 4_
22 2 4_ 2020-01-27 reduce
23 3 3_ 2020-01-14 add 3_
24 3 3_ 2020-01-22 reduce
25 3 3_ 2020-01-30 add 3_
26 3 3_ 2020-02-05 reduce
27 3 3_ 2020-02-09 add 3_
28 3 3_ 2020-02-17 reduce
29 4 4_ 2020-04-14 add 4_
30 4 5_ 2020-04-22 add 4_5_
31 4 4_ 2020-04-23 reduce 5_
32 4 5_ 2020-04-29 reduce
33 5 6_ 2020-02-17 add 6_
34 5 6_ 2020-02-27 add 6_6_
35 5 6_ 2020-03-07 reduce 6_
36 5 6_ 2020-03-26 reduce
37 6 7_ 2020-04-12 add 7_
38 6 7_ 2020-04-15 reduce
39 7 7_ 2020-02-11 add 7_
40 7 7_ 2020-02-17 reduce
41 8 8_ 2020-03-07 add 8_
42 8 8_ 2020-03-20 reduce
43 8 8_ 2020-03-20 add 8_
44 8 8_ 2020-03-23 reduce