r 在 dplyr accumulate 函数上添加一个分组
r adding a group by on dplyr accumulate function
对于如下的测试数据集,
# Sample data: five button presses, all recorded for user 16338.
testdf1 <- structure(
  list(
    ShinyUsrId = rep(16338, 5),
    ButtonId = c(11, 12, 11, 11, 14),
    V3 = c(1519971165, 1520741372, 1520741372, 1521094311, 1513576204),
    timediff = c(0, 770207, 0, 0, 0)
  ),
  row.names = c(NA, -5L),
  class = "data.frame"
)
ShinyUsrId ButtonId Button_PressDate time_diff
16338 11 2018-03-02 06:12:45 0
16338 12 2018-03-11 04:09:32 770207
16338 11 2018-03-11 04:09:32 0
16338 11 2018-03-15 06:11:51 0
16338 14 2017-12-18 05:50:04 0
下面的代码折叠了时差 (timediff) 小于 60 秒的后续行。
# Running group counter; the reducer below updates it through `<<-`.
g <- 0
# Label every row with a group id, then collapse each group into one row.
Collpase_testdf1 <- mutate(testdf1, date_groups =
# Per purrr's accumulate(), x is the value returned for the previous element
# (not the previous timediff) and y is the current timediff.
accumulate(testdf1$timediff, function(x, y)
# Difference below 60: stay in the current group by returning the counter.
if (y - x < 60)
g
else {
# Otherwise advance the global counter; the assigned value is the result.
g <<- g + 1
})) %>%
group_by(date_groups) %>%
summarise(
# Join each group's values into one comma-separated string per column.
ButtonId = paste(ButtonId , collapse = ", "),
ShinyUsrId = paste(ShinyUsrId, collapse = ", "),
time_diff = paste(timediff, collapse = ", ")
)
所以输出看起来像这样,符合预期。
date_groups ButtonId ShinyUsrId time_diff
0 11 16338 0
1 12, 11, 11, 14 16338, 16338, 16338, 16338 770207, 0, 0, 0
现在,如果我的数据集包含不同的 UserId,如下所示
# Sample data: five button presses spread over three different users.
testdf1 <- structure(
  list(
    ShinyUsrId = c(16338, 16338, 15148, 84756, 84756),
    ButtonId = c(11, 12, 11, 11, 14),
    V3 = c(1519971165, 1520741372, 1520741372, 1521094311, 1513576204),
    timediff = c(0, 770207, 0, 0, 0)
  ),
  row.names = c(NA, -5L),
  class = "data.frame"
)
ShinyUsrId ButtonId Button_PressDate time_diff
16338 11 2018-03-02 06:12:45 0
16338 12 2018-03-11 04:09:32 770207
15148 11 2018-03-11 04:09:32 0
84756 11 2018-03-15 06:11:51 0
84756 14 2017-12-18 05:50:04 0
如何包含 group_by 参数,以便我的输出如下所示
date_groups ButtonId ShinyUsrId time_diff
0 11 16338 0
1 12 16338 770207
2 11 15148 0
3 11,14 84756 0,0
我知道我可以使用 for 循环来做到这一点,但我很好奇如何使用 group_by 参数来做到这一点?
测试数据集 2
# Test dataset 2: five button presses, all recorded for user 1765.
testdf1 <- structure(
  list(
    ShinyUsrId = rep(1765, 5),
    ButtonId = c(18, 18, 17, 17, 121),
    V3 = c(1519971165, 1520741372, 1520741372, 1521094311, 1513576204),
    timediff = c(0, 880, 3502, 13148814, 1210)
  ),
  row.names = c(NA, -5L),
  class = "data.frame"
)
ShinyUsrId ButtonId Button_PressDate time_diff
1765 18 2018-03-02 06:12:45 0
1765 18 2018-03-11 04:09:32 880
1765 17 2018-03-11 04:09:32 3502
1765 17 2018-03-15 06:11:51 13148814
1765 121 2017-12-18 05:50:04 1210
预期输出
date_groups ButtonId ShinyUsrId time_diff
0 18 1765 0
1 18 1765 880
2 17 1765 3502
3 17 1765 13148814
4 121 1765 1210
测试数据集场景 3
# Scenario 3: twelve button presses by user 13679, several sharing a timestamp.
testdf1 <- structure(
  list(
    ShinyUsrId = rep(13679, 12),
    ButtonId = c(23, 184, 184, 23, 184, 184, 23, 23, 184, 184, 184, 23),
    # Bound with `=` so the timestamps are only a list element; using `<-`
    # here also created a stray global `ButtonPressDate`. The `.Names`
    # argument below already labelled this column "V3".
    V3 = lubridate::ymd_hms(c(
      "2017-11-05 06:34:59", "2017-11-05 06:34:59", "2017-12-07 00:27:53",
      "2017-12-07 00:53:47", "2017-12-07 01:03:05", "2018-03-08 00:28:09",
      "2018-03-08 00:28:09", "2018-03-08 00:45:02", "2018-03-08 00:45:02",
      "2018-03-24 13:13:15", "2018-05-05 06:22:57", "2018-05-05 06:22:57"
    )),
    timediff = c(
      0, 0, 2742774, 1554, 558, 7860304, 0, 1013, 0, 1427293, 3604182, 0
    )
  ),
  .Names = c("ShinyUsrId", "ButtonId", "V3", "timediff"),
  row.names = c(NA, -12L),
  class = "data.frame"
)
ShinyUsrId ButtonId ButtonPressDate timediff
13679 23 2017-11-05 06:34:59 0
13679 184 2017-11-05 06:34:59 0
13679 184 2017-12-07 00:27:53 2742774
13679 23 2017-12-07 00:53:47 1554
13679 184 2017-12-07 01:03:05 558
13679 184 2018-03-08 00:28:09 7860304
13679 23 2018-03-08 00:28:09 0
13679 23 2018-03-08 00:45:02 1013
13679 184 2018-03-08 00:45:02 0
13679 184 2018-03-24 13:13:15 1427293
13679 184 2018-05-05 06:22:57 3604182
13679 23 2018-05-05 06:22:57 0
预期输出
date_groups ButtonId ShinyUsrId timediff
0 23, 184 13679,13679 0,0
1 184 13679 2742774
2 23 13679 1554
3 184 13679 558
4 184, 23 13679, 13679 7860304, 0
5 23, 184 13679, 13679 1013, 0
6 184 13679 1427293
7 184, 23 13679, 13679 3604182, 0
由于您的函数使用了 <<- 运算符,我不确定能否直接这样做。一种方法是按 ShinyUsrId 将 data.frame 拆分为由多个 data.frame 组成的 list,然后使用 map_dfr();不过,何不去掉 <<- 和 accumulate(),改用 lag() 呢?
# Assign group ids per user without global state: within each ShinyUsrId a new
# group starts whenever timediff rises by more than 60 over the previous row.
# NOTE(review): this lag-based rule does not reproduce the scenario 3 expected
# output shown in this file, whereas `cumsum(timediff >= 60)` does; confirm
# which rule is intended.
Collpase_testdf1 <- testdf1 %>%
  group_by(ShinyUsrId) %>%
  # The column is named `timediff`; `time_diff` does not exist in testdf1.
  mutate(date_groups = cumsum(timediff - lag(timediff, default = 0) > 60)) %>%
  group_by(ShinyUsrId, date_groups) %>%
  # Collapse each (user, group) pair into one row of comma-separated values.
  summarise(
    ButtonId = paste(ButtonId, collapse = ", "),
    time_diff = paste(timediff, collapse = ", ")
  )
对于如下的测试数据集,
# Sample data: five button presses, all recorded for user 16338.
testdf1 <- structure(
  list(
    ShinyUsrId = rep(16338, 5),
    ButtonId = c(11, 12, 11, 11, 14),
    V3 = c(1519971165, 1520741372, 1520741372, 1521094311, 1513576204),
    timediff = c(0, 770207, 0, 0, 0)
  ),
  row.names = c(NA, -5L),
  class = "data.frame"
)
ShinyUsrId ButtonId Button_PressDate time_diff
16338 11 2018-03-02 06:12:45 0
16338 12 2018-03-11 04:09:32 770207
16338 11 2018-03-11 04:09:32 0
16338 11 2018-03-15 06:11:51 0
16338 14 2017-12-18 05:50:04 0
下面的代码折叠了时差 (timediff) 小于 60 秒的后续行。
# Running group counter; the reducer below updates it through `<<-`.
g <- 0
# Label every row with a group id, then collapse each group into one row.
Collpase_testdf1 <- mutate(testdf1, date_groups =
# Per purrr's accumulate(), x is the value returned for the previous element
# (not the previous timediff) and y is the current timediff.
accumulate(testdf1$timediff, function(x, y)
# Difference below 60: stay in the current group by returning the counter.
if (y - x < 60)
g
else {
# Otherwise advance the global counter; the assigned value is the result.
g <<- g + 1
})) %>%
group_by(date_groups) %>%
summarise(
# Join each group's values into one comma-separated string per column.
ButtonId = paste(ButtonId , collapse = ", "),
ShinyUsrId = paste(ShinyUsrId, collapse = ", "),
time_diff = paste(timediff, collapse = ", ")
)
所以输出看起来像这样,符合预期。
date_groups ButtonId ShinyUsrId time_diff
0 11 16338 0
1 12, 11, 11, 14 16338, 16338, 16338, 16338 770207, 0, 0, 0
现在,如果我的数据集包含不同的 UserId,如下所示
# Sample data: five button presses spread over three different users.
testdf1 <- structure(
  list(
    ShinyUsrId = c(16338, 16338, 15148, 84756, 84756),
    ButtonId = c(11, 12, 11, 11, 14),
    V3 = c(1519971165, 1520741372, 1520741372, 1521094311, 1513576204),
    timediff = c(0, 770207, 0, 0, 0)
  ),
  row.names = c(NA, -5L),
  class = "data.frame"
)
ShinyUsrId ButtonId Button_PressDate time_diff
16338 11 2018-03-02 06:12:45 0
16338 12 2018-03-11 04:09:32 770207
15148 11 2018-03-11 04:09:32 0
84756 11 2018-03-15 06:11:51 0
84756 14 2017-12-18 05:50:04 0
如何包含 group_by 参数,以便我的输出如下所示
date_groups ButtonId ShinyUsrId time_diff
0 11 16338 0
1 12 16338 770207
2 11 15148 0
3 11,14 84756 0,0
我知道我可以使用 for 循环来做到这一点,但我很好奇如何使用 group_by 参数来做到这一点?
测试数据集 2
# Test dataset 2: five button presses, all recorded for user 1765.
testdf1 <- structure(
  list(
    ShinyUsrId = rep(1765, 5),
    ButtonId = c(18, 18, 17, 17, 121),
    V3 = c(1519971165, 1520741372, 1520741372, 1521094311, 1513576204),
    timediff = c(0, 880, 3502, 13148814, 1210)
  ),
  row.names = c(NA, -5L),
  class = "data.frame"
)
ShinyUsrId ButtonId Button_PressDate time_diff
1765 18 2018-03-02 06:12:45 0
1765 18 2018-03-11 04:09:32 880
1765 17 2018-03-11 04:09:32 3502
1765 17 2018-03-15 06:11:51 13148814
1765 121 2017-12-18 05:50:04 1210
预期输出
date_groups ButtonId ShinyUsrId time_diff
0 18 1765 0
1 18 1765 880
2 17 1765 3502
3 17 1765 13148814
4 121 1765 1210
测试数据集场景 3
# Scenario 3: twelve button presses by user 13679, several sharing a timestamp.
testdf1 <- structure(
  list(
    ShinyUsrId = rep(13679, 12),
    ButtonId = c(23, 184, 184, 23, 184, 184, 23, 23, 184, 184, 184, 23),
    # Bound with `=` so the timestamps are only a list element; using `<-`
    # here also created a stray global `ButtonPressDate`. The `.Names`
    # argument below already labelled this column "V3".
    V3 = lubridate::ymd_hms(c(
      "2017-11-05 06:34:59", "2017-11-05 06:34:59", "2017-12-07 00:27:53",
      "2017-12-07 00:53:47", "2017-12-07 01:03:05", "2018-03-08 00:28:09",
      "2018-03-08 00:28:09", "2018-03-08 00:45:02", "2018-03-08 00:45:02",
      "2018-03-24 13:13:15", "2018-05-05 06:22:57", "2018-05-05 06:22:57"
    )),
    timediff = c(
      0, 0, 2742774, 1554, 558, 7860304, 0, 1013, 0, 1427293, 3604182, 0
    )
  ),
  .Names = c("ShinyUsrId", "ButtonId", "V3", "timediff"),
  row.names = c(NA, -12L),
  class = "data.frame"
)
ShinyUsrId ButtonId ButtonPressDate timediff
13679 23 2017-11-05 06:34:59 0
13679 184 2017-11-05 06:34:59 0
13679 184 2017-12-07 00:27:53 2742774
13679 23 2017-12-07 00:53:47 1554
13679 184 2017-12-07 01:03:05 558
13679 184 2018-03-08 00:28:09 7860304
13679 23 2018-03-08 00:28:09 0
13679 23 2018-03-08 00:45:02 1013
13679 184 2018-03-08 00:45:02 0
13679 184 2018-03-24 13:13:15 1427293
13679 184 2018-05-05 06:22:57 3604182
13679 23 2018-05-05 06:22:57 0
预期输出
date_groups ButtonId ShinyUsrId timediff
0 23, 184 13679,13679 0,0
1 184 13679 2742774
2 23 13679 1554
3 184 13679 558
4 184, 23 13679, 13679 7860304, 0
5 23, 184 13679, 13679 1013, 0
6 184 13679 1427293
7 184, 23 13679, 13679 3604182, 0
由于您的函数使用了 <<- 运算符,我不确定能否直接这样做。一种方法是按 ShinyUsrId 将 data.frame 拆分为由多个 data.frame 组成的 list,然后使用 map_dfr();不过,何不去掉 <<- 和 accumulate(),改用 lag() 呢?
# Assign group ids per user without global state: within each ShinyUsrId a new
# group starts whenever timediff rises by more than 60 over the previous row.
# NOTE(review): this lag-based rule does not reproduce the scenario 3 expected
# output shown in this file, whereas `cumsum(timediff >= 60)` does; confirm
# which rule is intended.
Collpase_testdf1 <- testdf1 %>%
  group_by(ShinyUsrId) %>%
  # The column is named `timediff`; `time_diff` does not exist in testdf1.
  mutate(date_groups = cumsum(timediff - lag(timediff, default = 0) > 60)) %>%
  group_by(ShinyUsrId, date_groups) %>%
  # Collapse each (user, group) pair into one row of comma-separated values.
  summarise(
    ButtonId = paste(ButtonId, collapse = ", "),
    time_diff = paste(timediff, collapse = ", ")
  )