基于多个条件合并观测值(行)
Combining observations based on multiple conditions
目前我正在写我的硕士论文,但是,我在多个条件下合并行时遇到了一些问题。我在下面说明了我的问题和期望的结果。我希望你能帮助我 :).
这是我的数据集的示例:
# Example session log: one row per (userID, sessionID) pair, with the session
# date and a 0/1 purchase flag (purchases occur on rows 2 and 10).
df <- data.frame(
  userID = rep(c(1, 2, 3), times = c(5, 2, 4)),
  sessionID = as.numeric(c(1:5, 1:2, 1:4)),
  date = as.Date(c(
    "2019-03-15", "2019-03-18", "2019-03-19", "2019-03-21", "2019-03-30",
    "2019-04-05", "2019-06-06",
    "2019-11-22", "2019-12-22", "2019-12-24", "2020-01-15"
  )),
  purchase = c(0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0)
)
现在,我已经使用 dplyr 计算了每个用户相邻两次会话之间的日期差:
library(dplyr)

# Add `diff`: the gap between each session's date and the same user's previous
# session date. lag() has nothing to look back to on a user's first row, so
# `diff` is NA there.
df <- df %>%
  group_by(userID) %>%
  mutate(diff = date - lag(date)) %>%
  # Drop the grouping again so that later steps working on `df` are not
  # silently evaluated per user.
  ungroup()
但是,如果相邻两行之间的日期差小于 10 天,我想把这些行合并起来。我希望这个 10 天的窗口在每次出现新活动(新的 sessionID)时都会重置。另外,如果 purchase 为 1,则当前的合并到此为止,10 天的窗口会从下一个新的 sessionID 重新开始。
我在 dplyr 中尝试了很多使用函数 filter 和 summarize 的东西,但它没有给出想要的结果。另外,我真的不知道怎么加上购买条件。
我想要的结果是这样的:
# Desired result: one row per merged run of sessions, with the joined session
# IDs, the first and last date of the run, and a 0/1 purchase flag.
df2 <- data.frame(
  userID = rep(c(1, 2, 3), times = c(2, 2, 3)),
  sessionID = c("1 + 2", "3 + 4 + 5", "1", "2", "1", "2 + 3", "4"),
  date.start = as.Date(c(
    "2019-03-15", "2019-03-19",
    "2019-04-05", "2019-06-06",
    "2019-11-22", "2019-12-22", "2020-01-15"
  )),
  date.end = as.Date(c(
    "2019-03-18", "2019-03-30",
    "2019-04-05", "2019-06-06",
    "2019-11-22", "2019-12-24", "2020-01-15"
  )),
  purchase = c(1, 0, 0, 0, 0, 1, 0)
)
希望你能帮助我:)
提前致谢!
先按 'userID' 分组;对 'purchase' 的 lag 做累加和(cumsum),这样每当 'purchase' 中出现 1,其后的行就进入一个新的分组;然后再根据相邻 'date' 值之间的差(difference)创建另一个分组,即检查差值是否大于或等于 10 天,并对结果做累加和;最后用 summarise 汇总:把 'sessionID' 粘贴在一起(str_c),取 'date' 的第一个(first)和最后一个(last)元素,并用 any 判断 'purchase' 中是否有值为 1,作为摘要列。
library(dplyr)
library(stringr)

# Merge each user's sessions into runs and print one summary row per run.
df %>%
  group_by(userID) %>%
  # grp: cumulative sum of the previous row's purchase flag, so the row that
  # follows a purchase falls into a new group.
  mutate(grp = cumsum(lag(purchase, default = first(purchase)))) %>%
  group_by(grp, .add = TRUE) %>%
  # cat: within (userID, grp), bump the counter whenever the gap to the
  # previous date is 10 days or more.
  mutate(
    cat = cumsum(
      as.numeric(date - lag(date, default = first(date)), units = "days") >= 10
    )
  ) %>%
  group_by(cat, .add = TRUE) %>%
  summarise(
    sessionID = str_c(sessionID, collapse = " + "),
    date.start = first(date),
    date.end = last(date),
    # 1L if any session in the run was a purchase, else 0L
    purchase = as.integer(any(purchase == 1)),
    .groups = "drop"
  ) %>%
  # The helper grouping keys are not part of the result.
  select(!c(grp, cat))
-输出
# A tibble: 7 x 5
userID sessionID date.start date.end purchase
<dbl> <chr> <date> <date> <int>
1 1 1 + 2 2019-03-15 2019-03-18 1
2 1 3 + 4 + 5 2019-03-19 2019-03-30 0
3 2 1 2019-04-05 2019-04-05 0
4 2 2 2019-06-06 2019-06-06 0
5 3 1 2019-11-22 2019-11-22 0
6 3 2 + 3 2019-12-22 2019-12-24 1
7 3 4 2020-01-15 2020-01-15 0
献给我亲爱的朋友@akrun
这只是得到最终输出的另一种方式,它不如亲爱的 @akrun 给出的方案那样优雅和简洁。事实上,我在这上面花了好几个小时,所以把它做完对我来说非常重要。和往常一样,我受到了亲爱的 @akrun 的启发。希望它对你有用:
library(dplyr)
library(purrr)

# Merge sessions into runs by appending one summary row to every
# (userID, cum, diff) group and then keeping only those summary rows.
# NOTE(review): the `date:diff` column ranges below rely on the column order
# userID, sessionID, date, purchase followed by the helper columns created
# here — confirm `df` is the original example data frame.
df %>%
  mutate(
    # cum: number of purchases seen on earlier rows. A purchase row itself
    # keeps the previous count, so it stays in the run it closes.
    # `default = 0` keeps `cum` from becoming NA when the very first row is a
    # purchase; an NA there would make that raw row survive the is.na() filter
    # further down.
    cum = cumsum(purchase == 1),
    cum = ifelse(cum - lag(cum, default = 0) == 1, lag(cum, default = 0), cum),
    # Days: gap to the previous row's date (0 for the first row).
    Days = as.numeric(date - lag(date, default = first(date)))
  ) %>%
  # The gap counter below must run over the whole table, so drop any grouping.
  ungroup() %>%
  mutate(
    # diff: counter that increases whenever the gap is 10 days or more.
    diff = cumsum(ifelse(Days < 10, 0, 1)),
    start = date,
    end = date
  ) %>%
  # sessionID must be character so the pasted IDs fit into the same column.
  mutate(across(sessionID, as.character)) %>%
  group_split(userID, cum, diff) %>%
  # Append a summary row per group; columns not set in add_row() are NA there.
  map_dfr(~ add_row(
    .x,
    userID = .x$userID[1],
    sessionID = paste(.x$sessionID, collapse = "+"),
    start = .x$date[1],
    end = .x$date[length(.x$date)]
  )) %>%
  # Only the appended summary rows have NA in the date..diff columns.
  filter(if_any(date:diff, ~ is.na(.x))) %>%
  select(!date:diff)
# A tibble: 7 x 4
userID sessionID start end
<dbl> <chr> <date> <date>
1 1 1+2 2019-03-15 2019-03-18
2 1 3+4+5 2019-03-19 2019-03-30
3 2 1 2019-04-05 2019-04-05
4 2 2 2019-06-06 2019-06-06
5 3 1 2019-11-22 2019-11-22
6 3 2+3 2019-12-22 2019-12-24
7 3 4 2020-01-15 2020-01-15
另一种使用 accumulate2 的 tidyverse 策略:
# Example session log: one row per (userID, sessionID) pair, with the session
# date and a 0/1 purchase flag (purchases occur on rows 2 and 10).
df <- data.frame(
  userID = rep(c(1, 2, 3), times = c(5, 2, 4)),
  sessionID = as.numeric(c(1:5, 1:2, 1:4)),
  date = as.Date(c(
    "2019-03-15", "2019-03-18", "2019-03-19", "2019-03-21", "2019-03-30",
    "2019-04-05", "2019-06-06",
    "2019-11-22", "2019-12-22", "2019-12-24", "2020-01-15"
  )),
  purchase = c(0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0)
)
library(tidyverse)

# Merge each user's sessions into runs with a running group counter built by
# accumulate2(), then print one summary row per run.
df %>%
  # grp starts a new block at every row whose sessionID is 1.
  # NOTE(review): presumably sessionID restarts at 1 for every user — confirm.
  group_by(userID, grp = cumsum(sessionID == 1)) %>%
  # diff: gap in days to the previous row of the block (0 for its first row).
  mutate(diff = as.numeric(date - lag(date, default = first(date)))) %>%
  group_by(
    # accumulate2() starts from diff[1] (which is 0) and calls the function
    # with (counter so far, this row's gap, previous row's purchase flag).
    # The counter is bumped when the gap is 10 days or more — sessions are
    # merged only when the gap is *less than* 10 days — or when the previous
    # row was a purchase.
    grp2 = accumulate2(
      diff, purchase[-n()],
      ~ if (..2 >= 10 || ..3 == 1) ..1 + 1 else ..1
    ),
    .add = TRUE
  ) %>%
  summarise(
    sessionID = paste(sessionID, collapse = " + "),
    start_date = first(date),
    end_date = last(date),
    .groups = "drop"
  ) %>%
  # Drop the helper keys grp and grp2.
  select(!starts_with("grp"))
#> # A tibble: 7 x 4
#> userID sessionID start_date end_date
#> <dbl> <chr> <date> <date>
#> 1 1 1 + 2 2019-03-15 2019-03-18
#> 2 1 3 + 4 + 5 2019-03-19 2019-03-30
#> 3 2 1 2019-04-05 2019-04-05
#> 4 2 2 2019-06-06 2019-06-06
#> 5 3 1 2019-11-22 2019-11-22
#> 6 3 2 + 3 2019-12-22 2019-12-24
#> 7 3 4 2020-01-15 2020-01-15
由 reprex 包 (v2.0.0) 于 2021 年 6 月 10 日创建
目前我正在写我的硕士论文,但是,我在多个条件下合并行时遇到了一些问题。我在下面说明了我的问题和期望的结果。我希望你能帮助我 :).
这是我的数据集的示例:
# Example session log: one row per (userID, sessionID) pair, with the session
# date and a 0/1 purchase flag (purchases occur on rows 2 and 10).
df <- data.frame(
  userID = rep(c(1, 2, 3), times = c(5, 2, 4)),
  sessionID = as.numeric(c(1:5, 1:2, 1:4)),
  date = as.Date(c(
    "2019-03-15", "2019-03-18", "2019-03-19", "2019-03-21", "2019-03-30",
    "2019-04-05", "2019-06-06",
    "2019-11-22", "2019-12-22", "2019-12-24", "2020-01-15"
  )),
  purchase = c(0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0)
)
现在,我已经使用 dplyr 计算了每个用户相邻两次会话之间的日期差:
library(dplyr)

# Add `diff`: the gap between each session's date and the same user's previous
# session date. lag() has nothing to look back to on a user's first row, so
# `diff` is NA there.
df <- df %>%
  group_by(userID) %>%
  mutate(diff = date - lag(date)) %>%
  # Drop the grouping again so that later steps working on `df` are not
  # silently evaluated per user.
  ungroup()
但是,如果相邻两行之间的日期差小于 10 天,我想把这些行合并起来。我希望这个 10 天的窗口在每次出现新活动(新的 sessionID)时都会重置。另外,如果 purchase 为 1,则当前的合并到此为止,10 天的窗口会从下一个新的 sessionID 重新开始。
我在 dplyr 中尝试了很多使用函数 filter 和 summarize 的东西,但它没有给出想要的结果。另外,我真的不知道怎么加上购买条件。
我想要的结果是这样的:
# Desired result: one row per merged run of sessions, with the joined session
# IDs, the first and last date of the run, and a 0/1 purchase flag.
df2 <- data.frame(
  userID = rep(c(1, 2, 3), times = c(2, 2, 3)),
  sessionID = c("1 + 2", "3 + 4 + 5", "1", "2", "1", "2 + 3", "4"),
  date.start = as.Date(c(
    "2019-03-15", "2019-03-19",
    "2019-04-05", "2019-06-06",
    "2019-11-22", "2019-12-22", "2020-01-15"
  )),
  date.end = as.Date(c(
    "2019-03-18", "2019-03-30",
    "2019-04-05", "2019-06-06",
    "2019-11-22", "2019-12-24", "2020-01-15"
  )),
  purchase = c(1, 0, 0, 0, 0, 1, 0)
)
希望你能帮助我:) 提前致谢!
先按 'userID' 分组;对 'purchase' 的 lag 做累加和(cumsum),这样每当 'purchase' 中出现 1,其后的行就进入一个新的分组;然后再根据相邻 'date' 值之间的差(difference)创建另一个分组,即检查差值是否大于或等于 10 天,并对结果做累加和;最后用 summarise 汇总:把 'sessionID' 粘贴在一起(str_c),取 'date' 的第一个(first)和最后一个(last)元素,并用 any 判断 'purchase' 中是否有值为 1,作为摘要列。
library(dplyr)
library(stringr)

# Merge each user's sessions into runs and print one summary row per run.
df %>%
  group_by(userID) %>%
  # grp: cumulative sum of the previous row's purchase flag, so the row that
  # follows a purchase falls into a new group.
  mutate(grp = cumsum(lag(purchase, default = first(purchase)))) %>%
  group_by(grp, .add = TRUE) %>%
  # cat: within (userID, grp), bump the counter whenever the gap to the
  # previous date is 10 days or more.
  mutate(
    cat = cumsum(
      as.numeric(date - lag(date, default = first(date)), units = "days") >= 10
    )
  ) %>%
  group_by(cat, .add = TRUE) %>%
  summarise(
    sessionID = str_c(sessionID, collapse = " + "),
    date.start = first(date),
    date.end = last(date),
    # 1L if any session in the run was a purchase, else 0L
    purchase = as.integer(any(purchase == 1)),
    .groups = "drop"
  ) %>%
  # The helper grouping keys are not part of the result.
  select(!c(grp, cat))
-输出
# A tibble: 7 x 5
userID sessionID date.start date.end purchase
<dbl> <chr> <date> <date> <int>
1 1 1 + 2 2019-03-15 2019-03-18 1
2 1 3 + 4 + 5 2019-03-19 2019-03-30 0
3 2 1 2019-04-05 2019-04-05 0
4 2 2 2019-06-06 2019-06-06 0
5 3 1 2019-11-22 2019-11-22 0
6 3 2 + 3 2019-12-22 2019-12-24 1
7 3 4 2020-01-15 2020-01-15 0
献给我亲爱的朋友 @akrun。这只是得到最终输出的另一种方式,它不如亲爱的 @akrun 给出的方案那样优雅和简洁。事实上,我在这上面花了好几个小时,所以把它做完对我来说非常重要。和往常一样,我受到了亲爱的 @akrun 的启发。希望它对你有用:
library(dplyr)
library(purrr)

# Merge sessions into runs by appending one summary row to every
# (userID, cum, diff) group and then keeping only those summary rows.
# NOTE(review): the `date:diff` column ranges below rely on the column order
# userID, sessionID, date, purchase followed by the helper columns created
# here — confirm `df` is the original example data frame.
df %>%
  mutate(
    # cum: number of purchases seen on earlier rows. A purchase row itself
    # keeps the previous count, so it stays in the run it closes.
    # `default = 0` keeps `cum` from becoming NA when the very first row is a
    # purchase; an NA there would make that raw row survive the is.na() filter
    # further down.
    cum = cumsum(purchase == 1),
    cum = ifelse(cum - lag(cum, default = 0) == 1, lag(cum, default = 0), cum),
    # Days: gap to the previous row's date (0 for the first row).
    Days = as.numeric(date - lag(date, default = first(date)))
  ) %>%
  # The gap counter below must run over the whole table, so drop any grouping.
  ungroup() %>%
  mutate(
    # diff: counter that increases whenever the gap is 10 days or more.
    diff = cumsum(ifelse(Days < 10, 0, 1)),
    start = date,
    end = date
  ) %>%
  # sessionID must be character so the pasted IDs fit into the same column.
  mutate(across(sessionID, as.character)) %>%
  group_split(userID, cum, diff) %>%
  # Append a summary row per group; columns not set in add_row() are NA there.
  map_dfr(~ add_row(
    .x,
    userID = .x$userID[1],
    sessionID = paste(.x$sessionID, collapse = "+"),
    start = .x$date[1],
    end = .x$date[length(.x$date)]
  )) %>%
  # Only the appended summary rows have NA in the date..diff columns.
  filter(if_any(date:diff, ~ is.na(.x))) %>%
  select(!date:diff)
# A tibble: 7 x 4
userID sessionID start end
<dbl> <chr> <date> <date>
1 1 1+2 2019-03-15 2019-03-18
2 1 3+4+5 2019-03-19 2019-03-30
3 2 1 2019-04-05 2019-04-05
4 2 2 2019-06-06 2019-06-06
5 3 1 2019-11-22 2019-11-22
6 3 2+3 2019-12-22 2019-12-24
7 3 4 2020-01-15 2020-01-15
另一种使用 accumulate2 的 tidyverse 策略:
# Example session log: one row per (userID, sessionID) pair, with the session
# date and a 0/1 purchase flag (purchases occur on rows 2 and 10).
df <- data.frame(
  userID = rep(c(1, 2, 3), times = c(5, 2, 4)),
  sessionID = as.numeric(c(1:5, 1:2, 1:4)),
  date = as.Date(c(
    "2019-03-15", "2019-03-18", "2019-03-19", "2019-03-21", "2019-03-30",
    "2019-04-05", "2019-06-06",
    "2019-11-22", "2019-12-22", "2019-12-24", "2020-01-15"
  )),
  purchase = c(0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0)
)
library(tidyverse)

# Merge each user's sessions into runs with a running group counter built by
# accumulate2(), then print one summary row per run.
df %>%
  # grp starts a new block at every row whose sessionID is 1.
  # NOTE(review): presumably sessionID restarts at 1 for every user — confirm.
  group_by(userID, grp = cumsum(sessionID == 1)) %>%
  # diff: gap in days to the previous row of the block (0 for its first row).
  mutate(diff = as.numeric(date - lag(date, default = first(date)))) %>%
  group_by(
    # accumulate2() starts from diff[1] (which is 0) and calls the function
    # with (counter so far, this row's gap, previous row's purchase flag).
    # The counter is bumped when the gap is 10 days or more — sessions are
    # merged only when the gap is *less than* 10 days — or when the previous
    # row was a purchase.
    grp2 = accumulate2(
      diff, purchase[-n()],
      ~ if (..2 >= 10 || ..3 == 1) ..1 + 1 else ..1
    ),
    .add = TRUE
  ) %>%
  summarise(
    sessionID = paste(sessionID, collapse = " + "),
    start_date = first(date),
    end_date = last(date),
    .groups = "drop"
  ) %>%
  # Drop the helper keys grp and grp2.
  select(!starts_with("grp"))
#> # A tibble: 7 x 4
#> userID sessionID start_date end_date
#> <dbl> <chr> <date> <date>
#> 1 1 1 + 2 2019-03-15 2019-03-18
#> 2 1 3 + 4 + 5 2019-03-19 2019-03-30
#> 3 2 1 2019-04-05 2019-04-05
#> 4 2 2 2019-06-06 2019-06-06
#> 5 3 1 2019-11-22 2019-11-22
#> 6 3 2 + 3 2019-12-22 2019-12-24
#> 7 3 4 2020-01-15 2020-01-15
由 reprex 包 (v2.0.0) 于 2021 年 6 月 10 日创建