根据日期在组内拆分重叠行
Splitting overlapping rows, within groups, based on dates
我正在尝试根据现有行的重叠时间段创建新行。例如,我想把这个:
# Sample input: one row per customer/product pair with the period during
# which that product was active (dates are kept as character strings).
Customer_Product <- data.table(
  Customer = rep(c("A01", "A02", "A03"), each = 3),
  Product = rep(c("Prod1", "Prod2", "Prod3"), times = 3),
  Start_Date = rep(c("1/1/2015", "3/1/2015", "4/1/2015"), times = 3),
  End_Date = c(
    "2/1/2015", "5/1/2015", "5/1/2015", # A01
    "2/1/2015", "5/1/2015", "6/1/2015", # A02
    "2/1/2015", "6/1/2015", "5/1/2015"  # A03
  )
)
Customer Product Start_Date End_Date
1: A01 Prod1 1/1/2015 2/1/2015
2: A01 Prod2 3/1/2015 5/1/2015
3: A01 Prod3 4/1/2015 5/1/2015
4: A02 Prod1 1/1/2015 2/1/2015
5: A02 Prod2 3/1/2015 5/1/2015
6: A02 Prod3 4/1/2015 6/1/2015
7: A03 Prod1 1/1/2015 2/1/2015
8: A03 Prod2 3/1/2015 6/1/2015
9: A03 Prod3 4/1/2015 5/1/2015
变成这样:
# Desired output: each customer's timeline split at every start/end date,
# with overlapping products joined by "/" for the span they share.
Customer_Product_Combo <- data.table(
  Customer = rep(c("A01", "A02", "A03"), times = c(3, 4, 4)),
  Product_or_Combination = c(
    "Prod1", "Prod2", "Prod2/Prod3",          # A01
    "Prod1", "Prod2", "Prod2/Prod3", "Prod3", # A02
    "Prod1", "Prod2", "Prod2/Prod3", "Prod2"  # A03
  ),
  Start_Date = c(
    "1/1/2015", "3/1/2015", "4/1/2015",             # A01
    "1/1/2015", "3/1/2015", "4/1/2015", "5/1/2015", # A02
    "1/1/2015", "3/1/2015", "4/1/2015", "5/1/2015"  # A03
  ),
  End_Date = c(
    "2/1/2015", "4/1/2015", "5/1/2015",             # A01
    "2/1/2015", "4/1/2015", "5/1/2015", "6/1/2015", # A02
    "2/1/2015", "4/1/2015", "5/1/2015", "6/1/2015"  # A03
  )
)
Customer Product_or_Combination Start_Date End_Date
1: A01 Prod1 1/1/2015 2/1/2015
2: A01 Prod2 3/1/2015 4/1/2015
3: A01 Prod2/Prod3 4/1/2015 5/1/2015
4: A02 Prod1 1/1/2015 2/1/2015
5: A02 Prod2 3/1/2015 4/1/2015
6: A02 Prod2/Prod3 4/1/2015 5/1/2015
7: A02 Prod3 5/1/2015 6/1/2015
8: A03 Prod1 1/1/2015 2/1/2015
9: A03 Prod2 3/1/2015 4/1/2015
10: A03 Prod2/Prod3 4/1/2015 5/1/2015
11: A03 Prod2 5/1/2015 6/1/2015
我一直在研究 IRanges,因为看起来 disjoin() 可能是一个可能的解决方案,但我看不到 inherit/merge "Prod" 数据的任何方法。
我也一直在尝试在 dplyr 中使用 lead/lag 勾勒出一些东西,然后是 gather/merge 循环,但同样值得注意的是,我可以有超过 2 "Prod"重叠,然后逻辑就变得混乱了。
有没有合理的方法来做到这一点?非常感谢任何帮助!
我使用的是您发布的数据(转换为 data.frame):
# Same sample data as in the question, built as a plain data.frame.
# Dates are still character strings at this point.
Customer_Product <- data.frame(
  Customer = rep(c("A01", "A02", "A03"), each = 3),
  Product = rep(c("Prod1", "Prod2", "Prod3"), times = 3),
  Start_Date = rep(c("1/1/2015", "3/1/2015", "4/1/2015"), times = 3),
  End_Date = c(
    "2/1/2015", "5/1/2015", "5/1/2015", # A01
    "2/1/2015", "5/1/2015", "6/1/2015", # A02
    "2/1/2015", "6/1/2015", "5/1/2015"  # A03
  )
)
这是一个可能的解决方案:
library(tidyverse)
library(data.table)
library(lubridate)
# Split each customer's product periods into consecutive, non-overlapping
# date ranges, labelling every range with the product(s) active during it.
Customer_Product %>%
  # Parse every *Date column from day/month/year strings into Date values.
  mutate_at(vars(matches("Date")), dmy) %>%
  # One list element per row: every calendar day from start to end, inclusive.
  mutate(day = map2(Start_Date, End_Date, ~ seq(.x, .y, "day"))) %>%
  # Expand to one row per day. The column is named explicitly because a bare
  # unnest() is deprecated in tidyr >= 1.0 and emits a warning.
  unnest(day) %>%
  # For each customer and day, join the products active on that day.
  group_by(Customer, day) %>%
  summarise(Product = paste0(Product, collapse = "/")) %>%
  # rleid() starts a new id whenever the combination changes between
  # consecutive days, so a combination that recurs later (e.g. Prod2 before
  # and after Prod2/Prod3) yields separate ranges instead of one merged range.
  group_by(Customer, Product, id = rleid(Product)) %>%
  # Collapse each run of days back into a single start/end range.
  summarise(
    Start_Date = min(day),
    End_Date = max(day)
  ) %>%
  ungroup() %>%
  select(-id) %>% # the run id was only a grouping helper
  arrange(Customer, Start_Date) # order rows chronologically per customer
# # A tibble: 11 x 4
# Customer Product Start_Date End_Date
# <fct> <chr> <date> <date>
# 1 A01 Prod1 2015-01-01 2015-01-02
# 2 A01 Prod2 2015-01-03 2015-01-03
# 3 A01 Prod2/Prod3 2015-01-04 2015-01-05
# 4 A02 Prod1 2015-01-01 2015-01-02
# 5 A02 Prod2 2015-01-03 2015-01-03
# 6 A02 Prod2/Prod3 2015-01-04 2015-01-05
# 7 A02 Prod3 2015-01-06 2015-01-06
# 8 A03 Prod1 2015-01-01 2015-01-02
# 9 A03 Prod2 2015-01-03 2015-01-03
#10 A03 Prod2/Prod3 2015-01-04 2015-01-05
#11 A03 Prod2 2015-01-06 2015-01-06
请注意,此解决方案不允许输出表中的日期范围重叠。
例如,如果 Prod2/Prod3 占用了 4/1/2015 - 5/1/2015 这一时间段,那么 Prod2 对应的时间段不会是 5/1/2015 - 6/1/2015,而是 6/1/2015 - 6/1/2015,因为 5/1/2015 已包含在 Prod2/Prod3 的时间段中。
我正在尝试根据现有行的重叠时间段创建新行。例如,我想把这个:
# Sample input: one row per customer/product pair with the period during
# which that product was active (dates are kept as character strings).
Customer_Product <- data.table(
  Customer = rep(c("A01", "A02", "A03"), each = 3),
  Product = rep(c("Prod1", "Prod2", "Prod3"), times = 3),
  Start_Date = rep(c("1/1/2015", "3/1/2015", "4/1/2015"), times = 3),
  End_Date = c(
    "2/1/2015", "5/1/2015", "5/1/2015", # A01
    "2/1/2015", "5/1/2015", "6/1/2015", # A02
    "2/1/2015", "6/1/2015", "5/1/2015"  # A03
  )
)
Customer Product Start_Date End_Date
1: A01 Prod1 1/1/2015 2/1/2015
2: A01 Prod2 3/1/2015 5/1/2015
3: A01 Prod3 4/1/2015 5/1/2015
4: A02 Prod1 1/1/2015 2/1/2015
5: A02 Prod2 3/1/2015 5/1/2015
6: A02 Prod3 4/1/2015 6/1/2015
7: A03 Prod1 1/1/2015 2/1/2015
8: A03 Prod2 3/1/2015 6/1/2015
9: A03 Prod3 4/1/2015 5/1/2015
变成这样:
# Desired output: each customer's timeline split at every start/end date,
# with overlapping products joined by "/" for the span they share.
Customer_Product_Combo <- data.table(
  Customer = rep(c("A01", "A02", "A03"), times = c(3, 4, 4)),
  Product_or_Combination = c(
    "Prod1", "Prod2", "Prod2/Prod3",          # A01
    "Prod1", "Prod2", "Prod2/Prod3", "Prod3", # A02
    "Prod1", "Prod2", "Prod2/Prod3", "Prod2"  # A03
  ),
  Start_Date = c(
    "1/1/2015", "3/1/2015", "4/1/2015",             # A01
    "1/1/2015", "3/1/2015", "4/1/2015", "5/1/2015", # A02
    "1/1/2015", "3/1/2015", "4/1/2015", "5/1/2015"  # A03
  ),
  End_Date = c(
    "2/1/2015", "4/1/2015", "5/1/2015",             # A01
    "2/1/2015", "4/1/2015", "5/1/2015", "6/1/2015", # A02
    "2/1/2015", "4/1/2015", "5/1/2015", "6/1/2015"  # A03
  )
)
Customer Product_or_Combination Start_Date End_Date
1: A01 Prod1 1/1/2015 2/1/2015
2: A01 Prod2 3/1/2015 4/1/2015
3: A01 Prod2/Prod3 4/1/2015 5/1/2015
4: A02 Prod1 1/1/2015 2/1/2015
5: A02 Prod2 3/1/2015 4/1/2015
6: A02 Prod2/Prod3 4/1/2015 5/1/2015
7: A02 Prod3 5/1/2015 6/1/2015
8: A03 Prod1 1/1/2015 2/1/2015
9: A03 Prod2 3/1/2015 4/1/2015
10: A03 Prod2/Prod3 4/1/2015 5/1/2015
11: A03 Prod2 5/1/2015 6/1/2015
我一直在研究 IRanges,因为看起来 disjoin() 可能是一个可能的解决方案,但我看不到 inherit/merge "Prod" 数据的任何方法。
我也一直在尝试在 dplyr 中使用 lead/lag 勾勒出一些东西,然后是 gather/merge 循环,但同样值得注意的是,我可以有超过 2 "Prod"重叠,然后逻辑就变得混乱了。
有没有合理的方法来做到这一点?非常感谢任何帮助!
我使用的是您发布的数据(转换为 data.frame):
# Same sample data as in the question, built as a plain data.frame.
# Dates are still character strings at this point.
Customer_Product <- data.frame(
  Customer = rep(c("A01", "A02", "A03"), each = 3),
  Product = rep(c("Prod1", "Prod2", "Prod3"), times = 3),
  Start_Date = rep(c("1/1/2015", "3/1/2015", "4/1/2015"), times = 3),
  End_Date = c(
    "2/1/2015", "5/1/2015", "5/1/2015", # A01
    "2/1/2015", "5/1/2015", "6/1/2015", # A02
    "2/1/2015", "6/1/2015", "5/1/2015"  # A03
  )
)
这是一个可能的解决方案:
library(tidyverse)
library(data.table)
library(lubridate)
# Split each customer's product periods into consecutive, non-overlapping
# date ranges, labelling every range with the product(s) active during it.
Customer_Product %>%
  # Parse every *Date column from day/month/year strings into Date values.
  mutate_at(vars(matches("Date")), dmy) %>%
  # One list element per row: every calendar day from start to end, inclusive.
  mutate(day = map2(Start_Date, End_Date, ~ seq(.x, .y, "day"))) %>%
  # Expand to one row per day. The column is named explicitly because a bare
  # unnest() is deprecated in tidyr >= 1.0 and emits a warning.
  unnest(day) %>%
  # For each customer and day, join the products active on that day.
  group_by(Customer, day) %>%
  summarise(Product = paste0(Product, collapse = "/")) %>%
  # rleid() starts a new id whenever the combination changes between
  # consecutive days, so a combination that recurs later (e.g. Prod2 before
  # and after Prod2/Prod3) yields separate ranges instead of one merged range.
  group_by(Customer, Product, id = rleid(Product)) %>%
  # Collapse each run of days back into a single start/end range.
  summarise(
    Start_Date = min(day),
    End_Date = max(day)
  ) %>%
  ungroup() %>%
  select(-id) %>% # the run id was only a grouping helper
  arrange(Customer, Start_Date) # order rows chronologically per customer
# # A tibble: 11 x 4
# Customer Product Start_Date End_Date
# <fct> <chr> <date> <date>
# 1 A01 Prod1 2015-01-01 2015-01-02
# 2 A01 Prod2 2015-01-03 2015-01-03
# 3 A01 Prod2/Prod3 2015-01-04 2015-01-05
# 4 A02 Prod1 2015-01-01 2015-01-02
# 5 A02 Prod2 2015-01-03 2015-01-03
# 6 A02 Prod2/Prod3 2015-01-04 2015-01-05
# 7 A02 Prod3 2015-01-06 2015-01-06
# 8 A03 Prod1 2015-01-01 2015-01-02
# 9 A03 Prod2 2015-01-03 2015-01-03
#10 A03 Prod2/Prod3 2015-01-04 2015-01-05
#11 A03 Prod2 2015-01-06 2015-01-06
请注意,此解决方案不允许输出表中的日期范围重叠。
例如,如果 Prod2/Prod3 占用了 4/1/2015 - 5/1/2015 这一时间段,那么 Prod2 对应的时间段不会是 5/1/2015 - 6/1/2015,而是 6/1/2015 - 6/1/2015,因为 5/1/2015 包含在 Prod2/Prod3
.