如何根据 R 中的日期条件将数据帧拆分为新的数据帧

How to split a dataframe into new dataframes based on date condition in R

如果前一个日期与当前日期相差超过 5 天,我需要将一个数据框拆分成多个 new/different 个数据框。

例如: 如果日期列表如下:

2016-11-21
2016-11-22
2016-11-22
2016-11-24
2016-11-30
2016-12-01
2016-12-02
2016-12-10
2016-12-12
2016-12-20
...

我想为相差超过 5 天的每个日期创建一个新数据框(同时将所有信息保留在其他列中 - 为简单起见,此处未显示):

df1:
2016-11-21
2016-11-22
2016-11-22
2016-11-24

df2:
2016-11-30
2016-12-01
2016-12-02

df3:
2016-12-10
2016-12-12

df4:
2016-12-20
...

到目前为止我已经能够使用:

#split dataframe
split(d, as_date(d$date) <= lag(as_date(d$date) + days(5))) -> test
#set list as datatable
dt_list <- map(test, as.data.table)
#bind lists into new dataframe
dt <- rbindlist(dt_list, fill = TRUE, idcol = T)

这根据这些条件给了我一个 true 或 false 的数据框-(不是我想要的):

     id     date                lc      lon   lat Timestamp  difftime
   <chr>  <dttm>              <fct> <dbl> <dbl> <date>     <lgl>                                                     
 1 158696 2016-12-11 05:26:38 A      135. -3.14 2016-12-11 FALSE                                                     
 2 158696 2016-12-26 08:13:44 B      135. -3.28 2016-12-26 FALSE                                                     
 3 158696 2017-01-09 06:35:37 A      136. -3.14 2017-01-09 FALSE                                                     
 4 158696 2017-02-02 08:30:50 0      135. -3.22 2017-02-02 FALSE                                                     
 5 158696 2017-02-18 18:28:08 B      135. -3.27 2017-02-18 FALSE  
 6 158696 2016-11-21 05:46:52 B      135. -3.22 2016-11-21 TRUE                                                      
 7 158696 2016-11-21 05:46:52 B      135. -3.22 2016-11-21 TRUE                                                      
 8 158696 2016-11-21 08:04:15 B      135. -3.22 2016-11-21 TRUE                                                      
 9 158696 2016-11-21 08:21:10 B      135. -3.22 2016-11-21 TRUE                                                      
 10 158696 2016-11-21 20:34:23 B      135. -3.22 2016-11-21 TRUE 

我很乐意使用与上面完全不同的代码,但如果可能的话我更愿意使用 Dplyr,因为我最熟悉它。

这是我的数据框的输入:

structure(list(id = c("158696", "158696", "158696", "158696", 
"158696", "158696", "158696", "158696", "158696", "158696", "158696", 
"158696", "158696", "158696", "158696", "158696", "158696", "158696", 
"158696", "158696", "158696", "158696", "158696", "158696", "158696", 
"158696", "158696", "158696", "158696", "158696", "158696", "158696", 
"158696", "158696", "158696", "158696", "158696", "158696", "158696", 
"158696", "158696", "158696", "158696", "158696", "158696", "158696", 
"158696", "158696", "158696", "158696", "158696"), date = structure(c(1481672621, 
1482740024, 1482888520, 1483943737, 1483954584, 1483957732, 1484351217, 
1484372486, 1484635601, 1484639873, 1484649374, 1484654932, 1484910955, 
1484914028, 1484992088, 1485077809, 1485083628, 1485116630, 1485155703, 
1485241506, 1485245737, 1485302728, 1485394165, 1485407109, 1486024250, 
1486026218, 1486153828, 1486184167, 1486243717, 1486244406, 1486244406, 
1486244406, 1487442488, 1487462032, 1487488051, 1487495298, 1487507472, 
1487553204, 1487573823, 1487756502, 1487756774, 1487758235, 1487763098, 
1487795742, 1487812234, 1487929687, 1487932240, 1487979233, 1488246245, 
1488315015, 1488793040), class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
    lc = structure(c(6L, 6L, 5L, 5L, 5L, 6L, 6L, 5L, 5L, 6L, 
    6L, 6L, 2L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 5L, 6L, 6L, 6L, 1L, 
    6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 5L, 
    6L, 6L, 6L, 6L, 6L, 2L, 6L, 6L, 6L, 6L, 6L), .Label = c("0", 
    "1", "2", "3", "A", "B"), class = "factor"), lon = c(134.9559, 
    134.9635, 135.0038, 135.5555, 135.5994, 135.6039, 135.5398, 
    135.4953, 135.5485, 135.5502, 135.5447, 135.5461, 135.0267, 
    135.0148, 135.0258, 135.0221, 135.0013, 135.0037, 135.0306, 
    134.9676, 134.9523, 134.8742, 134.8938, 134.8815, 135.0038, 
    135.0047, 134.9429, 134.9381, 134.9218, 134.929, 134.9092, 
    134.9218, 135.045, 135.0479, 135.0794, 135.0884, 135.0796, 
    135.087, 135.092, 135.1609, 135.1625, 135.1626, 135.1649, 
    135.1601, 135.1714, 135.0644, 135.064, 135.0759, 135.0841, 
    135.0849, 134.9314), lat = c(-3.148, -3.275, -3.2625, -3.1398, 
    -3.1349, -3.1339, -2.5095, -2.511, -2.6931, -2.6923, -2.7032, 
    -2.7059, -3.2218, -3.2327, -3.2284, -3.2234, -3.2688, -3.2888, 
    -3.2257, -3.2096, -3.208, -3.2129, -3.179, -3.1945, -3.2158, 
    -3.2157, -3.1747, -3.1701, -3.1604, -3.1735, -3.1735, -3.1735, 
    -3.2697, -3.2623, -3.3155, -3.3182, -3.3046, -3.3131, -3.3125, 
    -3.2767, -3.2765, -3.2766, -3.2769, -3.2808, -3.283, -3.2069, 
    -3.2073, -3.1983, -3.312, -3.2977, -3.1022)), row.names = 47:97, class = "data.frame")

其中有几个日期相差超过 5 天。我是 R 的新手,非常感谢您的帮助!

我们可以在这里使用 lead() 为每个日期岛创建一个伪组。然后,拆分该岛值。

d$gap <- ifelse(as.numeric(difftime(d$date, lead(d$date), units = "days")) > 5, 1, 0)
d$grp <- cumsum(df$gap)
dt_list <- split(d, d$grp)
library(dplyr)
df_grp = df %>% mutate(grp = cumsum(c(1, diff(date) > 5*86400)))
split(df_grp, df_grp$grp)