R - 根据另一个数据框查找每组的重叠日期

Question

我有一个数据框，其中包含来自多个雨量计的降雨量测量值，如下例所示：

> rnfl
     ID       date value
1   250 2000-03-01  5.37
2   250 2000-03-02  0.00
3   250 2000-03-03  2.94
4   250 2000-03-04  0.00
5   250 2000-03-05  0.00
6   250 2000-03-06  0.00
7   250 2000-03-07  2.76
8   250 2000-03-08  3.06
9   250 2000-03-09 31.05
10  250 2000-03-10  9.48
11  250 2000-03-11  0.00
12  250 2000-03-12  0.00
13  250 2000-03-13  0.00
14  732 2011-05-01  2.40
15  732 2011-05-02 15.60
16  732 2011-05-03  8.80
17  732 2011-05-04 47.00
18  732 2011-05-05 45.40
19  732 2011-05-06  5.85
20  732 2011-05-07  0.00
21  732 2011-05-08  0.00
22  732 2011-05-09  0.80
23  732 2011-05-10  0.00
24 1439 2006-08-01  0.00
25 1439 2006-08-02  0.00
26 1439 2006-08-03  0.00
27 1439 2006-08-04  0.00
28 1439 2006-08-05  0.00
29 1439 2006-08-06  0.00
30 1439 2006-08-07  0.00
31 1439 2006-08-08  0.00
32 1440 2000-03-06  0.00
33 1440 2000-03-07  4.57
34 1440 2000-03-08  3.06
35 1440 2000-03-09  9.02
36 1440 2000-03-10  4.23
37 1534 2000-04-01 14.94
38 1534 2000-04-02 43.65
39 1534 2000-04-03  0.00
40 1534 2000-04-04  0.00
41 1534 2000-04-05  0.00

我还有一个数据框，其中包含每个仪表的 ID 以及最近的几个仪表的 ID 及其距离：

> near
    ID ID_nearest distance
1  250       1440  1102.65
2  250        732  3881.40
3  250       1534 15479.97
4  250       1439 19231.39
5  253        499   909.27
6  253         89  2219.03
7  253        815  2452.21
8  254         64 11254.43
9  255        237 11607.83
10 256        416  4503.37
11 256        921 10132.95
12 256       1210 11449.56

例如，仪表 ID 250 有四个近邻：ID 的 1440、732、1534 和 1439。对于 near 中的每个组合，我需要找到主要和周围仪表之间的重叠日期。换句话说，我需要查找仪表 1440、732、1534 和 1439 是否有任何与 ID 250.

重叠的日期

预期的输出是这样的：

   ID ID_nearest common_date_begin  common_date_end diff_days
1 250       1440        2000-03-06       2000-03-10         4
2 250        732              <NA>             <NA>        NA
3 250       1534              <NA>             <NA>        NA
4 250       1439              <NA>             <NA>        NA

near 中的每个 ID 依此类推。

如何实现？非常感谢。

重现此问题所需的数据：

rnfl <- structure(list(ID = c(250L, 250L, 250L, 250L, 250L, 250L, 250L, 
250L, 250L, 250L, 250L, 250L, 250L, 732L, 732L, 732L, 732L, 732L, 
732L, 732L, 732L, 732L, 732L, 1439L, 1439L, 1439L, 1439L, 1439L, 
1439L, 1439L, 1439L, 1440L, 1440L, 1440L, 1440L, 1440L, 1534L, 
1534L, 1534L, 1534L, 1534L), date = structure(c(11017, 11018, 
11019, 11020, 11021, 11022, 11023, 11024, 11025, 11026, 11027, 
11028, 11029, 15095, 15096, 15097, 15098, 15099, 15100, 15101, 
15102, 15103, 15104, 13361, 13362, 13363, 13364, 13365, 13366, 
13367, 13368, 11022, 11023, 11024, 11025, 11026, 11048, 11049, 
11050, 11051, 11052), class = "Date"), value = c(5.37, 0, 2.94, 
0, 0, 0, 2.76, 3.06, 31.05, 9.48, 0, 0, 0, 2.4, 15.6, 8.8, 47, 
45.4, 5.85, 0, 0, 0.8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4.57, 3.06, 
9.02, 4.23, 14.94, 43.65, 0, 0, 0)), row.names = c(NA, -41L), class = "data.frame")

near <- structure(list(ID = c("250", "250", "250", "250", "253", "253", 
"253", "254", "255", "256", "256", "256"), ID_nearest = c("1440", 
"732", "1534", "1439", "499", "89", "815", "64", "237", "416", 
"921", "1210"), distance = c(1102.65, 3881.4, 15479.97, 19231.39, 
909.27, 2219.03, 2452.21, 11254.43, 11607.83, 4503.37, 10132.95, 
11449.56)), row.names = c(NA, -12L), class = "data.frame")

Answer 1

也许不是 cleanest/efficient 但这是在 base R 中执行此操作的一种方法。

我们找到 ID 和 ID_nearest 的每个组合的共同日期，如果有任何共同日期，我们创建一个数据框，其中包含日期的最小值、最大值以及数字的差异在他们的日子里。

out <- near[c('ID', 'ID_nearest')]

cbind(out,do.call(rbind, c(Map(function(x, y) {
   common_dates <- intersect(rnfl$date[rnfl$ID == x], rnfl$date[rnfl$ID == y])
   if(length(common_dates) > 0) {
    class(common_dates) <- "Date"
      data.frame(common_date_begin = min(common_dates), 
                 common_date_end = max(common_dates), 
                 diff_days = as.integer(max(common_dates) - min(common_dates)))
   }  else c(common_date_begin = NA, common_date_end = NA, diff_days = NA)
},out$ID, out$ID_nearest), make.row.names = FALSE)))


#    ID ID_nearest common_date_begin common_date_end diff_days
#1  250       1440        2000-03-06      2000-03-10         4
#2  250        732              <NA>            <NA>        NA
#3  250       1534              <NA>            <NA>        NA
#4  250       1439              <NA>            <NA>        NA
#....
#....

Answer 2

使用data.table的选项：

library(data.table)
setDT(near)[, c("ID", "ID_nearest") := lapply(.SD, as.integer), .SDcols=c("ID", "ID_nearest")]
setDT(rnfl)

m <- rnfl[rnfl, on=.(date), {
    k <- x.ID!=i.ID
    unique(data.table(
            ID=i.ID[k], 
            ID_nearest=x.ID[k], 
            common_date_begin=min(date[k]),
            common_date_end=max(date[k])
        ))
}]

m[near, on=.(ID, ID_nearest)][, 
    diff_days := common_date_end - common_date_begin][]

输出：

     ID ID_nearest common_date_begin common_date_end distance diff_days
 1: 250       1440        2000-03-06      2000-03-10  1102.65    4 days
 2: 250        732              <NA>            <NA>  3881.40   NA days
 3: 250       1534              <NA>            <NA> 15479.97   NA days
 4: 250       1439              <NA>            <NA> 19231.39   NA days
 5: 253        499              <NA>            <NA>   909.27   NA days
 6: 253         89              <NA>            <NA>  2219.03   NA days
 7: 253        815              <NA>            <NA>  2452.21   NA days
 8: 254         64              <NA>            <NA> 11254.43   NA days
 9: 255        237              <NA>            <NA> 11607.83   NA days
10: 256        416              <NA>            <NA>  4503.37   NA days
11: 256        921              <NA>            <NA> 10132.95   NA days
12: 256       1210              <NA>            <NA> 11449.56   NA days

对于较大的数据集，在执行重叠连接之前将每个 ID 的每个连续周期的 rnfl 折叠成范围行是有意义的，然后在 near 中查找这些重叠：

#summarize into consecutive periods
summ <- rnfl[, .(startdate=date[1L], enddate=date[.N]),
    .(ID, g=cumsum(c(0L, diff(date)!=1L)))]

#perform overlapping join
setkey(summ, startdate, enddate)
olap <- unique(foverlaps(summ, summ)[ID!=i.ID, .(
    ID1=pmin(ID, i.ID),
    ID2=pmax(ID, i.ID),
    common_date_begin=pmax(startdate, i.startdate),
    common_date_end=pmin(enddate, i.enddate))])

#sorry I forgot to sort the IDs in the original post. have fixed here    
near[, c("ID1", "ID2") := .(pmin(ID, ID_nearest), pmax(ID, ID_nearest))]

#lookup join for overlapping dates and calc dates diff
cols <- c("common_date_begin", "common_date_end")
near[olap, on=.(ID1, ID2), (cols) := mget(paste0("i.", cols))][,
    diff_days := common_date_end - common_date_begin][]

输出：

        ID ID_nearest       dist ID1  ID2 common_date_begin common_date_end diff_days
   1:    1       1117  3022.2234   1 1117        2000-03-01      2006-12-03      2468
   2:    1        386 16107.7359   1  386        2006-01-01      2006-12-03       336
   3:    1        920 17327.0028   1  920        2000-03-01      2004-11-04      1709
   4: 1000        688   401.5005 688 1000        2019-12-25      2019-12-31         6
   5: 1000         48  5576.3986  48 1000        2000-03-01      2006-12-03      2468
  ---                                                                                
2649:  992        318 12462.7490 318  992        2006-01-01      2017-06-16      4184
2650:  996        448     0.0000 448  996        2019-12-25      2019-12-31         6
2651:  997       1085   498.8696 997 1085        2000-03-01      2017-01-22      6171
2652:  997        390 17627.1155 390  997        2003-08-08      2017-01-22      4916
2653:  999        467  5392.2740 467  999        2007-11-14      2019-04-09      4164

我的电脑上的总时间约为 5 秒，包括读取大文件和格式化日期列。处理代码大约需要1.5s。

数据：

#https://www.dropbox.com/s/aadf4w6538lw22q/****_SO.zip?dl=0
near <- fread("near.csv")
rnfl <- fread("rnfl.csv")
lu <- rnfl[, .(date={cd <- unique(date)}, DATE=as.IDate(cd))]
rnfl[lu, on=.(date), date := DATE][, date := as.IDate(as.integer(date))]

R - 根据另一个数据框查找每组的重叠日期

R - find overlapping dates per group based on another data frame

datetime

r

date-range