按R中最近的日期合并两个数据框

Merge two dataframes by nearest date in R

我有两个数据框,我想在最近的日期前合并它们。 df2 通常有相同的日期适合 df1。还有 df2 的值根本不适合 df1。合并后我想知道合并日期之间的时差。例如,运送 Sally 适合两个 Peaks,运送 Carl 不适合任何人。 我尝试了不同的解决方案,例如使用 data.table roll='nearest' 函数

df1 <- data.frame(date = (c("23.11.2021 20:56:06", "23.11.2021 20:56:07","23.11.2021 20:56:08",
                                      "23.11.2021 20:56:09","23.11.2021 20:56:10",
                                      "23.11.2021 20:56:11", "23.11.2021 22:13:56",
                                      "23.11.2021 22:13:57", "23.11.2021 22:13:58",
                                      "23.11.2021 22:13:59", "24.11.2021 03:23:21",
                                      "24.11.2021 03:23:22", "24.11.2021 03:23:23",
                                      "24.11.2021 03:23:24", "24.11.2021 03:23:25",
                                      "24.11.2021 03:24:34", "24.11.2021 03:24:35", 
                                      "24.11.2021 03:24:36", "24.11.2021 03:24:37")),
                  value = (c(500, 900, 1000, 200, 300, 10, 1000, 450, 950, 600,
                             100, 750, 150, 200, 300, 400, 900, 1020, 800)))

df2 <- data.frame(date = (c("23.11.2021 20:55:47", "23.11.2021 21:17:48",
                            "23.11.2021 21:44:19", "23.11.2021 21:55:57",
                            "23.11.2021 22:16:01","23.11.2021 22:26:35",
                            "23.11.2021 22:28:43", "23.11.2021 23:14:52", 
                            "23.11.2021 23:32:30","23.11.2021 23:33:43",
                            "24.11.2021 03:11:30", "24.11.2021 03:23:37",
                            "24.11.2021 04:42:41", "24.11.2021 04:53:09",
                            "24.11.2021 04:58:19")),
                  value = (c("Ship Emma", "Ship Carl", "Ship Rudi", "Ship Tim", "Ship Amy",
                             "Ship Fred", "Ship Noemi"
                             , "Ship Fanny", "Ship Ole", "Ship Ally",
                             "Ship Hugo", "Ship Sally", "Ship Peter", "Ship Harry", "Ship Piet")))

result <- data.frame(date = (c("23.11.2021 20:56:06", "23.11.2021 20:56:07",
                               "23.11.2021 20:56:08", "23.11.2021 20:56:09",
                               "23.11.2021 20:56:10","23.11.2021 20:56:11",
                               "23.11.2021 22:13:56", "23.11.2021 22:13:57",
                               "23.11.2021 22:13:58", "23.11.2021 22:13:59",
                               "24.11.2021 03:23:21", "24.11.2021 03:23:22",
                               "24.11.2021 03:23:23", "24.11.2021 03:23:24",
                               "24.11.2021 03:23:25", "24.11.2021 03:24:34",
                               "24.11.2021 03:24:35", "24.11.2021 03:24:36",
                               "24.11.2021 03:24:37")),
                     value = (c(500, 900, 1000, 200, 300, 10, 1000, 450, 950, 600,
                             100, 750, 150, 200, 300, 400, 900, 1020, 800)), 
                     id = (c("Ship Emma", "Ship Emma", "Ship Emma", "Ship Emma", "Ship Emma",
                             "Ship Emma", "Ship Amy", "Ship Amy", "Ship Amy", "Ship Amy",
                                "Ship Sally", "Ship Sally", "Ship Sally", "Ship Sally",
                             "Ship Sally", "Ship Sally", "Ship Sally", "Ship Sally","Ship Sally")), 
                     difference = (c("00:00:19", "00:00:20", "00:00:21", "00:00:22", "00:00:23",
                                     "00:00:24", "00:00:05", "00:00:04", "00:00:03", "00:00:02",
                                     "00:00:16", "00:00:15", "00:00:14", "00:00:13", "00:00:12",
                                     "00:00:57", "00:00:58", "00:00:59", "00:01:00")))

df1$date<- as.POSIXct(df1$date, tz = "GMT", format = "%d.%m.%Y %H:%M:%S")
df2$date<- as.POSIXct(df2$date, tz = "GMT", format = "%d.%m.%Y %H:%M:%S")
result$date<- as.POSIXct(result$date, tz = "GMT", format = "%d.%m.%Y %H:%M:%S")

我建议两种方法。第一个使用距离矩阵并执行 df1 到 df2 的 left_join。即距离矩阵由下式给出:

dateDist <- outer(pull(df1, date), pull(df2, date), "-") %>%
    abs()

接下来,对于df1的每一行,距离最近的df2行由下式给出:

  closest.df1 <- apply(dateDist, 1, which.min)

最后手动合并:

cbind(rename_with(df1, ~paste0("df1.", "", .x)),
      rename_with(df2[closest.df1,], ~paste0("df2.", "", .x)))

##>+                 df1.date df1.value            df2.date  df2.value
##>1    2021-11-23 20:56:06       500 2021-11-23 20:55:47  Ship Emma
##>1.1  2021-11-23 20:56:07       900 2021-11-23 20:55:47  Ship Emma
##>1.2  2021-11-23 20:56:08      1000 2021-11-23 20:55:47  Ship Emma
##>1.3  2021-11-23 20:56:09       200 2021-11-23 20:55:47  Ship Emma
##>1.4  2021-11-23 20:56:10       300 2021-11-23 20:55:47  Ship Emma
##>1.5  2021-11-23 20:56:11        10 2021-11-23 20:55:47  Ship Emma
##>5    2021-11-23 22:13:56      1000 2021-11-23 22:16:01   Ship Amy
##>5.1  2021-11-23 22:13:57       450 2021-11-23 22:16:01   Ship Amy
##>5.2  2021-11-23 22:13:58       950 2021-11-23 22:16:01   Ship Amy
##>5.3  2021-11-23 22:13:59       600 2021-11-23 22:16:01   Ship Amy
##>12   2021-11-24 03:23:21       100 2021-11-24 03:23:37 Ship Sally
##>12.1 2021-11-24 03:23:22       750 2021-11-24 03:23:37 Ship Sally
##>12.2 2021-11-24 03:23:23       150 2021-11-24 03:23:37 Ship Sally
##>12.3 2021-11-24 03:23:24       200 2021-11-24 03:23:37 Ship Sally
##>12.4 2021-11-24 03:23:25       300 2021-11-24 03:23:37 Ship Sally
##>12.5 2021-11-24 03:24:34       400 2021-11-24 03:23:37 Ship Sally
##>12.6 2021-11-24 03:24:35       900 2021-11-24 03:23:37 Ship Sally
##>12.7 2021-11-24 03:24:36      1020 2021-11-24 03:23:37 Ship Sally
##>12.8 2021-11-24 03:24:37       800 2021-11-24 03:23:37 Ship Sally

第二种方法涉及首先计算 df1df2 的所有行的笛卡尔积,然后仅选择距离最小的行。这里的技巧是使用 inner_join(..., by =character()) 来获取两个数据帧的所有组合:

mutate(df1, id = row_number()) %>%
    inner_join(mutate(df2, id = row_number()),by = character()) |>
    mutate(dist = abs(date.x - date.y)) |>
    group_by(id.x) |>
    filter(dist == min(dist)) |>
    select(-id.x, -id.y, -dist)

  ##>+ # A tibble: 19 × 7
  ##># Groups:   id.x [19]
  ##>   date.x              value.x  id.x date.y              value.y     id.y dist  
  ##>   <dttm>                <dbl> <int> <dttm>              <chr>      <int> <drtn>
  ##> 1 2021-11-23 20:56:06     500     1 2021-11-23 20:55:47 Ship Emma      1  19 s…
  ##> 2 2021-11-23 20:56:07     900     2 2021-11-23 20:55:47 Ship Emma      1  20 s…
  ##> 3 2021-11-23 20:56:08    1000     3 2021-11-23 20:55:47 Ship Emma      1  21 s…
  ##> 4 2021-11-23 20:56:09     200     4 2021-11-23 20:55:47 Ship Emma      1  22 s…
  ##> 5 2021-11-23 20:56:10     300     5 2021-11-23 20:55:47 Ship Emma      1  23 s…
  ##> 6 2021-11-23 20:56:11      10     6 2021-11-23 20:55:47 Ship Emma      1  24 s…
  ##> 7 2021-11-23 22:13:56    1000     7 2021-11-23 22:16:01 Ship Amy       5 125 s…
  ##> 8 2021-11-23 22:13:57     450     8 2021-11-23 22:16:01 Ship Amy       5 124 s…
  ##> 9 2021-11-23 22:13:58     950     9 2021-11-23 22:16:01 Ship Amy       5 123 s…
  ##>10 2021-11-23 22:13:59     600    10 2021-11-23 22:16:01 Ship Amy       5 122 s…
  ##>11 2021-11-24 03:23:21     100    11 2021-11-24 03:23:37 Ship Sally    12  16 s…
  ##>12 2021-11-24 03:23:22     750    12 2021-11-24 03:23:37 Ship Sally    12  15 s…
  ##>13 2021-11-24 03:23:23     150    13 2021-11-24 03:23:37 Ship Sally    12  14 s…
  ##>14 2021-11-24 03:23:24     200    14 2021-11-24 03:23:37 Ship Sally    12  13 s…
  ##>15 2021-11-24 03:23:25     300    15 2021-11-24 03:23:37 Ship Sally    12  12 s…
  ##>16 2021-11-24 03:24:34     400    16 2021-11-24 03:23:37 Ship Sally    12  57 s…
  ##>17 2021-11-24 03:24:35     900    17 2021-11-24 03:23:37 Ship Sally    12  58 s…
  ##>18 2021-11-24 03:24:36    1020    18 2021-11-24 03:23:37 Ship Sally    12  59 s…
  ##>19 2021-11-24 03:24:37     800    19 2021-11-24 03:23:37 Ship Sally    12  60 s…