在查找 table 中查找具有不精确匹配和矢量化代码的行索引

Finding row index in lookup table with inexact matches and vectorized code in r

我实际上是在尝试连接两组数据。但是,数据具有不完全匹配(例如,主数据中的日期在查找的开始日期和结束日期之间 table)。我知道 fuzzy_join 有效,但由于内存问题 space 我无法使用它,这迫使我使用 for 循环。数据集继续增长(> 3,000,000 行)并且 for 循环现在需要几天时间才能 运行,所以我试图找到一种更快的方法来索引查找中的行 table.

以下是我目前正在做的事情的一个非常简化的版本。抱歉,数据有点不合理(这是我可以很快想出的假数据集)。

如果有一种方法可以在不使用循环的情况下使用多条件 which 语句,我将非常满意——即使它只输出与该行匹配的查找 table 的行号对主要数据集感兴趣。

df<-cars# main data

#Creating matching data for a lookup table
lookup<-cars[-which(duplicated(cars)),]
lookup$location<-state.abb[-1]

#linking the main data to the lookup table analogous to a left_join
for (i in 1:nrow(df)) {
  row <- which(lookup[,"speed"] == df[i,"speed"] &
                 lookup[,"dist"]==df[i,"dist"])
  if (length(row)==1){df[i,"location"]<-lookup[row,"location"]}
  else if (length(row)!=1){df[i,"location"]<-"Error"}
}

以下是一个示例,其中数据更符合其他用户请求的问题。

Lookup<-structure(list(Deploy = structure(c(18575, 18575, 18575, 18575, 
18575, 18464, 18573, 18573, 18573), class = "Date"), Retrieved = structure(c(18978, 
18978, 18978, 18978, 18978, 18978, 18978, 18978, 18978), class = "Date"), 
    Station = c("484451", "484449", "484450", "484452", "484453", 
    "483231", "484457", "484456", "484455"), Name = c("A", "B", 
    "C", "D", "E", "F", "G", "H", "I")), row.names = c(7L, 8L, 
9L, 10L, 11L, 16L, 27L, 33L, 34L), class = "data.frame")

df<-structure(list(DateTime = c("2020-11-07 14:48:17 CST", "2021-04-24 12:31:50 CDT", 
"2021-04-28 00:03:56 CDT", "2021-04-30 21:53:25 CDT", "2021-05-08 08:22:37 CDT", 
"2021-05-14 13:01:15 CDT", "2021-05-20 02:29:43 CDT", "2021-05-24 13:54:15 CDT", 
"2021-05-30 04:11:33 CDT", "2021-06-03 03:38:52 CDT", "2021-06-06 01:40:47 CDT", 
"2021-06-09 04:05:07 CDT", "2021-06-13 22:55:06 CDT", "2021-06-18 15:19:01 CDT", 
"2021-06-30 01:12:33 CDT", "2021-08-13 19:04:24 CDT", "2021-09-07 19:41:25 CDT", 
"2021-11-19 19:36:32 CST", "2021-12-02 19:37:50 CST", "2021-03-31 07:50:39 CDT", 
"2021-04-16 02:25:56 CDT", "2021-04-26 23:12:17 CDT", "2021-05-03 15:00:23 CDT", 
"2021-05-12 05:50:45 CDT", "2021-05-18 15:00:19 CDT", "2021-05-28 22:52:07 CDT", 
"2021-06-05 15:29:18 CDT", "2021-06-12 11:18:47 CDT", "2021-06-19 04:10:22 CDT", 
"2021-06-30 10:11:13 CDT", "2021-08-14 08:21:32 CDT", "2021-06-28 15:54:43 CDT", 
"2021-07-04 12:44:51 CDT", "2021-02-14 22:28:41 CST", "2021-03-14 15:14:28 CDT", 
"2021-04-16 05:10:42 CDT", "2021-04-20 00:52:10 CDT", "2021-04-23 06:11:06 CDT", 
"2021-04-26 16:45:59 CDT", "2021-04-30 03:43:53 CDT", "2021-05-03 11:11:58 CDT", 
"2021-05-07 15:27:46 CDT", "2021-05-11 05:51:15 CDT", "2021-05-14 06:06:39 CDT", 
"2021-05-17 07:08:14 CDT", "2021-05-20 08:26:21 CDT", "2021-05-23 16:48:55 CDT", 
"2021-05-27 23:53:04 CDT", "2021-05-31 17:05:24 CDT", "2021-06-03 21:19:24 CDT", 
"2021-06-07 00:20:27 CDT", "2021-06-10 16:06:36 CDT", "2021-06-13 20:26:05 CDT", 
"2021-06-17 04:28:53 CDT", "2021-06-20 15:26:44 CDT", "2021-06-24 05:24:39 CDT", 
"2021-06-28 13:57:09 CDT", "2021-07-02 12:13:23 CDT", "2021-07-05 23:31:31 CDT", 
"2021-07-09 06:34:07 CDT", "2021-07-13 17:30:26 CDT", "2021-07-17 19:38:44 CDT", 
"2021-07-21 17:24:29 CDT", "2021-07-26 19:54:42 CDT", "2021-07-31 15:53:39 CDT", 
"2021-08-05 05:57:23 CDT", "2021-08-09 14:34:04 CDT", "2021-08-13 20:34:30 CDT", 
"2021-08-17 21:23:28 CDT", "2021-08-21 04:37:26 CDT", "2021-08-24 03:23:49 CDT", 
"2021-08-28 11:22:34 CDT", "2021-08-31 23:20:26 CDT", "2021-09-08 03:54:07 CDT", 
"2021-09-11 23:29:38 CDT", "2021-09-16 00:53:41 CDT", "2021-09-25 02:26:09 CDT", 
"2021-09-29 22:28:42 CDT", "2021-10-05 03:27:17 CDT", "2021-10-08 23:44:34 CDT", 
"2021-10-12 11:56:38 CDT", "2021-10-20 02:28:34 CDT", "2021-11-05 19:02:55 CDT", 
"2021-03-09 05:58:50 CST", "2021-04-13 16:46:42 CDT", "2021-04-18 14:34:43 CDT", 
"2021-04-25 08:38:19 CDT", "2021-05-06 00:53:20 CDT", "2021-05-12 16:45:31 CDT", 
"2021-05-21 23:56:06 CDT", "2021-06-05 20:38:45 CDT", "2021-06-16 14:57:06 CDT", 
"2021-07-05 14:54:37 CDT", "2021-07-25 23:25:06 CDT", "2021-08-15 11:22:27 CDT", 
"2021-08-31 21:25:10 CDT", "2021-10-03 18:53:57 CDT", "2021-11-08 16:03:34 CST", 
"2021-02-16 01:15:13 CST", "2021-03-06 00:28:03 CST"), Station = c("483231", 
"484449", "484449", "484449", "484449", "484449", "484449", "484449", 
"484449", "484449", "484449", "484449", "484449", "484449", "484449", 
"484449", "484449", "484449", "484449", "484450", "484450", "484450", 
"484450", "484450", "484450", "484450", "484450", "484450", "484450", 
"484450", "484450", "484451", "484453", "484455", "484455", "484455", 
"484455", "484455", "484455", "484455", "484455", "484455", "484455", 
"484455", "484455", "484455", "484455", "484455", "484455", "484455", 
"484455", "484455", "484455", "484455", "484455", "484455", "484455", 
"484455", "484455", "484455", "484455", "484455", "484455", "484455", 
"484455", "484455", "484455", "484455", "484455", "484455", "484455", 
"484455", "484455", "484455", "484455", "484455", "484455", "484455", 
"484455", "484455", "484455", "484455", "484455", "484456", "484456", 
"484456", "484456", "484456", "484456", "484456", "484456", "484456", 
"484456", "484456", "484456", "484456", "484456", "484456", "484457", 
"484457"), ID = c("26382", "52448", "52448", "52448", "59597", 
"52481", "59685", "59597", "59625", "52448", "59607", "59597", 
"52476", "52452", "52443", "52490", "52452", "59607", "52484", 
"52478", "52459", "52476", "52452", "52456", "52448", "52476", 
"52490", "52461", "52452", "59597", "52488", "52481", "52465", 
"52430", "52434", "52553", "52545", "52433", "52497", "52439", 
"52535", "59617", "59620", "59686", "52553", "54225", "54225", 
"50313", "50313", "52549", "52549", "59620", "28726", "52436", 
"56724", "59683", "52436", "50315", "59683", "59684", "52555", 
"52431", "28723", "59683", "52431", "52431", "52497", "52559", 
"52497", "52541", "52523", "59684", "52436", "59683", "59684", 
"59684", "52431", "59617", "52436", "52436", "52555", "52436", 
"59684", "52497", "59617", "52522", "52497", "52549", "52535", 
"59686", "52496", "52436", "59617", "52497", "59682", "52436", 
"59683", "59685", "52430", "52438")), class = "data.frame", row.names = c(15682L, 
113503L, 123508L, 133522L, 143529L, 153531L, 163546L, 173553L, 
183563L, 193570L, 203578L, 213582L, 223586L, 233588L, 243592L, 
253608L, 263609L, 279539L, 291018L, 308142L, 320585L, 332244L, 
343277L, 354611L, 365588L, 377103L, 388219L, 399208L, 410181L, 
421791L, 438325L, 490101L, 570616L, 581633L, 595778L, 610533L, 
621113L, 631630L, 642195L, 652737L, 663238L, 674401L, 685098L, 
695576L, 706043L, 716540L, 727054L, 737734L, 748313L, 758811L, 
769297L, 779851L, 790361L, 800892L, 811420L, 821980L, 832634L, 
843246L, 853770L, 864265L, 874930L, 885540L, 896129L, 906876L, 
917594L, 928287L, 938949L, 949584L, 960180L, 970680L, 981116L, 
991790L, 1002945L, 1014113L, 1025331L, 1036973L, 1049346L, 1060826L, 
1072594L, 1084158L, 1095904L, 1108937L, 1123610L, 1142055L, 1157723L, 
1168503L, 1179545L, 1191392L, 1202620L, 1214257L, 1226959L, 1238952L, 
1252276L, 1265805L, 1279392L, 1292293L, 1308823L, 1328560L, 1342239L, 
1354986L))

for (i in 1:nrow(df)){
  Row<-which(Lookup[,"Station"]==df[i,"Station"] & 
               Lookup[,"Deploy"]<=df[i,"DateTime"] &
               Lookup[,"Retrieved"]>=df[i,"DateTime"])
  if (length(Row)==1){df[i,"Location"]<-Lookup[Row,"Name"]}
  else if (length(Row)!=1){df[i,"Location"]<-"Error"}
}

一种方法是使用 inequality join with the sqldf package. See Join two datasets based on an inequality condition。重要的是inner join涉及的四行。

如果您真的要将日期时间(在 df 中)与日期(在查找中)进行比较,事情会变得简单,因为您可以 cast/keep 一切都作为日期。

此代码假定:

  1. 所有时区都是相同的。
  2. 站号必须匹配。
  3. DateTime 值必须受 DeployRetrieved 限制。
df <-
  df |> 
  dplyr::mutate(
    Date = as.Date(DateTime),
  )

sql <- 
  "
    SELECT
      df.ID
      ,df.Station
      ,df.Date
      ,Lookup.Deploy
      ,Lookup.Retrieved
      ,Lookup.Name
    FROM df
      inner join Lookup on 
        df.Station = Lookup.Station
        and
        df.Date between Lookup.Deploy and Lookup.Retrieved
  "

sqldf::sqldf(sql) |> 
  tibble::as_tibble()

输出:

# A tibble: 100 x 6
   ID    Station Date       Deploy     Retrieved  Name 
   <chr> <chr>   <date>     <date>     <date>     <chr>
 1 26382 483231  2020-11-07 2020-07-21 2021-12-17 F    
 2 52448 484449  2021-04-24 2020-11-09 2021-12-17 B    
 3 52448 484449  2021-04-28 2020-11-09 2021-12-17 B    
 4 52448 484449  2021-04-30 2020-11-09 2021-12-17 B    
 5 59597 484449  2021-05-08 2020-11-09 2021-12-17 B    
 6 52481 484449  2021-05-14 2020-11-09 2021-12-17 B    
 7 59685 484449  2021-05-20 2020-11-09 2021-12-17 B    
 8 59597 484449  2021-05-24 2020-11-09 2021-12-17 B    
 9 59625 484449  2021-05-30 2020-11-09 2021-12-17 B    
10 52448 484449  2021-06-03 2020-11-09 2021-12-17 B    
# ... with 90 more rows

替代版本: 如果出于某种原因您需要将日期与日期时间进行比较(例如,时区不相同),则需要做一些额外的工作来对齐数据类型。

df <-
  df |> 
  dplyr::mutate(
    DateTime = as.POSIXct(DateTime),
  )

Lookup <-
  Lookup |>
  dplyr::mutate(
    DeployDT    = as.POSIXct(paste0(Deploy    , "00:00:00")),
    RetrievedDT = as.POSIXct(paste0(Retrieved , "00:00:00")),
  )

sql <- 
  "
    SELECT
      df.ID
      ,df.Station
      ,df.DateTime
      ,Lookup.DeployDT
      ,Lookup.RetrievedDT
      ,Lookup.Name
    FROM df
      inner join Lookup on 
        df.Station = Lookup.Station
        and
        df.DateTime between Lookup.DeployDT and Lookup.RetrievedDT
        
        -- equivalent to
        -- (Lookup.DeployDT <= df.DateTime) and (df.DateTime <= Lookup.RetrievedDT)
  "

sqldf::sqldf(sql) |> 
  tibble::as_tibble() |> 
  dplyr::mutate(
    Deploy    = as.Date(DeployDT),
    Retrieved = as.Date(RetrievedDT),
  ) |> 
  dplyr::select(
    -DeployDT,
    -RetrievedDT,
  )

这是 data.table 包的完美任务,它以速度快和内存效率着称。考虑以下实现。

library(data.table)

setDT(Lookup)
setDT(df)
tz <- function(x) as.POSIXct(x, "America/Chicago")
cols <- c("Deploy", "Retrieved")

df[, DateTime := tz(DateTime)]
Lookup[, (cols) := lapply(.SD, \(x) tz(format(x))), .SDcols = cols]

df[Lookup, Location := Name, on = .(
  Station, DateTime <= Retrieved, DateTime >= Deploy
)][, Location := fifelse(is.na(Location), "Error", Location)]

请注意,此处几乎所有内容都是通过引用完成的,因此不会创建额外的副本。只要您可以将 df 完全加载到内存中,这应该就可以工作。

输出

                DateTime Station    ID Location
  1: 2020-11-07 14:48:17  483231 26382        F
  2: 2021-04-24 12:31:50  484449 52448        B
  3: 2021-04-28 00:03:56  484449 52448        B
  4: 2021-04-30 21:53:25  484449 52448        B
  5: 2021-05-08 08:22:37  484449 59597        B
  6: 2021-05-14 13:01:15  484449 52481        B
  7: 2021-05-20 02:29:43  484449 59685        B
  8: 2021-05-24 13:54:15  484449 59597        B
  9: 2021-05-30 04:11:33  484449 59625        B
 10: 2021-06-03 03:38:52  484449 52448        B
 11: 2021-06-06 01:40:47  484449 59607        B
 12: 2021-06-09 04:05:07  484449 59597        B
 13: 2021-06-13 22:55:06  484449 52476        B
 14: 2021-06-18 15:19:01  484449 52452        B
 15: 2021-06-30 01:12:33  484449 52443        B
 16: 2021-08-13 19:04:24  484449 52490        B
 17: 2021-09-07 19:41:25  484449 52452        B
 18: 2021-11-19 19:36:32  484449 59607        B
 19: 2021-12-02 19:37:50  484449 52484        B
 20: 2021-03-31 07:50:39  484450 52478        C
 21: 2021-04-16 02:25:56  484450 52459        C
 22: 2021-04-26 23:12:17  484450 52476        C
 23: 2021-05-03 15:00:23  484450 52452        C
 24: 2021-05-12 05:50:45  484450 52456        C
 25: 2021-05-18 15:00:19  484450 52448        C
 26: 2021-05-28 22:52:07  484450 52476        C
 27: 2021-06-05 15:29:18  484450 52490        C
 28: 2021-06-12 11:18:47  484450 52461        C
 29: 2021-06-19 04:10:22  484450 52452        C
 30: 2021-06-30 10:11:13  484450 59597        C
 31: 2021-08-14 08:21:32  484450 52488        C
 32: 2021-06-28 15:54:43  484451 52481        A
 33: 2021-07-04 12:44:51  484453 52465        E
 34: 2021-02-14 22:28:41  484455 52430        I
 35: 2021-03-14 15:14:28  484455 52434        I
 36: 2021-04-16 05:10:42  484455 52553        I
 37: 2021-04-20 00:52:10  484455 52545        I
 38: 2021-04-23 06:11:06  484455 52433        I
 39: 2021-04-26 16:45:59  484455 52497        I
 40: 2021-04-30 03:43:53  484455 52439        I
 41: 2021-05-03 11:11:58  484455 52535        I
 42: 2021-05-07 15:27:46  484455 59617        I
 43: 2021-05-11 05:51:15  484455 59620        I
 44: 2021-05-14 06:06:39  484455 59686        I
 45: 2021-05-17 07:08:14  484455 52553        I
 46: 2021-05-20 08:26:21  484455 54225        I
 47: 2021-05-23 16:48:55  484455 54225        I
 48: 2021-05-27 23:53:04  484455 50313        I
 49: 2021-05-31 17:05:24  484455 50313        I
 50: 2021-06-03 21:19:24  484455 52549        I
 51: 2021-06-07 00:20:27  484455 52549        I
 52: 2021-06-10 16:06:36  484455 59620        I
 53: 2021-06-13 20:26:05  484455 28726        I
 54: 2021-06-17 04:28:53  484455 52436        I
 55: 2021-06-20 15:26:44  484455 56724        I
 56: 2021-06-24 05:24:39  484455 59683        I
 57: 2021-06-28 13:57:09  484455 52436        I
 58: 2021-07-02 12:13:23  484455 50315        I
 59: 2021-07-05 23:31:31  484455 59683        I
 60: 2021-07-09 06:34:07  484455 59684        I
 61: 2021-07-13 17:30:26  484455 52555        I
 62: 2021-07-17 19:38:44  484455 52431        I
 63: 2021-07-21 17:24:29  484455 28723        I
 64: 2021-07-26 19:54:42  484455 59683        I
 65: 2021-07-31 15:53:39  484455 52431        I
 66: 2021-08-05 05:57:23  484455 52431        I
 67: 2021-08-09 14:34:04  484455 52497        I
 68: 2021-08-13 20:34:30  484455 52559        I
 69: 2021-08-17 21:23:28  484455 52497        I
 70: 2021-08-21 04:37:26  484455 52541        I
 71: 2021-08-24 03:23:49  484455 52523        I
 72: 2021-08-28 11:22:34  484455 59684        I
 73: 2021-08-31 23:20:26  484455 52436        I
 74: 2021-09-08 03:54:07  484455 59683        I
 75: 2021-09-11 23:29:38  484455 59684        I
 76: 2021-09-16 00:53:41  484455 59684        I
 77: 2021-09-25 02:26:09  484455 52431        I
 78: 2021-09-29 22:28:42  484455 59617        I
 79: 2021-10-05 03:27:17  484455 52436        I
 80: 2021-10-08 23:44:34  484455 52436        I
 81: 2021-10-12 11:56:38  484455 52555        I
 82: 2021-10-20 02:28:34  484455 52436        I
 83: 2021-11-05 19:02:55  484455 59684        I
 84: 2021-03-09 05:58:50  484456 52497        H
 85: 2021-04-13 16:46:42  484456 59617        H
 86: 2021-04-18 14:34:43  484456 52522        H
 87: 2021-04-25 08:38:19  484456 52497        H
 88: 2021-05-06 00:53:20  484456 52549        H
 89: 2021-05-12 16:45:31  484456 52535        H
 90: 2021-05-21 23:56:06  484456 59686        H
 91: 2021-06-05 20:38:45  484456 52496        H
 92: 2021-06-16 14:57:06  484456 52436        H
 93: 2021-07-05 14:54:37  484456 59617        H
 94: 2021-07-25 23:25:06  484456 52497        H
 95: 2021-08-15 11:22:27  484456 59682        H
 96: 2021-08-31 21:25:10  484456 52436        H
 97: 2021-10-03 18:53:57  484456 59683        H
 98: 2021-11-08 16:03:34  484456 59685        H
 99: 2021-02-16 01:15:13  484457 52430        G
100: 2021-03-06 00:28:03  484457 52438        G
                DateTime Station    ID Location