R - 更有效 left_join

R - more effective left_join

我有两个数据框——一个包含名称和限制范围(只有几百行,最多 1000 行),需要分配给一个“测量”数据框,它可以包含数百万行(或千万行)。

目前我正在做 left_join 和过滤值以获得分配给每个测量的特定限制。然而,这是非常低效的并且耗费了大量资源。对于更大的数据帧,代码甚至无法 运行.

任何关于更有效解决方案的想法都会有所帮助。

library(dplyr)

## this one has got only few houndreds rows
df_limits <- read.table(text="Title station_id  limit_from  limit_to
Level_3_Low 1   0   70
Level_2_Low 1   70  90
Level_1_Low 1   90  100
Optimal 1   100 110
Level_1_High    1   110 130
Level_2_High    1   130 150
Level_3_High    1   150 180
Level_3_Low 2   0   70
Level_2_Low 2   70  90
Level_1_Low 2   90  100
Optimal 2   100 110
Level_1_High    2   110 130
Level_2_High    2   130 150
Level_3_High    2   150 180
Level_3_Low 3   0   70
Level_2_Low 3   70  90
Level_1_Low 3   90  100
Optimal 3   100 110
Level_1_High    3   110 130
Level_2_High    3   130 150
Level_3_High    3   150 180
",header  = TRUE, stringsAsFactors = TRUE)


 # this DF has got millions of rows
df_measurements <- read.table(text="measurement_id  station_id  value
12121534    1   172
12121618    1   87
12121703    1   9
12121709    2   80
12121760    2   80
12121813    2   115
12121881    3   67
12121907    3   100
12121920    3   108
12121979    1   102
12121995    1   53
12122022    1   77
12122065    2   158
12122107    2   144
12122113    2   5
12122135    3   100
12122187    3   136
12122267    3   130
12122359    1   105
12122366    1   126
12122398    1   143
",header  = TRUE, stringsAsFactors = TRUE)


df_results <- left_join(df_measurements,df_limits, by = "station_id") %>% 
              filter ((value >= limit_from & value < limit_to) | is.na(Title)) %>%
              select(names(df_measurements), Title)

这是一种方法。有问题的部分是条件 value < limit_tofoverlaps 检查导致双重匹配的条件 value <= limit_to,因此我们在重叠连接之后调用过滤条件,然后 select 所需的列。请注意,结果与使用 dplyr 生成的 df_results 的顺序不同。

library(data.table)

dt_limits <- as.data.table(df_limits)
dt_measurements <- as.data.table(df_measurements)

setkey(dt_limits, station_id, limit_from, limit_to)

dt_results <- foverlaps(dt_measurements[, value2 := value],
                        dt_limits,
                        by.x = c("station_id", "value", "value2"),
                        type = "within",
                        )[value < limit_to,
                          .(measurement_id , station_id, value, Title)]
dt_results[]
#>     measurement_id station_id value        Title
#>  1:       12121534          1   172 Level_3_High
#>  2:       12121618          1    87  Level_2_Low
#>  3:       12121703          1     9  Level_3_Low
#>  4:       12121709          2    80  Level_2_Low
#>  5:       12121760          2    80  Level_2_Low
#>  6:       12121813          2   115 Level_1_High
#>  7:       12121881          3    67  Level_3_Low
#>  8:       12121907          3   100      Optimal
#>  9:       12121920          3   108      Optimal
#> 10:       12121979          1   102      Optimal
#> 11:       12121995          1    53  Level_3_Low
#> 12:       12122022          1    77  Level_2_Low
#> 13:       12122065          2   158 Level_3_High
#> 14:       12122107          2   144 Level_2_High
#> 15:       12122113          2     5  Level_3_Low
#> 16:       12122135          3   100      Optimal
#> 17:       12122187          3   136 Level_2_High
#> 18:       12122267          3   130 Level_2_High
#> 19:       12122359          1   105      Optimal
#> 20:       12122366          1   126 Level_1_High
#> 21:       12122398          1   143 Level_2_High
#>     measurement_id station_id value        Title

reprex package (v0.3.0)

于 2021-08-09 创建

另一个 data.table 解决方案使用 non-equijoins:

library(data.table)

setDT(df_measurements)
setDT(df_limits) 

df_limits[df_measurements, .(station_id, measurement_id, value, Title),
          on=.(station_id = station_id, limit_from < value, limit_to >= value)]

   station_id measurement_id value        Title
 1:          1       12121534   172 Level_3_High
 2:          1       12121618    87  Level_2_Low
 3:          1       12121703     9  Level_3_Low
 4:          2       12121709    80  Level_2_Low
 5:          2       12121760    80  Level_2_Low
 6:          2       12121813   115 Level_1_High
 7:          3       12121881    67  Level_3_Low
 8:          3       12121907   100  Level_1_Low
 9:          3       12121920   108      Optimal
10:          1       12121979   102      Optimal
11:          1       12121995    53  Level_3_Low
12:          1       12122022    77  Level_2_Low
13:          2       12122065   158 Level_3_High
14:          2       12122107   144 Level_2_High
15:          2       12122113     5  Level_3_Low
16:          3       12122135   100  Level_1_Low
17:          3       12122187   136 Level_2_High
18:          3       12122267   130 Level_1_High
19:          1       12122359   105      Optimal
20:          1       12122366   126 Level_1_High
21:          1       12122398   143 Level_2_High

使用 subset + merge

的简单基础 R(不需要额外的包)选项
subset(
  merge(
    df_measurements,
    df_limits,
    all = TRUE
  ),
  limit_from < value & limit_to >= value
)

给予

    station_id measurement_id value        Title limit_from limit_to
7            1       12121534   172 Level_3_High        150      180
9            1       12121618    87  Level_2_Low         70       90
15           1       12121703     9  Level_3_Low          0       70
23           1       12122022    77  Level_2_Low         70       90
34           1       12122398   143 Level_2_High        130      150
39           1       12121979   102      Optimal        100      110
43           1       12121995    53  Level_3_Low          0       70
54           1       12122366   126 Level_1_High        110      130
60           1       12122359   105      Optimal        100      110
65           2       12121760    80  Level_2_Low         70       90
75           2       12121813   115 Level_1_High        110      130
79           2       12121709    80  Level_2_Low         70       90
91           2       12122065   158 Level_3_High        150      180
97           2       12122107   144 Level_2_High        130      150
99           2       12122113     5  Level_3_Low          0       70
108          3       12121907   100  Level_1_Low         90      100
116          3       12121920   108      Optimal        100      110
124          3       12122267   130 Level_1_High        110      130
127          3       12121881    67  Level_3_Low          0       70
136          3       12122135   100  Level_1_Low         90      100
146          3       12122187   136 Level_2_High        130      150

另一种选择是使用 dplyr

  df_measurements %>%
    group_by(station_id) %>%
    mutate(Title = with(
      df_limits,
      Title[
        findInterval(
          value,
          unique(unlist(cbind(limit_from, limit_to)[station_id == first(.$station_id)])),
          left.open = TRUE
        )
      ]
    )) %>%
    ungroup()

这给出了

# A tibble: 21 x 4
   measurement_id station_id value Title       
            <int>      <int> <int> <fct>
 1       12121534          1   172 Level_3_High
 2       12121618          1    87 Level_2_Low
 3       12121703          1     9 Level_3_Low
 4       12121709          2    80 Level_2_Low
 5       12121760          2    80 Level_2_Low
 6       12121813          2   115 Level_1_High
 7       12121881          3    67 Level_3_Low
 8       12121907          3   100 Level_1_Low
 9       12121920          3   108 Optimal
10       12121979          1   102 Optimal
# ... with 11 more rows

基准测试


f_TIC1 <- function() {
  subset(
    merge(
      df_measurements,
      df_limits,
      all = TRUE
    ),
    limit_from < value & limit_to >= value
  )
}

f_TIC2 <- function() {
  df_measurements %>%
    group_by(station_id) %>%
    mutate(Title = with(
      df_limits,
      Title[
        findInterval(
          value,
          unique(unlist(cbind(limit_from, limit_to)[station_id == first(station_id)])),
          left.open = TRUE
        )
      ]
    )) %>%
    ungroup()
}


dt_limits <- as.data.table(df_limits)
dt_measurements <- as.data.table(df_measurements)
f_Waldi <- function() {
  dt_limits[
    dt_measurements,
    .(station_id, measurement_id, value, Title),
    on = .(station_id, limit_from < value, limit_to >= value)
  ]
}

f_TimTeaFan <- function() {
  setkey(dt_limits, station_id, limit_from, limit_to)
  foverlaps(dt_measurements[, value2 := value],
    dt_limits,
    by.x = c("station_id", "value", "value2"),
    type = "within",
  )[
    value < limit_to,
    .(measurement_id, station_id, value, Title)
  ]
}

你会看到

Unit: relative
          expr      min       lq     mean   median       uq      max neval
      f_TIC1() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000   100
      f_TIC2() 4.848639 4.909985 4.895588 4.942616 5.124704 2.580819   100
     f_Waldi() 3.182027 3.010615 3.069916 3.114160 3.397845 1.698386   100
 f_TimTeaFan() 5.523778 5.112872 5.226145 5.112407 5.745671 2.446987   100