基于R中的多个最近匹配合并数据帧

merging data frames based on multiple nearest matches in R

我有一个大的(例如减少和简化的)数据框 (df1) 和一个较小的数据框 (df2)。我的目标是 return 每个变量 a:d 的 df1 的单行与 df2 中每个 a:d 的值最匹配,这样我就有了 julian(索引)和df1.

中与 a-d 最接近的匹配项

下面示例代码的 julian 的结果值类似于 9,20,10,24。

我已经用 dplyr 尝试了各种方法,例如 inner_join 但还没有找到完成 approximate/nearest 匹配的方法。我确实在 data.table 中看到了使用 roll = "nearest" 解决类似问题的方法,但它似乎需要在单个列上设置一个键,我不太明白如何适应我的问题。

df1=structure(list(julian = 1:25, a = c(1.23651000172524, 3.53768533577433, 
4.67886484270661, 5.58620230847292, 6.50334496193416, 8.39024634778858, 
8.07046072833857, 10.3224760103505, 12.0294821656441, 13.0051126737986, 
13.6182967988946, 16.0093802756876, 16.3629833376647, 16.4032251800014, 
20.7793611866438, 22.8896881514255, 24.9047753769525, 27.693270690425, 
28.1354667109412, 28.7461982083997, 29.4226852088927, 30.0674285282579, 
30.5385160713231, 30.2693063298828, 31.2137034020167), b = c(0.0818659903121299, 
3.16872244667597, 5.18716715995415, 8.01357800633546, 9.03929283020061, 
9.75879226945875, 10.6840909493883, 11.993517678357, 13.5471390532808, 
14.5735221374674, 14.5629644871486, 14.3256165384972, 16.1055271965235, 
16.0846958235405, 16.2822070785324, 17.5707074083182, 18.0211458028877, 
18.2487504080849, 18.2748191242643, 20.2487076102867, 22.4499741093445, 
22.7261732975307, 23.3057791855779, 24.205539951076, 26.8265110564906
), c = c(1.53338066501425, 1.79175662364937, 3.87114113188336, 
7.25916181355392, 7.27167564643793, 9.45150932098826, 9.79130847937593, 
8.74357825723914, 7.65948191832803, 8.29615547570692, 10.0896110687571, 
10.622117084693, 12.8381085732359, 12.2479308117301, 14.1459672328209, 
15.1971928224078, 16.2405218836198, 17.6742310372874, 18.2592891390671, 
18.5239348003001, 17.8195001436016, 18.2938114977865, 19.3566862831869, 
20.2531494246964, 21.2021100407786), d = c(1.06048990933208, 
1.19478517029592, 2.01700007570706, 2.89122927429884, 2.84927053502588, 
1.99042435941095, 2.84730950732035, 5.24044843997076, 6.78411332171822, 
6.94262185334297, 8.46835997983591, 9.00847980399115, 10.0903648270813, 
12.7882885080826, 13.5944586952418, 13.8267656264867, 14.7989016465317, 
15.7181393355348, 17.3822790373397, 17.8017108178726, 19.8442415003192, 
21.1743994279903, 20.9854326674201, 22.0177140097714, 24.0917743668801
)), class = "data.frame", row.names = c(NA, -25L))

 df2=structure(list(a = 12.35, b = 19.63, c = 8.3, d = 22.5), class = "data.frame", row.names = c(NA, 
-1L))

下面使用Mapdf1中的每一列a:d与df2中对应的参考值进行比较,找到最小值的索引,最后,将其用作 df1

的“julian”列的索引
library(tidyverse)

indices <- Map(`-`, df1[-1], df2) %>% 
  lapply(function(x) which(abs(x) == min(abs(x)))) %>% 
  sapply(function(x) df1[['julian']][x])

 a  b  c  d 
 9 20 10 24 

如果你想过滤 df1:

df1 %>% 
  filter(julian %in% indices)

  julian        a        b         c         d
1      9 12.02948 13.54714  7.659482  6.784113
2     10 13.00511 14.57352  8.296155  6.942622
3     20 28.74620 20.24871 18.523935 17.801711
4     24 30.26931 24.20554 20.253149 22.017714

在不知道您希望如何格式化结果的情况下,您可以使用您提到的 data.tableroll="nearest" 的滚动连接来做到这一点。

在这种情况下,我已将两组数据melt编辑为长数据集,以便可以在单个连接中完成匹配。

library(data.table)
setDT(df1)
setDT(df2)

df1[
    match(
        melt(df1, id.vars="julian")[
            melt(df2, measure.vars=names(df2)),
            on=c("variable","value"), roll="nearest"]$julian,
        julian),
]
#   julian        a        b         c         d
#1:      9 12.02948 13.54714  7.659482  6.784113
#2:     20 28.74620 20.24871 18.523935 17.801711
#3:     10 13.00511 14.57352  8.296155  6.942622
#4:     24 30.26931 24.20554 20.253149 22.017714

如果您想为每个连接使用单独的表,您可以这样做:

lapply(names(df2), \(var)  df1[df2, on=var, roll="nearest", .SD, .SDcols=names(df1)] )