在 R 中按列在某个数值阈值内合并数据帧

Question

我需要按两列（participant 和 time）合并两个数据帧，其中 time 的匹配可以是接近但不完全相等（±0.001）。下面是两个示例数据帧，其中前三个时间是近似匹配，其余时间完全匹配。fuzzy_join 通常行不通，因为这些数据帧很大……

# Example data, left-hand table: 16 rows, 8 per participant.
# Every column, including `time`, is stored as a character string.
df1 <- data.frame(
  participant = rep(c("1", "2"), each = 8),
  item = letters[1:16],
  time = c(
    "43.565", "54.125", "65.923", "73.858",
    "111.123", "143.124", "255.500", "255.502",
    "300.595", "350.252", "400.600", "511.122",
    "525.887", "577.752", "599.129", "601.992"
  )
)

# Example data, right-hand table: same participants as df1. The first three
# `time` strings differ from df1's by 0.001; the rest are identical.
df2 <- data.frame(
  participant = rep(c("1", "2"), each = 8),
  value = c(
    "xyz", "hlm", "test", "nop", "test", "nop", "hlm", "test",
    "hlm", "test", "xyz", "xyz", "test", "xyz", "nop", "xyz"
  ),
  time = c(
    "43.566", "54.124", "65.922", "73.858",
    "111.123", "143.124", "255.500", "255.502",
    "300.595", "350.252", "400.600", "511.122",
    "525.887", "577.752", "599.129", "601.992"
  )
)

Answer 1

为什么不用 fuzzy_join？它看起来挺快的，我想不出有什么比这更快的办法了。在执行 fuzzy_join 之前，我必须先把字符型变量 time 转换为数值型。

# Re-parse the character columns (such as `time`) so they can be compared
# numerically. type_convert() is called directly rather than through `%>%`,
# because no package that provides the pipe has been attached at this point.
df1 <- readr::type_convert(df1)
df2 <- readr::type_convert(df2)

library(fuzzyjoin)

# Pair each row of df1 with every row of df2 whose `time` differs from it
# by less than 0.002. Only `time` is compared in this call.
fuzzy_join(
  df1, df2,
  by = "time",
  match_fun = function(x, y) abs(x - y) < 0.002
)

   participant.x item  time.x participant.y value  time.y
1              1    a  43.565             1   xyz  43.566
2              1    b  54.125             1   hlm  54.124
3              1    c  65.923             1  test  65.922
4              1    d  73.858             1   nop  73.858
5              1    e 111.123             1  test 111.123
6              1    f 143.124             1   nop 143.124
7              1    g 255.500             1   hlm 255.500
8              1    h 255.502             1  test 255.502
9              2    i 300.595             2   hlm 300.595
10             2    j 350.252             2  test 350.252
11             2    k 400.600             2   xyz 400.600
12             2    l 511.122             2   xyz 511.122
13             2    m 525.887             2  test 525.887
14             2    n 577.752             2   xyz 577.752
15             2    o 599.129             2   nop 599.129
16             2    p 601.992             2   xyz 601.992

编辑

OP 希望能够按多列进行匹配。如果要匹配多对列，可以给 by 传入一个列名向量，并给 match_fun 传入一个匹配函数列表（每对列对应一个函数），例如：

# One matcher per `by` column: `participant` must be equal, while `time`
# may differ by less than 0.002.
fuzzy_join(
  df1, df2,
  by = c("participant", "time"),
  match_fun = list(
    `==`,
    function(x, y) abs(x - y) < 0.002
  )
)

   participant.x item  time.x participant.y value  time.y
1              1    a  43.565             1   xyz  43.566
2              1    b  54.125             1   hlm  54.124
3              1    c  65.923             1  test  65.922
4              1    d  73.858             1   nop  73.858
5              1    e 111.123             1  test 111.123
6              1    f 143.124             1   nop 143.124
7              1    g 255.500             1   hlm 255.500
8              1    h 255.502             1  test 255.502
9              2    i 300.595             2   hlm 300.595
10             2    j 350.252             2  test 350.252
11             2    k 400.600             2   xyz 400.600
12             2    l 511.122             2   xyz 511.122
13             2    m 525.887             2  test 525.887
14             2    n 577.752             2   xyz 577.752
15             2    o 599.129             2   nop 599.129
16             2    p 601.992             2   xyz 601.992

Answer 2

这可能是一个不太优雅的解决方案，但希望能满足您的需求，并且足够容易理解，以便您根据需要进行调整。

# Compare times as numbers, not strings.
df1$time <- as.numeric(df1$time)
df2$time <- as.numeric(df2$time)

# For every row of df1, find the index of the first row of df2 that has the
# same participant and a time within 0.002; NA when there is no such row.
# which() drops NA comparisons, so a participant that is absent from df2 (or
# a missing time) yields "no match" instead of an error.
first_match <- vapply(
  seq_len(nrow(df1)),
  function(i) {
    hits <- which(
      df2$participant == df1$participant[i] &
        abs(df2$time - df1$time[i]) < 0.002
    )
    if (length(hits) > 0) hits[1] else NA_integer_
  },
  integer(1)
)

# Indexing with NA produces NA, so unmatched rows get a missing value and the
# column keeps the type of df2$value.
df1$value <- df2$value[first_match]

这里把 df1 作为“主”数据框，并从 df2 中添加相应的 value。一旦找到匹配的 participant/time，就会转到 df1 的下一行。

EDIT/UPDATE

结合使用我之前的建议和 GuedesBF 的其他建议，您可能会两全其美。而且由于您确实有很多额外的列，如果您确实有内存问题，我会尝试将其缩减为仅必要的列 - 您应该能够很容易地将它们合并回结果中。

library(fuzzyjoin)
library(tidyverse)

# Re-parse the character columns (such as `time`) so they compare numerically.
df1 <- readr::type_convert(df1)
df2 <- readr::type_convert(df2)

participants <- unique(df1$participant)

# Run the fuzzy join one participant at a time, so each join only compares
# the rows belonging to that participant. mode = "left" keeps every df1 row.
sub_results <- lapply(participants, function(p) {
  sub_df1 <- df1[df1$participant == p, ]
  sub_df2 <- df2[df2$participant == p, ]

  fuzzy_join(
    sub_df1, sub_df2,
    by = "time",
    match_fun = ~ abs(.x - .y) < 0.002,
    mode = "left"
  )
})

# Stack the per-participant pieces once at the end, instead of growing a data
# frame with rbind() inside a loop and stripping a dummy NA seed row afterwards.
final_result <- dplyr::bind_rows(sub_results)
final_result

在 R 中按列在某个数值阈值内合并数据帧

Merging dataframes by column within some numerical threshold R

merge

r