在某个数值阈值 R 内按列合并数据帧
Merging dataframes by column within some numerical threshold R
我需要按两列(participant 和 time)合并两个数据帧,其中 time 匹配可能接近但不精确 (plus/minus 0.001)。下面是两个示例数据帧,其中前三个时间接近匹配,其他时间完全匹配。 Fuzzy_join 通常不起作用,因为这些是大数据帧...
# Example inputs: 16 rows each, 8 per participant. `time` is deliberately kept
# as character strings, exactly as in the raw data; it is converted to numeric
# later, before any join.
participant_ids <- rep(c("1", "2"), each = 8)

# Left-hand table: one `item` label per (participant, time) observation.
df1 <- data.frame(
  participant = participant_ids,
  item = letters[1:16],
  time = c(
    "43.565", "54.125", "65.923", "73.858",
    "111.123", "143.124", "255.500", "255.502",
    "300.595", "350.252", "400.600", "511.122",
    "525.887", "577.752", "599.129", "601.992"
  )
)

# Right-hand table: one `value` per (participant, time) observation. The first
# three times differ from df1 by 0.001; the remaining ones are identical.
df2 <- data.frame(
  participant = participant_ids,
  value = c(
    "xyz", "hlm", "test", "nop", "test", "nop", "hlm", "test",
    "hlm", "test", "xyz", "xyz", "test", "xyz", "nop", "xyz"
  ),
  time = c(
    "43.566", "54.124", "65.922", "73.858",
    "111.123", "143.124", "255.500", "255.502",
    "300.595", "350.252", "400.600", "511.122",
    "525.887", "577.752", "599.129", "601.992"
  )
)
为什么不用 fuzzy_join?它看起来挺快的,我想不出有什么比这更快的方法了。
在执行 fuzzy_join 之前,必须先将字符型变量 time 转换为数值型。
# Attach fuzzyjoin first so the snippet runs top to bottom.
library(fuzzyjoin)

# The sample columns are stored as character strings; type_convert() re-parses
# them so that `time` becomes numeric and can be subtracted. It is called
# directly so the snippet does not rely on a pipe operator having been
# attached beforehand.
df1 <- readr::type_convert(df1)
df2 <- readr::type_convert(df2)

# Inner join (fuzzy_join's default mode, spelled out here): keep every pair of
# rows whose `time` values differ by less than 0.002, which admits the 0.001
# offsets present in the sample data. Only `time` is compared in this call;
# `participant` is not part of the match.
fuzzy_join(
  df1, df2,
  by = "time",
  match_fun = function(x, y) abs(x - y) < 0.002,
  mode = "inner"
)
participant.x item time.x participant.y value time.y
1 1 a 43.565 1 xyz 43.566
2 1 b 54.125 1 hlm 54.124
3 1 c 65.923 1 test 65.922
4 1 d 73.858 1 nop 73.858
5 1 e 111.123 1 test 111.123
6 1 f 143.124 1 nop 143.124
7 1 g 255.500 1 hlm 255.500
8 1 h 255.502 1 test 255.502
9 2 i 300.595 2 hlm 300.595
10 2 j 350.252 2 test 350.252
11 2 k 400.600 2 xyz 400.600
12 2 l 511.122 2 xyz 511.122
13 2 m 525.887 2 test 525.887
14 2 n 577.752 2 xyz 577.752
15 2 o 599.129 2 nop 599.129
16 2 p 601.992 2 xyz 601.992
编辑
OP 要求匹配多列的函数。
如果需要同时匹配多对列,可以向 by 传入一个列名向量,并向 match_fun 传入一个与之一一对应的匹配函数列表,例如:
# Join on two column pairs at once: `by` names the columns and `match_fun`
# supplies one predicate per column, in the same order.
fuzzy_join(
  df1,
  df2,
  by = c("participant", "time"),
  match_fun = list(
    function(lhs, rhs) lhs == rhs,             # participant: exact match
    function(lhs, rhs) abs(lhs - rhs) < 0.002  # time: within the tolerance
  ),
  mode = "inner"
)
participant.x item time.x participant.y value time.y
1 1 a 43.565 1 xyz 43.566
2 1 b 54.125 1 hlm 54.124
3 1 c 65.923 1 test 65.922
4 1 d 73.858 1 nop 73.858
5 1 e 111.123 1 test 111.123
6 1 f 143.124 1 nop 143.124
7 1 g 255.500 1 hlm 255.500
8 1 h 255.502 1 test 255.502
9 2 i 300.595 2 hlm 300.595
10 2 j 350.252 2 test 350.252
11 2 k 400.600 2 xyz 400.600
12 2 l 511.122 2 xyz 511.122
13 2 m 525.887 2 test 525.887
14 2 n 577.752 2 xyz 577.752
15 2 o 599.129 2 nop 599.129
16 2 p 601.992 2 xyz 601.992
这可能是一个不太优雅的解决方案,但希望能满足您的需求,并且足够容易理解,以便您根据需要进行调整。
# Left-join style lookup: for every row of df1, copy the `value` of the first
# df2 row that has the same participant and a `time` within the tolerance.
# Rows of df1 without a match keep NA.
df1$time <- as.numeric(df1$time)
df2$time <- as.numeric(df2$time)

# Strict upper bound on the absolute time difference that counts as a match.
time_tolerance <- 0.002

df1$value <- NA
# seq_len() yields an empty sequence for a zero-row df1, whereas 1:nrow(df1)
# would iterate over c(1, 0).
for (i in seq_len(nrow(df1))) {
  # which() drops NA comparisons, so a missing time or participant simply
  # produces no match instead of stopping the loop with an error.
  hits <- which(
    df2$participant == df1$participant[i] &
      abs(df2$time - df1$time[i]) < time_tolerance
  )
  if (length(hits) > 0) {
    # Keep only the first candidate, in df2 row order.
    df1$value[i] <- df2$value[hits[1]]
  }
}
这里以 df1 作为“主”数据框,并从 df2 中添加相应的 value。一旦找到匹配的 participant/time,它就会移动到 df1 的下一行。
EDIT/UPDATE
结合使用我之前的建议和 GuedesBF 的其他建议,您可能会两全其美。而且由于您确实有很多额外的列,如果您确实有内存问题,我会尝试将其缩减为仅必要的列 - 您应该能够很容易地将它们合并回结果中。
library(fuzzyjoin)
library(tidyverse)

# Make sure `time` (and any other character-encoded column) is parsed to its
# natural type before the numeric comparison below.
df1 <- readr::type_convert(df1)
df2 <- readr::type_convert(df2)

# Run the fuzzy join one participant at a time, so each call only compares the
# rows belonging to that participant instead of the full cross product.
participants <- unique(df1$participant)
per_participant <- lapply(participants, function(id) {
  fuzzy_join(
    df1[df1$participant == id, ],
    df2[df2$participant == id, ],
    by = "time",
    match_fun = function(x, y) abs(x - y) < 0.002,
    mode = "left"  # keep df1 rows that have no partner in df2
  )
})

# Stack the pieces once at the end. This replaces the hard-coded NA placeholder
# row, which only worked for inputs with exactly these columns, and avoids
# growing a data frame with rbind() inside a loop.
final_result <- dplyr::bind_rows(per_participant)
final_result
我需要按两列(participant 和 time)合并两个数据帧,其中 time 匹配可能接近但不精确 (plus/minus 0.001)。下面是两个示例数据帧,其中前三个时间接近匹配,其他时间完全匹配。 Fuzzy_join 通常不起作用,因为这些是大数据帧...
# Example inputs: 16 rows each, 8 per participant. `time` is deliberately kept
# as character strings, exactly as in the raw data; it is converted to numeric
# later, before any join.
participant_ids <- rep(c("1", "2"), each = 8)

# Left-hand table: one `item` label per (participant, time) observation.
df1 <- data.frame(
  participant = participant_ids,
  item = letters[1:16],
  time = c(
    "43.565", "54.125", "65.923", "73.858",
    "111.123", "143.124", "255.500", "255.502",
    "300.595", "350.252", "400.600", "511.122",
    "525.887", "577.752", "599.129", "601.992"
  )
)

# Right-hand table: one `value` per (participant, time) observation. The first
# three times differ from df1 by 0.001; the remaining ones are identical.
df2 <- data.frame(
  participant = participant_ids,
  value = c(
    "xyz", "hlm", "test", "nop", "test", "nop", "hlm", "test",
    "hlm", "test", "xyz", "xyz", "test", "xyz", "nop", "xyz"
  ),
  time = c(
    "43.566", "54.124", "65.922", "73.858",
    "111.123", "143.124", "255.500", "255.502",
    "300.595", "350.252", "400.600", "511.122",
    "525.887", "577.752", "599.129", "601.992"
  )
)
为什么不用 fuzzy_join?它看起来挺快的,我想不出有什么比这更快的方法了。
在执行 fuzzy_join 之前,必须先将字符型变量 time 转换为数值型。
# Attach fuzzyjoin first so the snippet runs top to bottom.
library(fuzzyjoin)

# The sample columns are stored as character strings; type_convert() re-parses
# them so that `time` becomes numeric and can be subtracted. It is called
# directly so the snippet does not rely on a pipe operator having been
# attached beforehand.
df1 <- readr::type_convert(df1)
df2 <- readr::type_convert(df2)

# Inner join (fuzzy_join's default mode, spelled out here): keep every pair of
# rows whose `time` values differ by less than 0.002, which admits the 0.001
# offsets present in the sample data. Only `time` is compared in this call;
# `participant` is not part of the match.
fuzzy_join(
  df1, df2,
  by = "time",
  match_fun = function(x, y) abs(x - y) < 0.002,
  mode = "inner"
)
participant.x item time.x participant.y value time.y
1 1 a 43.565 1 xyz 43.566
2 1 b 54.125 1 hlm 54.124
3 1 c 65.923 1 test 65.922
4 1 d 73.858 1 nop 73.858
5 1 e 111.123 1 test 111.123
6 1 f 143.124 1 nop 143.124
7 1 g 255.500 1 hlm 255.500
8 1 h 255.502 1 test 255.502
9 2 i 300.595 2 hlm 300.595
10 2 j 350.252 2 test 350.252
11 2 k 400.600 2 xyz 400.600
12 2 l 511.122 2 xyz 511.122
13 2 m 525.887 2 test 525.887
14 2 n 577.752 2 xyz 577.752
15 2 o 599.129 2 nop 599.129
16 2 p 601.992 2 xyz 601.992
编辑
OP 要求的是能够匹配多列的函数。如果需要同时匹配多对列,可以向 by 传入一个列名向量,并向 match_fun 传入一个与之一一对应的匹配函数列表,例如:
# Join on two column pairs at once: `by` names the columns and `match_fun`
# supplies one predicate per column, in the same order.
fuzzy_join(
  df1,
  df2,
  by = c("participant", "time"),
  match_fun = list(
    function(lhs, rhs) lhs == rhs,             # participant: exact match
    function(lhs, rhs) abs(lhs - rhs) < 0.002  # time: within the tolerance
  ),
  mode = "inner"
)
participant.x item time.x participant.y value time.y
1 1 a 43.565 1 xyz 43.566
2 1 b 54.125 1 hlm 54.124
3 1 c 65.923 1 test 65.922
4 1 d 73.858 1 nop 73.858
5 1 e 111.123 1 test 111.123
6 1 f 143.124 1 nop 143.124
7 1 g 255.500 1 hlm 255.500
8 1 h 255.502 1 test 255.502
9 2 i 300.595 2 hlm 300.595
10 2 j 350.252 2 test 350.252
11 2 k 400.600 2 xyz 400.600
12 2 l 511.122 2 xyz 511.122
13 2 m 525.887 2 test 525.887
14 2 n 577.752 2 xyz 577.752
15 2 o 599.129 2 nop 599.129
16 2 p 601.992 2 xyz 601.992
这可能是一个不太优雅的解决方案,但希望能满足您的需求,并且足够容易理解,以便您根据需要进行调整。
# Left-join style lookup: for every row of df1, copy the `value` of the first
# df2 row that has the same participant and a `time` within the tolerance.
# Rows of df1 without a match keep NA.
df1$time <- as.numeric(df1$time)
df2$time <- as.numeric(df2$time)

# Strict upper bound on the absolute time difference that counts as a match.
time_tolerance <- 0.002

df1$value <- NA
# seq_len() yields an empty sequence for a zero-row df1, whereas 1:nrow(df1)
# would iterate over c(1, 0).
for (i in seq_len(nrow(df1))) {
  # which() drops NA comparisons, so a missing time or participant simply
  # produces no match instead of stopping the loop with an error.
  hits <- which(
    df2$participant == df1$participant[i] &
      abs(df2$time - df1$time[i]) < time_tolerance
  )
  if (length(hits) > 0) {
    # Keep only the first candidate, in df2 row order.
    df1$value[i] <- df2$value[hits[1]]
  }
}
这里以 df1 作为“主”数据框,并从 df2 中添加相应的 value。一旦找到匹配的 participant/time,它就会移动到 df1 的下一行。
EDIT/UPDATE
结合使用我之前的建议和 GuedesBF 的其他建议,您可能会两全其美。而且由于您确实有很多额外的列,如果您确实有内存问题,我会尝试将其缩减为仅必要的列 - 您应该能够很容易地将它们合并回结果中。
library(fuzzyjoin)
library(tidyverse)

# Make sure `time` (and any other character-encoded column) is parsed to its
# natural type before the numeric comparison below.
df1 <- readr::type_convert(df1)
df2 <- readr::type_convert(df2)

# Run the fuzzy join one participant at a time, so each call only compares the
# rows belonging to that participant instead of the full cross product.
participants <- unique(df1$participant)
per_participant <- lapply(participants, function(id) {
  fuzzy_join(
    df1[df1$participant == id, ],
    df2[df2$participant == id, ],
    by = "time",
    match_fun = function(x, y) abs(x - y) < 0.002,
    mode = "left"  # keep df1 rows that have no partner in df2
  )
})

# Stack the pieces once at the end. This replaces the hard-coded NA placeholder
# row, which only worked for inputs with exactly these columns, and avoids
# growing a data frame with rbind() inside a loop.
final_result <- dplyr::bind_rows(per_participant)
final_result