组合多个模糊连接

Combining Multiple Fuzzy Joins

我正在使用 R 语言,手头有以下两个表(在我的实际问题中,所有日期列都是以“因子”(factor)类型提供的):

# First input table: two identifier columns and one date column, all entered
# as plain strings.
table_1 <- data.frame(
  id1 = c("123 A", "123BB", "12 5", "12--5"),
  id2 = c("11", "12", "14", "13"),
  date_1 = c("2010-01-31", "2010-01-31", "2015-01-31", "2018-01-31")
)

# Store every column (including the date) as a factor, one pass over all
# columns; `[] <-` keeps the object a data.frame.
table_1[] <- lapply(table_1, as.factor)

# Second input table: two identifier columns plus the start (date_2) and end
# (date_3) of a date range, all entered as plain strings.
table_2 <- data.frame(
  id1 = c("0123", "1233", "125  .", "125_"),
  id2 = c("111", "112", "14", "113"),
  date_2 = c("2009-01-31", "2010-01-31", "2010-01-31", "2010-01-31"),
  date_3 = c("2011-01-31", "2010-01-31", "2020-01-31", "2020-01-31")
)

# Store every column (including both dates) as a factor, one pass over all
# columns; `[] <-` keeps the object a data.frame.
table_2[] <- lapply(table_2, as.factor)

我想在条件 1 或条件 2 成立时,对这两个表执行“内连接”(inner join):

Condition_1:table_1 与 table_2 的 id1 模糊匹配(字符串距离不超过 2),并且 date_1 介于 date_2 和 date_3 之间

Condition_2:table_1 与 table_2 的 id2 模糊匹配(字符串距离不超过 2)

现在,我知道如何分两部分进行操作:

library(dplyr)
library(fuzzyjoin)

# Condition 1: fuzzy match on id1 (string distance <= 2) with date_1 inside
# [date_2, date_3]. The date columns are factors, and relational operators on
# unordered factors return NA (with a warning), so filter() would drop every
# row. Parse them as Date inside the filter; the output columns stay factors.
part_1 <- stringdist_inner_join(table_1, table_2, by = "id1", max_dist = 2) %>%
  filter(
    as.Date(as.character(date_1)) >= as.Date(as.character(date_2)),
    as.Date(as.character(date_1)) <= as.Date(as.character(date_3))
  )

# Condition 2: fuzzy match on id2 (string distance <= 2), no date restriction.
part_2 <- stringdist_inner_join(table_1, table_2, by = "id2", max_dist = 2)

# Stack both parts; a pair that satisfies both conditions appears twice.
combine <- rbind(part_1, part_2)

# Keep the first occurrence of each row (all seven columns are compared).
final <- combine[!duplicated(combine), ]

我的问题

如果想用循环来完成这一操作,可以遍历其中变化的部分,即 `by` 参数:

library(purrr)
library(fuzzyjoin)
library(dplyr)
# Run the same fuzzy inner join once per key column, stack the per-key
# results, drop duplicate rows and sort by every column. No date filter is
# applied to any of the joins here.
final2 <- c("id1", "id2") %>%
  map(function(key) {
    stringdist_inner_join(table_1, table_2, by = key, max_dist = 2)
  }) %>%
  bind_rows() %>%
  distinct() %>%
  arrange(across(everything()))

检查结果:

# Sort `final` by every column before comparing it with `final2`.
> all.equal(final %>% 
         arrange(across(everything())), final2)
[1] TRUE