R:"Fuzzy Match" 和 "Between" 语句
R: "Fuzzy Match" and "Between" Statements
我正在使用 R 编程语言。我有以下两个表(注意:所有变量都是“因子”(factor)类型):
# Example data: two small tables whose columns are all stored as factors.
# stringsAsFactors = TRUE converts every character column to a factor
# at construction time.
table_1 <- data.frame(
  id     = c("123", "123", "125", "C125-B"),
  date_1 = c("2010-01-31", "2010-01-31", "2016-01-31", "2018-01-31"),
  stringsAsFactors = TRUE
)

table_2 <- data.frame(
  id     = c("5123", "123 A", "125", "125"),
  date_2 = c("2009-01-31", "2010-01-31", "2010-01-31", "2010-01-31"),
  date_3 = c("2011-01-31", "2010-01-31", "2020-01-31", "2020-01-31"),
  stringsAsFactors = TRUE
)
> table_1
id date_1
1 123 2010-01-31
2 123 2010-01-31
3 125 2016-01-31
4 C125-B 2018-01-31
> table_2
id date_2 date_3
1 5123 2009-01-31 2011-01-31
2 123 A 2010-01-31 2010-01-31
3 125 2010-01-31 2020-01-31
4 125 2010-01-31 2020-01-31
我想在同时满足以下两个条件时,对这两个表进行“连接”(join,例如内连接 inner join):
1) if table_1$id "fuzzy equal" table_2$id
和
2) if table_1$date_1 BETWEEN(table_2$date_2,table_2$date_3)
我尝试在 R 中编写以下代码来执行此操作:
library(fuzzyjoin)
# Inner-join the two tables on approximately equal `id` strings.
# distance_col = NULL: no string-distance column is added to the result.
stringdist_inner_join(
  x = table_1,
  y = table_2,
  by = "id",
  distance_col = NULL
)
问题:但我不确定 stringdist_inner_join 函数能否支持这种“介于……之间”(between)的逻辑。
谁能告诉我怎么做?在 R 中是否有任何其他方法可以实现此目的?
谢谢!
这个怎么样?如果把日期列存储为 Date 类型,我们可以先执行 stringdist_inner_join,然后再按日期范围进行过滤。对大多数数据而言,这样的性能应该足够;如果不够,可以考虑改用 data.table 而不是 fuzzyjoin。
library(fuzzyjoin)
library(dplyr)
# Convert the date columns to Date class so that >= / <= compare them as dates.
table_1[["date_1"]] <- as.Date(table_1[["date_1"]])
table_2[c("date_2", "date_3")] <- lapply(table_2[c("date_2", "date_3")], as.Date)

# Fuzzy-join on `id` (string distance of at most 2), then keep only the pairs
# whose date_1 lies within the closed interval [date_2, date_3].
table_1 %>%
  stringdist_inner_join(table_2, by = "id", max_dist = 2) %>%
  filter(date_1 >= date_2 & date_1 <= date_3)
id.x date_1 id.y date_2 date_3
1 123 2010-01-31 5123 2009-01-31 2011-01-31
2 123 2010-01-31 123 A 2010-01-31 2010-01-31
3 123 2010-01-31 125 2010-01-31 2020-01-31
4 123 2010-01-31 125 2010-01-31 2020-01-31
5 123 2010-01-31 5123 2009-01-31 2011-01-31
6 123 2010-01-31 123 A 2010-01-31 2010-01-31
7 123 2010-01-31 125 2010-01-31 2020-01-31
8 123 2010-01-31 125 2010-01-31 2020-01-31
9 125 2016-01-31 125 2010-01-31 2020-01-31
10 125 2016-01-31 125 2010-01-31 2020-01-31
我正在使用 R 编程语言。我有以下两个表(注意:所有变量都是“因子”(factor)类型):
# Example data: two small tables whose columns are all stored as factors.
# stringsAsFactors = TRUE converts every character column to a factor
# at construction time.
table_1 <- data.frame(
  id     = c("123", "123", "125", "C125-B"),
  date_1 = c("2010-01-31", "2010-01-31", "2016-01-31", "2018-01-31"),
  stringsAsFactors = TRUE
)

table_2 <- data.frame(
  id     = c("5123", "123 A", "125", "125"),
  date_2 = c("2009-01-31", "2010-01-31", "2010-01-31", "2010-01-31"),
  date_3 = c("2011-01-31", "2010-01-31", "2020-01-31", "2020-01-31"),
  stringsAsFactors = TRUE
)
> table_1
id date_1
1 123 2010-01-31
2 123 2010-01-31
3 125 2016-01-31
4 C125-B 2018-01-31
> table_2
id date_2 date_3
1 5123 2009-01-31 2011-01-31
2 123 A 2010-01-31 2010-01-31
3 125 2010-01-31 2020-01-31
4 125 2010-01-31 2020-01-31
我想在同时满足以下两个条件时,对这两个表进行“连接”(join,例如内连接 inner join):
1) if table_1$id "fuzzy equal" table_2$id
和
2) if table_1$date_1 BETWEEN(table_2$date_2,table_2$date_3)
我尝试在 R 中编写以下代码来执行此操作:
library(fuzzyjoin)
# Inner-join the two tables on approximately equal `id` strings.
# distance_col = NULL: no string-distance column is added to the result.
stringdist_inner_join(
  x = table_1,
  y = table_2,
  by = "id",
  distance_col = NULL
)
问题:但我不确定 stringdist_inner_join 函数能否支持这种“介于……之间”(between)的逻辑。
谁能告诉我怎么做?在 R 中是否有任何其他方法可以实现此目的?
谢谢!
这个怎么样?如果把日期列存储为 Date 类型,我们可以先执行 stringdist_inner_join,然后再按日期范围进行过滤。对大多数数据而言,这样的性能应该足够;如果不够,可以考虑改用 data.table 而不是 fuzzyjoin。
library(fuzzyjoin)
library(dplyr)
# Convert the date columns to Date class so that >= / <= compare them as dates.
table_1[["date_1"]] <- as.Date(table_1[["date_1"]])
table_2[c("date_2", "date_3")] <- lapply(table_2[c("date_2", "date_3")], as.Date)

# Fuzzy-join on `id` (string distance of at most 2), then keep only the pairs
# whose date_1 lies within the closed interval [date_2, date_3].
table_1 %>%
  stringdist_inner_join(table_2, by = "id", max_dist = 2) %>%
  filter(date_1 >= date_2 & date_1 <= date_3)
id.x date_1 id.y date_2 date_3
1 123 2010-01-31 5123 2009-01-31 2011-01-31
2 123 2010-01-31 123 A 2010-01-31 2010-01-31
3 123 2010-01-31 125 2010-01-31 2020-01-31
4 123 2010-01-31 125 2010-01-31 2020-01-31
5 123 2010-01-31 5123 2009-01-31 2011-01-31
6 123 2010-01-31 123 A 2010-01-31 2010-01-31
7 123 2010-01-31 125 2010-01-31 2020-01-31
8 123 2010-01-31 125 2010-01-31 2020-01-31
9 125 2016-01-31 125 2010-01-31 2020-01-31
10 125 2016-01-31 125 2010-01-31 2020-01-31