使用 grep 按多列连接数据框
Join dataframes by multiple columns with grep
我想对基于 2 列的两个数据框进行完全连接,其中 1 列包含在另一列中找到的字符串。以下是我的两个数据框:
# First input: one row per (date, site) pair with a text description.
date <- as.Date(c("2010-11-1", "2008-3-25", "2007-3-14"))
site <- c("abcejams.com", "reitimes.com", "posehbc")
desc1 <- c("alpha", "beta", "gamma")
df1 <- data.frame(date, site, desc1)
df1
date site desc1
1 2010-11-01 abcejams.com alpha
2 2008-03-25 reitimes.com beta
3 2007-03-14 posehbc gamma
# Second input: site fragments (site2) with two numeric metrics per date2.
date2 <- as.Date(c("2010-11-1", "2008-3-25", "2007-3-14", "2018-2-9"))
site2 <- c("jams", "time", "pose", "abce")
metric2 <- c(1, 2, 3, 4)
metric3 <- c(10, 20, 30, 40)
df2 <- data.frame(
  date2 = date2,
  site2 = site2,
  metric2 = metric2,
  metric3 = metric3
)
df2
date2 site2 metric2 metric3
1 2010-11-01 jams 1 10
2 2008-03-25 time 2 20
3 2007-03-14 pose 3 30
4 2018-02-09 abce 4 40
我想按日期和站点连接这两个数据框,条件是在日期相同的前提下,site2 是 site 的子串。如果不需要 grep(子串匹配)这部分,通常可以这样写:
# Exact-match full join on both key pairs (no substring matching on site).
finaldf <- full_join(
  df1,
  df2,
  by = c(date = "date2", site = "site2")
)
有一种方法可以使用 sqldf 执行此操作,但唯一的选择是左连接而不是全连接:
# Left join in SQL: keep every df1 row and attach the df2 metrics where the
# dates are equal and instr() finds df2.site2 inside df1.site.
# The statement must start with `select`; without it the query is not valid SQL.
test <- sqldf("select df1.*, df2.metric2,
df2.metric3
from df1
left join df2
on
instr(df1.site, df2.site2)
and
df1.date=df2.date2")
目标是让最终输出看起来像这样:
date site desc1 metric2 metric3
1 2010-11-01 abcejams.com alpha 1 10
2 2008-03-25 reitimes.com beta 2 20
3 2007-03-14 posehbc gamma 3 30
4 2018-02-09 abce NA 4 40
有人对此有经验吗?
您可以使用 fuzzyjoin 包中的 regex_full_join。写这篇回答时,我认为它还没有发布到 CRAN 上,所以请到它的 GitHub 页面查看安装方法。
library(fuzzyjoin)
# Sample inputs; strings are kept as character (not factor) columns.
date <- as.Date(c("2010-11-1", "2008-3-25", "2007-3-14"))
site <- c("abcejams.com", "reitimes.com", "posehbc")
df1 <- data.frame(date = date, site = site, stringsAsFactors = FALSE)

date2 <- as.Date(c("2010-11-1", "2008-3-25", "2007-3-14", "2018-2-9"))
site2 <- c("jams", "time", "pose", "abce")
metric2 <- c(1, 2, 3, 4)
metric3 <- c(10, 20, 30, 40)
df2 <- data.frame(
  date2 = date2,
  site2 = site2,
  metric2 = metric2,
  metric3 = metric3,
  stringsAsFactors = FALSE
)

# Full join pairing site with site2 and date with date2 via fuzzyjoin's
# regex-based matching.
regex_full_join(df1, df2, by = c(site = "site2", date = "date2"))
date site date2 site2 metric2 metric3
1 2010-11-01 abcejams.com 2010-11-01 jams 1 10
2 2008-03-25 reitimes.com 2008-03-25 time 2 20
3 2007-03-14 posehbc 2007-03-14 pose 3 30
4 <NA> <NA> 2018-02-09 abce 4 40
# Original example data from the question.
date <- as.Date(c("2010-11-1", "2008-3-25", "2007-3-14"))
site <- c("abcejams.com", "reitimes.com", "posehbc")
desc1 <- c("alpha", "beta", "gamma")
df1 <- data.frame(date = date, site = site, desc1 = desc1)

date2 <- as.Date(c("2010-11-1", "2008-3-25", "2007-3-14", "2018-2-9"))
site2 <- c("jams", "time", "pose", "abce")
metric2 <- c(1, 2, 3, 4)
metric3 <- c(10, 20, 30, 40)
df2 <- data.frame(
  date2 = date2,
  site2 = site2,
  metric2 = metric2,
  metric3 = metric3
)
我写了一个函数,根据两个条件是否同时为 TRUE 来定义分组。
library(tidyverse)
library(stringr)
# For each row of a 0/1 match matrix, list the column indices that matched.
#
# bicond: numeric or logical matrix; an entry [r, c] equal to 1 means row r is
#   matched with column c.
# Returns a list with exactly one element per row of `bicond`: the integer
#   column indices whose entry equals 1, or a single NA_integer_ when the row
#   has no match.
make_groups <- function(bicond) {
  # Iterate over row indices instead of using apply(): apply() simplifies its
  # result to a vector or matrix when every row yields the same number of hits
  # (including zero), which breaks the one-element-per-row contract.
  lapply(seq_len(nrow(bicond)), function(r) {
    hits <- unname(which(bicond[r, ] == 1))
    if (length(hits) == 0L) NA_integer_ else hits
  })
}
# Full join of df2 with df1 where rows pair up when the dates are equal and
# df2$site2, used as a stringr pattern, is detected in df1$site.
#
# df1: data frame with columns `date` and `site`.
# df2: data frame with columns `date2` and `site2`.
# Returns the full join of both inputs (df2 columns first), with the helper
# column `G` removed. Relies on make_groups() being defined.
custom_join <- function(df1, df2) {
  # Rows index df2, columns index df1.
  same_date <- outer(df2$date2, df1$date, "==")
  site_hit <- outer(
    as.character(df2$site2),
    as.character(df1$site),
    function(pattern, string) str_detect(string, pattern)
  )
  # 1 only where both conditions hold.
  match_mat <- same_date * site_hit

  # G is the df1 row number; df2 gets one row per matched df1 row (or NA).
  left <- mutate(df1, G = row_number())
  right <- unnest(mutate(df2, G = make_groups(match_mat)), G)

  joined <- full_join(right, left, by = c("G" = "G", "date2" = "date"))
  select(joined, -G)
}
# Run the join on the original data; the printed result is shown below.
custom_join(df1 = df1, df2 = df2)
# date2 site2 metric2 metric3 site desc1
# 1 2010-11-01 jams 1 10 abcejams.com alpha
# 2 2008-03-25 time 2 20 reitimes.com beta
# 3 2007-03-14 pose 3 30 posehbc gamma
# 4 2018-02-09 abce 4 40 <NA> <NA>
另一个例子
# New data: two df1 sites ("posehbc", "poseur") share a date and both contain
# the fragment "pose", so one df2 row matches two df1 rows.
date <- as.Date(c("2010-11-1", "2008-3-25", "2007-3-14", "2007-3-14"))
site <- c("abcejams.com", "reitimes.com", "posehbc", "poseur")
desc1 <- c("alpha", "beta", "gamma", "epsilon")
df1 <- data.frame(date = date, site = site, desc1 = desc1)

date2 <- as.Date(c("2010-11-1", "2008-3-25", "2007-3-14", "2007-2-9"))
site2 <- c("jams", "time", "pose", "abce")
metric2 <- c(1, 2, 3, 4)
metric3 <- c(10, 20, 30, 40)
df2 <- data.frame(
  date2 = date2,
  site2 = site2,
  metric2 = metric2,
  metric3 = metric3
)

custom_join(df1 = df1, df2 = df2)
# date2 site2 metric2 metric3 site desc1
# 1 2010-11-01 jams 1 10 abcejams.com alpha
# 2 2008-03-25 time 2 20 reitimes.com beta
# 3 2007-03-14 pose 3 30 posehbc gamma
# 4 2007-03-14 pose 3 30 poseur epsilon
# 5 2007-02-09 abce 4 40 <NA> <NA>
我想对基于 2 列的两个数据框进行完全连接,其中 1 列包含在另一列中找到的字符串。以下是我的两个数据框:
# First input: one row per (date, site) pair with a text description.
date <- as.Date(c("2010-11-1", "2008-3-25", "2007-3-14"))
site <- c("abcejams.com", "reitimes.com", "posehbc")
desc1 <- c("alpha", "beta", "gamma")
df1 <- data.frame(date, site, desc1)
df1
date site desc1
1 2010-11-01 abcejams.com alpha
2 2008-03-25 reitimes.com beta
3 2007-03-14 posehbc gamma
# Second input: site fragments (site2) with two numeric metrics per date2.
date2 <- as.Date(c("2010-11-1", "2008-3-25", "2007-3-14", "2018-2-9"))
site2 <- c("jams", "time", "pose", "abce")
metric2 <- c(1, 2, 3, 4)
metric3 <- c(10, 20, 30, 40)
df2 <- data.frame(
  date2 = date2,
  site2 = site2,
  metric2 = metric2,
  metric3 = metric3
)
df2
date2 site2 metric2 metric3
1 2010-11-01 jams 1 10
2 2008-03-25 time 2 20
3 2007-03-14 pose 3 30
4 2018-02-09 abce 4 40
我想按日期和站点连接这两个数据框,条件是在日期相同的前提下,site2 是 site 的子串。如果不需要 grep(子串匹配)这部分,通常可以这样写:
# Exact-match full join on both key pairs (no substring matching on site).
finaldf <- full_join(
  df1,
  df2,
  by = c(date = "date2", site = "site2")
)
有一种方法可以使用 sqldf 执行此操作,但唯一的选择是左连接而不是全连接:
# Left join in SQL: keep every df1 row and attach the df2 metrics where the
# dates are equal and instr() finds df2.site2 inside df1.site.
# The statement must start with `select`; without it the query is not valid SQL.
test <- sqldf("select df1.*, df2.metric2,
df2.metric3
from df1
left join df2
on
instr(df1.site, df2.site2)
and
df1.date=df2.date2")
目标是让最终输出看起来像这样:
date site desc1 metric2 metric3
1 2010-11-01 abcejams.com alpha 1 10
2 2008-03-25 reitimes.com beta 2 20
3 2007-03-14 posehbc gamma 3 30
4 2018-02-09 abce NA 4 40
有人对此有经验吗?
您可以使用 fuzzyjoin 包中的 regex_full_join。写这篇回答时,我认为它还没有发布到 CRAN 上,所以请到它的 GitHub 页面查看安装方法。
library(fuzzyjoin)
# Sample inputs; strings are kept as character (not factor) columns.
date <- as.Date(c("2010-11-1", "2008-3-25", "2007-3-14"))
site <- c("abcejams.com", "reitimes.com", "posehbc")
df1 <- data.frame(date = date, site = site, stringsAsFactors = FALSE)

date2 <- as.Date(c("2010-11-1", "2008-3-25", "2007-3-14", "2018-2-9"))
site2 <- c("jams", "time", "pose", "abce")
metric2 <- c(1, 2, 3, 4)
metric3 <- c(10, 20, 30, 40)
df2 <- data.frame(
  date2 = date2,
  site2 = site2,
  metric2 = metric2,
  metric3 = metric3,
  stringsAsFactors = FALSE
)

# Full join pairing site with site2 and date with date2 via fuzzyjoin's
# regex-based matching.
regex_full_join(df1, df2, by = c(site = "site2", date = "date2"))
date site date2 site2 metric2 metric3
1 2010-11-01 abcejams.com 2010-11-01 jams 1 10
2 2008-03-25 reitimes.com 2008-03-25 time 2 20
3 2007-03-14 posehbc 2007-03-14 pose 3 30
4 <NA> <NA> 2018-02-09 abce 4 40
# Original example data from the question.
date <- as.Date(c("2010-11-1", "2008-3-25", "2007-3-14"))
site <- c("abcejams.com", "reitimes.com", "posehbc")
desc1 <- c("alpha", "beta", "gamma")
df1 <- data.frame(date = date, site = site, desc1 = desc1)

date2 <- as.Date(c("2010-11-1", "2008-3-25", "2007-3-14", "2018-2-9"))
site2 <- c("jams", "time", "pose", "abce")
metric2 <- c(1, 2, 3, 4)
metric3 <- c(10, 20, 30, 40)
df2 <- data.frame(
  date2 = date2,
  site2 = site2,
  metric2 = metric2,
  metric3 = metric3
)
我写了一个函数,根据两个条件是否同时为 TRUE 来定义分组。
library(tidyverse)
library(stringr)
# For each row of a 0/1 match matrix, list the column indices that matched.
#
# bicond: numeric or logical matrix; an entry [r, c] equal to 1 means row r is
#   matched with column c.
# Returns a list with exactly one element per row of `bicond`: the integer
#   column indices whose entry equals 1, or a single NA_integer_ when the row
#   has no match.
make_groups <- function(bicond) {
  # Iterate over row indices instead of using apply(): apply() simplifies its
  # result to a vector or matrix when every row yields the same number of hits
  # (including zero), which breaks the one-element-per-row contract.
  lapply(seq_len(nrow(bicond)), function(r) {
    hits <- unname(which(bicond[r, ] == 1))
    if (length(hits) == 0L) NA_integer_ else hits
  })
}
# Full join of df2 with df1 where rows pair up when the dates are equal and
# df2$site2, used as a stringr pattern, is detected in df1$site.
#
# df1: data frame with columns `date` and `site`.
# df2: data frame with columns `date2` and `site2`.
# Returns the full join of both inputs (df2 columns first), with the helper
# column `G` removed. Relies on make_groups() being defined.
custom_join <- function(df1, df2) {
  # Rows index df2, columns index df1.
  same_date <- outer(df2$date2, df1$date, "==")
  site_hit <- outer(
    as.character(df2$site2),
    as.character(df1$site),
    function(pattern, string) str_detect(string, pattern)
  )
  # 1 only where both conditions hold.
  match_mat <- same_date * site_hit

  # G is the df1 row number; df2 gets one row per matched df1 row (or NA).
  left <- mutate(df1, G = row_number())
  right <- unnest(mutate(df2, G = make_groups(match_mat)), G)

  joined <- full_join(right, left, by = c("G" = "G", "date2" = "date"))
  select(joined, -G)
}
# Run the join on the original data; the printed result is shown below.
custom_join(df1 = df1, df2 = df2)
# date2 site2 metric2 metric3 site desc1
# 1 2010-11-01 jams 1 10 abcejams.com alpha
# 2 2008-03-25 time 2 20 reitimes.com beta
# 3 2007-03-14 pose 3 30 posehbc gamma
# 4 2018-02-09 abce 4 40 <NA> <NA>
另一个例子
# New data: two df1 sites ("posehbc", "poseur") share a date and both contain
# the fragment "pose", so one df2 row matches two df1 rows.
date <- as.Date(c("2010-11-1", "2008-3-25", "2007-3-14", "2007-3-14"))
site <- c("abcejams.com", "reitimes.com", "posehbc", "poseur")
desc1 <- c("alpha", "beta", "gamma", "epsilon")
df1 <- data.frame(date = date, site = site, desc1 = desc1)

date2 <- as.Date(c("2010-11-1", "2008-3-25", "2007-3-14", "2007-2-9"))
site2 <- c("jams", "time", "pose", "abce")
metric2 <- c(1, 2, 3, 4)
metric3 <- c(10, 20, 30, 40)
df2 <- data.frame(
  date2 = date2,
  site2 = site2,
  metric2 = metric2,
  metric3 = metric3
)

custom_join(df1 = df1, df2 = df2)
# date2 site2 metric2 metric3 site desc1
# 1 2010-11-01 jams 1 10 abcejams.com alpha
# 2 2008-03-25 time 2 20 reitimes.com beta
# 3 2007-03-14 pose 3 30 posehbc gamma
# 4 2007-03-14 pose 3 30 poseur epsilon
# 5 2007-02-09 abce 4 40 <NA> <NA>