如何使用 IRanges 一次模糊连接多个数据帧
How to fuzzyjoin several dataframes in one go using IRanges
我需要基于不精确匹配来连接(join)几个数据帧,这可以使用 fuzzyjoin
和 IRanges
包来实现:
数据:
# Left-hand table: four numbered lines, each covering a [start, end] interval.
df1 <- data.frame(
  line  = seq_len(4L),
  start = c(75, 100, 170, 240),
  end   = c(100, 150, 190, 300)
)

# First lookup table: labelled intervals (the label "F" occurs twice).
df2 <- data.frame(
  v2    = c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "F"),
  start = c(0, 10, 30, 90, 120, 130, 154, 161, 175, 199, 205, 300),
  end   = c(10, 20, 50, 110, 130, 140, 160, 165, 180, 250, 300, 305)
)

# Second lookup table: labelled intervals.
df3 <- data.frame(
  v3    = c("a", "b", "c", "d", "e", "f"),
  start = c(5, 90, 200, 333, 1000, 1500),
  end   = c(75, 171, 210, 400, 1001, 1600)
)
这里我想根据 start 和 end 之间的区间,将 df2 和 df3 连接到 df1。
我可以做的是分步进行,即一次连接一个数据帧:
library(fuzzyjoin)
library(dplyr) # provides %>%, select() and rename() used below

# The interval joins need the Bioconductor package "IRanges" to be installed.
# Install it (and BiocManager, the installer) only when it is missing, so the
# script does not reinstall on every run. IRanges is deliberately not attached:
# attaching it also attaches S4Vectors, whose rename() would mask
# dplyr::rename().
if (!requireNamespace("IRanges", quietly = TRUE)) {
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("IRanges")
}

# First join: attach the df2 rows matched on the start/end interval to df1,
# then drop df2's own interval columns and restore df1's column names.
df12 <- interval_left_join(
  x = df1,
  y = df2,
  by = c("start", "end")
) %>%
  select(-c(start.y, end.y)) %>%
  rename(start = start.x, end = end.x)

# Second join: same procedure, joining df3 onto the result of the first join.
df123 <- interval_left_join(
  x = df12,
  y = df3,
  by = c("start", "end")
) %>%
  select(-c(start.y, end.y)) %>%
  rename(start = start.x, end = end.x)
结果:
df123
line start end v2 v3
1 1 75 100 D a
2 1 75 100 D b
3 2 100 150 D b
4 2 100 150 E b
5 2 100 150 F b
6 3 170 190 I b
7 4 240 300 J <NA>
8 4 240 300 K <NA>
9 4 240 300 F <NA>
这一切都很好,但在我的实际数据中,我有很多个数据帧需要连接,逐个连接既不切实际又容易出错。如何一次性对所有数据帧执行连接?
将数据帧放入一个列表中,然后使用 Reduce
将它们依次连接起来。
library(fuzzyjoin)
library(dplyr)
#' Interval-left-join two data frames on their `start`/`end` columns.
#'
#' Calls are namespace-qualified and no pipe is used, so the function works
#' without fuzzyjoin or dplyr being attached and is not affected when another
#' attached package masks `select()` or `rename()` (e.g. S4Vectors, which is
#' attached together with IRanges, masks `rename()`).
#'
#' @param df1 Left-hand data frame; must contain `start` and `end` columns.
#' @param df2 Right-hand data frame; must contain `start` and `end` columns.
#' @return The joined data frame with `df1`'s `start`/`end` columns kept under
#'   their original names and `df2`'s `start`/`end` columns removed.
join_two_dataframes <- function(df1, df2) {
  joined <- fuzzyjoin::interval_left_join(
    x = df1,
    y = df2,
    by = c("start", "end")
  )
  # The join suffixes the shared key columns with .x/.y; drop the right-hand
  # copies and give the left-hand ones their plain names back so the result
  # can be fed into the next join unchanged.
  joined <- dplyr::select(joined, -c(start.y, end.y))
  dplyr::rename(joined, start = start.x, end = end.x)
}
# Frames in join order: the first element is the left-hand table, each later
# element is joined onto the running result.
list_df <- list(df1, df2, df3)
# Fold the list pairwise from left to right; the final data frame is printed.
Reduce(f = join_two_dataframes, x = list_df)
# line start end v2 v3
#1 1 75 100 D a
#2 1 75 100 D b
#3 2 100 150 D b
#4 2 100 150 E b
#5 2 100 150 F b
#6 3 170 190 I b
#7 4 240 300 J <NA>
#8 4 240 300 K <NA>
#9 4 240 300 F <NA>
我需要基于不精确匹配来连接(join)几个数据帧,这可以使用 fuzzyjoin
和 IRanges
包来实现:
数据:
# Left-hand table: four numbered lines, each covering a [start, end] interval.
df1 <- data.frame(
  line  = seq_len(4L),
  start = c(75, 100, 170, 240),
  end   = c(100, 150, 190, 300)
)

# First lookup table: labelled intervals (the label "F" occurs twice).
df2 <- data.frame(
  v2    = c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "F"),
  start = c(0, 10, 30, 90, 120, 130, 154, 161, 175, 199, 205, 300),
  end   = c(10, 20, 50, 110, 130, 140, 160, 165, 180, 250, 300, 305)
)

# Second lookup table: labelled intervals.
df3 <- data.frame(
  v3    = c("a", "b", "c", "d", "e", "f"),
  start = c(5, 90, 200, 333, 1000, 1500),
  end   = c(75, 171, 210, 400, 1001, 1600)
)
这里我想根据 start 和 end 之间的区间,将 df2 和 df3 连接到 df1。
我可以做的是分步进行,即一次连接一个数据帧:
library(fuzzyjoin)
library(dplyr) # provides %>%, select() and rename() used below

# The interval joins need the Bioconductor package "IRanges" to be installed.
# Install it (and BiocManager, the installer) only when it is missing, so the
# script does not reinstall on every run. IRanges is deliberately not attached:
# attaching it also attaches S4Vectors, whose rename() would mask
# dplyr::rename().
if (!requireNamespace("IRanges", quietly = TRUE)) {
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("IRanges")
}

# First join: attach the df2 rows matched on the start/end interval to df1,
# then drop df2's own interval columns and restore df1's column names.
df12 <- interval_left_join(
  x = df1,
  y = df2,
  by = c("start", "end")
) %>%
  select(-c(start.y, end.y)) %>%
  rename(start = start.x, end = end.x)

# Second join: same procedure, joining df3 onto the result of the first join.
df123 <- interval_left_join(
  x = df12,
  y = df3,
  by = c("start", "end")
) %>%
  select(-c(start.y, end.y)) %>%
  rename(start = start.x, end = end.x)
结果:
df123
line start end v2 v3
1 1 75 100 D a
2 1 75 100 D b
3 2 100 150 D b
4 2 100 150 E b
5 2 100 150 F b
6 3 170 190 I b
7 4 240 300 J <NA>
8 4 240 300 K <NA>
9 4 240 300 F <NA>
这一切都很好,但在我的实际数据中,我有很多个数据帧需要连接,逐个连接既不切实际又容易出错。如何一次性对所有数据帧执行连接?
将数据帧放入一个列表中,然后使用 Reduce
将它们依次连接起来。
library(fuzzyjoin)
library(dplyr)
#' Interval-left-join two data frames on their `start`/`end` columns.
#'
#' Calls are namespace-qualified and no pipe is used, so the function works
#' without fuzzyjoin or dplyr being attached and is not affected when another
#' attached package masks `select()` or `rename()` (e.g. S4Vectors, which is
#' attached together with IRanges, masks `rename()`).
#'
#' @param df1 Left-hand data frame; must contain `start` and `end` columns.
#' @param df2 Right-hand data frame; must contain `start` and `end` columns.
#' @return The joined data frame with `df1`'s `start`/`end` columns kept under
#'   their original names and `df2`'s `start`/`end` columns removed.
join_two_dataframes <- function(df1, df2) {
  joined <- fuzzyjoin::interval_left_join(
    x = df1,
    y = df2,
    by = c("start", "end")
  )
  # The join suffixes the shared key columns with .x/.y; drop the right-hand
  # copies and give the left-hand ones their plain names back so the result
  # can be fed into the next join unchanged.
  joined <- dplyr::select(joined, -c(start.y, end.y))
  dplyr::rename(joined, start = start.x, end = end.x)
}
# Frames in join order: the first element is the left-hand table, each later
# element is joined onto the running result.
list_df <- list(df1, df2, df3)
# Fold the list pairwise from left to right; the final data frame is printed.
Reduce(f = join_two_dataframes, x = list_df)
# line start end v2 v3
#1 1 75 100 D a
#2 1 75 100 D b
#3 2 100 150 D b
#4 2 100 150 E b
#5 2 100 150 F b
#6 3 170 190 I b
#7 4 240 300 J <NA>
#8 4 240 300 K <NA>
#9 4 240 300 F <NA>