对多个数据框执行 `interval_left_join` 的函数
Function to `interval_left_join` multiple dataframes
我有几个数据框,想用 `interval_left_join` 把它们连接起来。理论上我可以一步一步地连接这些数据框,但更希望用一个函数一次性完成所有连接:
数据:
# Sample data. `df1` is the base table; `df2`..`df4` each contribute one
# value column (`v2`, `v3`, `v4`) together with a `start`/`end` interval.
df1 <- data.frame(
  line  = 1:4,
  key   = c("a", "b", NA, "a"),
  start = c(75, 100, 170, 240),
  end   = c(100, 150, 190, 300)
)

df2 <- data.frame(
  v2    = c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "F"),
  start = c(0, 10, 30, 90, 120, 130, 154, 161, 175, 199, 205, 300),
  end   = c(10, 20, 50, 110, 130, 140, 160, 165, 180, 250, 300, 305)
)

df3 <- data.frame(
  v3    = c("a", "b", "c", "d", "e", "f"),
  start = c(5, 90, 200, 333, 1000, 1500),
  end   = c(75, 171, 210, 400, 1001, 1600)
)

df4 <- data.frame(
  v4    = c("x", "y", "z", "xx", "yy", "zz"),
  start = c(55, 90, 200, 333, 1000, 1500),
  end   = c(1005, 171, 210, 400, 1001, 1600)
)
我想根据 `start` 到 `end` 的区间,把 `v2`、`v3`、`v4` 这几个变量并入 `df1`。到目前为止我尝试过下面的代码:它输出的 `v2` 数据部分不正确,而 `v3` 和 `v4` 则完全失败——这里缺少了什么,或者哪里错了?
# install package "IRanges":
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("IRanges")
library(BiocManager)
library(fuzzyjoin)
library(data.table)
library(dplyr)
# Asker's attempt: interval-join two data frames on their start/end columns and
# immediately collapse the matches into one row per run of `key`.
#
# df1: left data frame; the pipeline below reads its `line`, `key`, `start`
#      and `end` columns.
# df2: right data frame, joined on `start`/`end`.
# Returns the summarised result with columns line, start, end, key and v2.
#
# NOTE(review): `v2` is hard-coded in summarise(), and summarise() keeps only
# the grouping column plus the columns it computes, so when Reduce() later
# passes df3/df4 as `df2`, their `v3`/`v4` columns are not carried through.
# NOTE(review): str_c() comes from stringr, which is not among the library()
# calls visible here — presumably attached elsewhere; confirm.
join_dataframes <- function(df1, df2) {
interval_left_join(x = df1,
y = df2,
by = c("start", "end")) %>%
# One group per consecutive run of identical `key` values.
group_by(grp = rleid(key)) %>%
# Keep the first line/start/end of each group; NA v2 values become "*" before
# the values are pasted together with ",".
summarise(across(c(line, start.x, end.x), first),
key = unique(key),
v2 = str_c(if_else(!is.na(v2), v2, "*" ), collapse = ",")) %>%
# Restore the interval names from the `.x`-suffixed columns and drop the
# helper grouping column.
rename(start = start.x, end = end.x) %>%
select(-grp)
}
# Fold the join over all four data frames, left to right, starting from df1.
list_df <- list(df1, df2, df3, df4)
Reduce(join_dataframes, list_df)
想要的结果是这样的:
# A tibble: 4 x 7
line key v2 v3 start end v4
<int> <chr> <chr> <chr> <dbl> <dbl> <chr>
1 1 a D a,b 75 100 x,y
2 2 b D,E,F b 100 150 x,y
3 3 NA I b 170 190 x,y
4 4 a J,K,F * 240 300 x
在 `Reduce` 中只执行联接;`v2`、`v3`、`v4` 这几列可以在联接完成之后再汇总。
library(dplyr)
library(fuzzyjoin)
library(data.table)
# Interval-left-join `df2` onto `df1` using their `start`/`end` columns.
#
# df1: left data frame; its interval is kept in the result.
# df2: right data frame; its interval columns are discarded after the join.
# Returns the joined rows with the left interval named `start`/`end` again,
# so the result can be fed into the next join unchanged.
join_dataframes <- function(df1, df2) {
  joined <- interval_left_join(x = df1, y = df2, by = c("start", "end"))
  # Drop the right-hand interval; only its value column is of interest.
  left_only <- select(joined, -c(start.y, end.y))
  rename(left_only, start = start.x, end = end.x)
}
# Chain the interval join across every data frame, then collapse the matches
# into one row per consecutive run of `key`.
list_df <- list(df1, df2, df3, df4)
Reduce(join_dataframes, list_df) %>%
  group_by(grp = rleid(key)) %>%
  summarise(
    across(c(line, start, end), first),
    # Unmatched (NA) values are shown as "*"; repeated values are listed once.
    across(
      v2:v4,
      function(col) toString(unique(if_else(is.na(col), "*", col)))
    ),
    key = unique(key)
  )
# grp line start end v2 v3 v4 key
#* <int> <int> <dbl> <dbl> <chr> <chr> <chr> <chr>
#1 1 1 75 100 D a, b x, y a
#2 2 2 100 150 D, E, F b x, y b
#3 3 3 170 190 I b x, y NA
#4 4 4 240 300 J, K, F * x a
我有几个数据框,想用 `interval_left_join` 把它们连接起来。理论上我可以一步一步地连接这些数据框,但更希望用一个函数一次性完成所有连接:
数据:
# Sample data. `df1` is the base table; `df2`..`df4` each contribute one
# value column (`v2`, `v3`, `v4`) together with a `start`/`end` interval.
df1 <- data.frame(
  line  = 1:4,
  key   = c("a", "b", NA, "a"),
  start = c(75, 100, 170, 240),
  end   = c(100, 150, 190, 300)
)

df2 <- data.frame(
  v2    = c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "F"),
  start = c(0, 10, 30, 90, 120, 130, 154, 161, 175, 199, 205, 300),
  end   = c(10, 20, 50, 110, 130, 140, 160, 165, 180, 250, 300, 305)
)

df3 <- data.frame(
  v3    = c("a", "b", "c", "d", "e", "f"),
  start = c(5, 90, 200, 333, 1000, 1500),
  end   = c(75, 171, 210, 400, 1001, 1600)
)

df4 <- data.frame(
  v4    = c("x", "y", "z", "xx", "yy", "zz"),
  start = c(55, 90, 200, 333, 1000, 1500),
  end   = c(1005, 171, 210, 400, 1001, 1600)
)
我想根据 `start` 到 `end` 的区间,把 `v2`、`v3`、`v4` 这几个变量并入 `df1`。到目前为止我尝试过下面的代码:它输出的 `v2` 数据部分不正确,而 `v3` 和 `v4` 则完全失败——这里缺少了什么,或者哪里错了?
# install package "IRanges":
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("IRanges")
library(BiocManager)
library(fuzzyjoin)
library(data.table)
library(dplyr)
# Asker's attempt: interval-join two data frames on their start/end columns and
# immediately collapse the matches into one row per run of `key`.
#
# df1: left data frame; the pipeline below reads its `line`, `key`, `start`
#      and `end` columns.
# df2: right data frame, joined on `start`/`end`.
# Returns the summarised result with columns line, start, end, key and v2.
#
# NOTE(review): `v2` is hard-coded in summarise(), and summarise() keeps only
# the grouping column plus the columns it computes, so when Reduce() later
# passes df3/df4 as `df2`, their `v3`/`v4` columns are not carried through.
# NOTE(review): str_c() comes from stringr, which is not among the library()
# calls visible here — presumably attached elsewhere; confirm.
join_dataframes <- function(df1, df2) {
interval_left_join(x = df1,
y = df2,
by = c("start", "end")) %>%
# One group per consecutive run of identical `key` values.
group_by(grp = rleid(key)) %>%
# Keep the first line/start/end of each group; NA v2 values become "*" before
# the values are pasted together with ",".
summarise(across(c(line, start.x, end.x), first),
key = unique(key),
v2 = str_c(if_else(!is.na(v2), v2, "*" ), collapse = ",")) %>%
# Restore the interval names from the `.x`-suffixed columns and drop the
# helper grouping column.
rename(start = start.x, end = end.x) %>%
select(-grp)
}
# Fold the join over all four data frames, left to right, starting from df1.
list_df <- list(df1, df2, df3, df4)
Reduce(join_dataframes, list_df)
想要的结果是这样的:
# A tibble: 4 x 7
line key v2 v3 start end v4
<int> <chr> <chr> <chr> <dbl> <dbl> <chr>
1 1 a D a,b 75 100 x,y
2 2 b D,E,F b 100 150 x,y
3 3 NA I b 170 190 x,y
4 4 a J,K,F * 240 300 x
在 `Reduce` 中只执行联接;`v2`、`v3`、`v4` 这几列可以在联接完成之后再汇总。
library(dplyr)
library(fuzzyjoin)
library(data.table)
# Interval-left-join `df2` onto `df1` using their `start`/`end` columns.
#
# df1: left data frame; its interval is kept in the result.
# df2: right data frame; its interval columns are discarded after the join.
# Returns the joined rows with the left interval named `start`/`end` again,
# so the result can be fed into the next join unchanged.
join_dataframes <- function(df1, df2) {
  joined <- interval_left_join(x = df1, y = df2, by = c("start", "end"))
  # Drop the right-hand interval; only its value column is of interest.
  left_only <- select(joined, -c(start.y, end.y))
  rename(left_only, start = start.x, end = end.x)
}
# Chain the interval join across every data frame, then collapse the matches
# into one row per consecutive run of `key`.
list_df <- list(df1, df2, df3, df4)
Reduce(join_dataframes, list_df) %>%
  group_by(grp = rleid(key)) %>%
  summarise(
    across(c(line, start, end), first),
    # Unmatched (NA) values are shown as "*"; repeated values are listed once.
    across(
      v2:v4,
      function(col) toString(unique(if_else(is.na(col), "*", col)))
    ),
    key = unique(key)
  )
# grp line start end v2 v3 v4 key
#* <int> <int> <dbl> <dbl> <chr> <chr> <chr> <chr>
#1 1 1 75 100 D a, b x, y a
#2 2 2 100 150 D, E, F b x, y b
#3 3 3 170 190 I b x, y NA
#4 4 4 240 300 J, K, F * x a