如何在 R 中根据重叠序列高效地合并两个数据表?
How to efficiently merge two data tables based on overlapping sequences in R?
我有两个数据表,例如:
# Sample intervals for the first table: one row per ID1 with its bounds.
dt1 <- data.table(
  ID1    = c("A", "B", "C", "D", "E"),
  start1 = c(100, 1, 210, 300, 400),
  end1   = c(200, 90, 240, 380, 500)
)
# Sample intervals for the second table. Note that rows a2, a3 and a4 have
# end2 < start2, i.e. their bounds are given in descending order.
dt2 <- data.table(
  ID2    = c("a1", "a2", "a3", "a4", "a5", "a6"),
  start2 = c(10, 150, 300, 310, 350, 400),
  end2   = c(50, 100, 250, 280, 390, 450)
)
我正在尝试根据它们是否具有重叠序列来合并它们。例如,期望的输出是:
# Expected result: one row per pair of overlapping intervals, carrying the
# ID and the start/end bounds from both tables.
output <- data.table(
  ID1    = c("A", "B", "D", "D", "D", "E"),
  start1 = c(100, 1, 300, 300, 300, 400),
  end1   = c(200, 90, 380, 380, 380, 500),
  ID2    = c("a2", "a1", "a3", "a4", "a5", "a6"),
  start2 = c(150, 10, 300, 310, 350, 400),
  end2   = c(100, 50, 250, 280, 390, 450)
)
我可以在 for 循环中执行此操作。例如:
# Brute-force overlap join: for every row of dt1, test every row of dt2 and
# record the ID2 values whose integer range shares at least one value with
# the dt1 range. Result: output2, one row per overlapping (ID1, ID2) pair.
n1 <- nrow(dt1)
n2 <- nrow(dt2)
ID1_list <- vector("list", n1) # preallocated output lists
ID2_list <- vector("list", n1)
for (i in seq_len(n1)) {
  vec1 <- seq(from = dt1$start1[i], to = dt1$end1[i])
  # One slot per dt2 row, all labelled with the *current* dt1 ID, so that
  # ID1_vec and ID2_vec always have the same length.
  ID1_vec <- rep(dt1$ID1[i], n2)
  ID2_vec <- rep(NA_character_, n2)
  for (j in seq_len(n2)) {
    # Use the full column names instead of relying on `$` partial matching
    # of `start` / `end`.
    vec2 <- seq(from = dt2$start2[j], to = dt2$end2[j])
    if (length(intersect(vec2, vec1)) > 0) {
      ID2_vec[j] <- dt2$ID2[j]
    }
  }
  ID1_list[[i]] <- ID1_vec
  ID2_list[[i]] <- ID2_vec
}
output2 <- data.table(ID1 = unlist(ID1_list),
                      ID2 = unlist(ID2_list))
# Rows whose ID2 is still NA are pairs that did not overlap.
output2 <- output2[complete.cases(output2), ]
# Join the interval bounds back on; `by` is explicit so the merges do not
# depend on whatever keys dt1 / dt2 happen to have.
output2 <- merge(dt1, unique(output2), by = "ID1")
output2 <- merge(output2, dt2, by = "ID2")
但是,我实际要处理的数据表非常大,这种方法太慢了。有没有人能就如何提高性能给出建议?
下面是使用 data.table::foverlaps() 的解决方案。
foverlaps() 函数在你的样本数据上会报错,因为 dt2 中的某些行 end < start。所以我对你的样本数据做了一些改动(在这些行中把 start 和 end 互换)。
library(data.table)
# foverlaps() requires start <= end in every interval, so swap the two
# bounds (by reference, i.e. dt2 itself is modified) in the rows where
# they are given in descending order.
dt2[start2 > end2, `:=`(start2 = end2, end2 = start2)]
# Key both tables on their interval columns; foverlaps() needs `y` (dt1)
# to be keyed with the start/end columns last.
setkey(dt1, start1, end1)
setkey(dt2, start2, end2)
# type = "any" keeps every pair of intervals that share at least one point.
# nomatch = NULL drops dt2 rows that overlap nothing in dt1, so only
# overlapping pairs are returned (the default would keep them as NA rows).
foverlaps(dt2, dt1, type = "any", nomatch = NULL)
#    ID1 start1 end1 ID2 start2 end2
# 1:   B      1   90  a1     10   50
# 2:   A    100  200  a2    100  150
# 3:   D    300  380  a3    250  300
# 4:   D    300  380  a4    280  310
# 5:   D    300  380  a5    350  390
# 6:   E    400  500  a6    400  450
更新
# Overlap join of dt2 against dt1. nomatch = NULL keeps only dt2 rows that
# actually overlap a dt1 interval, so every row below has a real overlap.
ans <- foverlaps(dt2, dt1, nomatch = NULL)
library(matrixStats) # NOTE(review): no longer used below; safe to drop if nothing else needs it
# The shared part of two overlapping intervals runs from the larger of the
# two starts to the smaller of the two ends. pmax()/pmin() are base R and
# work element-wise on the columns, so no matrix conversion is needed.
ans[, overlap_start := pmax(start1, start2, na.rm = TRUE)]
ans[, overlap_end := pmin(end1, end2, na.rm = TRUE)]
# + 1 because both end points of the overlap are counted.
ans[, overlap_size := overlap_end - overlap_start + 1]
#    ID1 start1 end1 ID2 start2 end2 overlap_start overlap_end overlap_size
# 1:   B      1   90  a1     10   50            10          50           41
# 2:   A    100  200  a2    100  150           100         150           51
# 3:   D    300  380  a3    250  300           300         300            1
# 4:   D    300  380  a4    280  310           300         310           11
# 5:   D    300  380  a5    350  390           350         380           31
# 6:   E    400  500  a6    400  450           400         450           51
我有两个数据表,例如:
# Sample intervals for the first table: one row per ID1 with its bounds.
dt1 <- data.table(
  ID1    = c("A", "B", "C", "D", "E"),
  start1 = c(100, 1, 210, 300, 400),
  end1   = c(200, 90, 240, 380, 500)
)
# Sample intervals for the second table. Note that rows a2, a3 and a4 have
# end2 < start2, i.e. their bounds are given in descending order.
dt2 <- data.table(
  ID2    = c("a1", "a2", "a3", "a4", "a5", "a6"),
  start2 = c(10, 150, 300, 310, 350, 400),
  end2   = c(50, 100, 250, 280, 390, 450)
)
我正在尝试根据它们是否具有重叠序列来合并它们。例如,期望的输出是:
# Expected result: one row per pair of overlapping intervals, carrying the
# ID and the start/end bounds from both tables.
output <- data.table(
  ID1    = c("A", "B", "D", "D", "D", "E"),
  start1 = c(100, 1, 300, 300, 300, 400),
  end1   = c(200, 90, 380, 380, 380, 500),
  ID2    = c("a2", "a1", "a3", "a4", "a5", "a6"),
  start2 = c(150, 10, 300, 310, 350, 400),
  end2   = c(100, 50, 250, 280, 390, 450)
)
我可以在 for 循环中执行此操作。例如:
# Brute-force overlap join: for every row of dt1, test every row of dt2 and
# record the ID2 values whose integer range shares at least one value with
# the dt1 range. Result: output2, one row per overlapping (ID1, ID2) pair.
n1 <- nrow(dt1)
n2 <- nrow(dt2)
ID1_list <- vector("list", n1) # preallocated output lists
ID2_list <- vector("list", n1)
for (i in seq_len(n1)) {
  vec1 <- seq(from = dt1$start1[i], to = dt1$end1[i])
  # One slot per dt2 row, all labelled with the *current* dt1 ID, so that
  # ID1_vec and ID2_vec always have the same length.
  ID1_vec <- rep(dt1$ID1[i], n2)
  ID2_vec <- rep(NA_character_, n2)
  for (j in seq_len(n2)) {
    # Use the full column names instead of relying on `$` partial matching
    # of `start` / `end`.
    vec2 <- seq(from = dt2$start2[j], to = dt2$end2[j])
    if (length(intersect(vec2, vec1)) > 0) {
      ID2_vec[j] <- dt2$ID2[j]
    }
  }
  ID1_list[[i]] <- ID1_vec
  ID2_list[[i]] <- ID2_vec
}
output2 <- data.table(ID1 = unlist(ID1_list),
                      ID2 = unlist(ID2_list))
# Rows whose ID2 is still NA are pairs that did not overlap.
output2 <- output2[complete.cases(output2), ]
# Join the interval bounds back on; `by` is explicit so the merges do not
# depend on whatever keys dt1 / dt2 happen to have.
output2 <- merge(dt1, unique(output2), by = "ID1")
output2 <- merge(output2, dt2, by = "ID2")
但是,我实际要处理的数据表非常大,这种方法太慢了。有没有人能就如何提高性能给出建议?
下面是使用 data.table::foverlaps() 的解决方案。
foverlaps() 函数在你的样本数据上会报错,因为 dt2 中的某些行 end < start。所以我对你的样本数据做了一些改动(在这些行中把 start 和 end 互换)。
library(data.table)
# foverlaps() requires start <= end in every interval, so swap the two
# bounds (by reference, i.e. dt2 itself is modified) in the rows where
# they are given in descending order.
dt2[start2 > end2, `:=`(start2 = end2, end2 = start2)]
# Key both tables on their interval columns; foverlaps() needs `y` (dt1)
# to be keyed with the start/end columns last.
setkey(dt1, start1, end1)
setkey(dt2, start2, end2)
# type = "any" keeps every pair of intervals that share at least one point.
# nomatch = NULL drops dt2 rows that overlap nothing in dt1, so only
# overlapping pairs are returned (the default would keep them as NA rows).
foverlaps(dt2, dt1, type = "any", nomatch = NULL)
#    ID1 start1 end1 ID2 start2 end2
# 1:   B      1   90  a1     10   50
# 2:   A    100  200  a2    100  150
# 3:   D    300  380  a3    250  300
# 4:   D    300  380  a4    280  310
# 5:   D    300  380  a5    350  390
# 6:   E    400  500  a6    400  450
更新
# Overlap join of dt2 against dt1. nomatch = NULL keeps only dt2 rows that
# actually overlap a dt1 interval, so every row below has a real overlap.
ans <- foverlaps(dt2, dt1, nomatch = NULL)
library(matrixStats) # NOTE(review): no longer used below; safe to drop if nothing else needs it
# The shared part of two overlapping intervals runs from the larger of the
# two starts to the smaller of the two ends. pmax()/pmin() are base R and
# work element-wise on the columns, so no matrix conversion is needed.
ans[, overlap_start := pmax(start1, start2, na.rm = TRUE)]
ans[, overlap_end := pmin(end1, end2, na.rm = TRUE)]
# + 1 because both end points of the overlap are counted.
ans[, overlap_size := overlap_end - overlap_start + 1]
#    ID1 start1 end1 ID2 start2 end2 overlap_start overlap_end overlap_size
# 1:   B      1   90  a1     10   50            10          50           41
# 2:   A    100  200  a2    100  150           100         150           51
# 3:   D    300  380  a3    250  300           300         300            1
# 4:   D    300  380  a4    280  310           300         310           11
# 5:   D    300  380  a5    350  390           350         380           31
# 6:   E    400  500  a6    400  450           400         450           51