快速查找线性间隔重叠的方法
Fast way to find overlaps in linear intervals
我有一个 data.frame
的线性间隔(映射的 RNA-seq 读数的基因组坐标),例如:
df <- data.frame(seqnames = c(rep("chr10",2),rep("chr5",8)),
start = c(12255935,12257004,12243635,12244009,12253879,12254395,12254506,12255142,12255229,12258719),
end = c(12257002,12258512,12243764,12244291,12254107,12254501,12254515,12255535,12255312,12258764),
read_id = c(rep("R9",2),rep("R10",8)),
stringsAsFactors = F)
对于某些读段,存在包含在同一读段的其他段中或与其他段相交的段,我想合并它们。在上面 read_id = "R10"
的示例中,区间:chr5 12255229 12255312
包含在区间 chr5 12255142 12255535
.
中
对于单个读取 data.frame
,我使用此过程:
#defining helper functions
clusterHits <- function(overlap.hits)
{
overlap.hits <- GenomicRanges::union(overlap.hits,t(overlap.hits))
query.hits <- S4Vectors::queryHits(overlap.hits)
search.hits <- S4Vectors::subjectHits(overlap.hits)
cluster.ids <- seq_len(S4Vectors::queryLength(overlap.hits))
while(TRUE){
hit <- S4Vectors::Hits(query.hits,cluster.ids[search.hits],S4Vectors::queryLength(overlap.hits),S4Vectors::subjectLength(overlap.hits))
tmp.cluster.ids <- pmin(cluster.ids,S4Vectors::selectHits(hit,"first"))
if(identical(tmp.cluster.ids,cluster.ids))
break
cluster.ids <- tmp.cluster.ids
}
unname(S4Vectors::splitAsList(seq_len(S4Vectors::queryLength(overlap.hits)),cluster.ids))
}
mergeConnectedRanges <- function(x.gr,overlap.hits)
{
cluster.ids <- clusterHits(overlap.hits)
merged.gr <- range(IRanges::extractList(x.gr,cluster.ids))
merged.gr <- unlist(merged.gr)
S4Vectors::mcols(merged.gr)$merged.idx <- cluster.ids
return(merged.gr)
}
#Now separate R10 and merge its intervals
df1 <- dplyr::filter(df, read_id == "R10")
gr <- GenomicRanges::GRanges(dplyr::select(df1,seqnames,start,end))
redundant.intervals <- GenomicRanges::findOverlaps(gr,ignore.strand=T)
query.gr <- redundant.intervals[S4Vectors::queryHits(redundant.intervals)]
subject.gr <- redundant.intervals[S4Vectors::subjectHits(redundant.intervals)]
as.data.frame(mergeConnectedRanges(x.gr=gr,overlap.hits=redundant.intervals))
给出:
seqnames start end width strand merged.idx
1 chr5 12243635 12243764 130 * 1
2 chr5 12244009 12244291 283 * 2
3 chr5 12253879 12254107 229 * 3
4 chr5 12254395 12254501 107 * 4
5 chr5 12254506 12254515 10 * 5
6 chr5 12255142 12255535 394 * 6, 7
7 chr5 12258719 12258764 46 * 8
因此 merged.idx
显示 df1
中的区间 6 和 7 已合并。
我正在寻找一种跨数千次读取的快速方法。显而易见的方法是在 df
:
中的唯一读取中使用 do.call
library(dplyr)
do.call(rbind, lapply(unique(df$read_id), function(r){
read.df <- dplyr::filter(df, read_id == r)
gr <- GenomicRanges::GRanges(dplyr::select(read.df,seqnames,start,end))
redundant.intervals <- GenomicRanges::findOverlaps(gr,ignore.strand=T)
query.gr <- redundant.intervals[S4Vectors::queryHits(redundant.intervals)]
subject.gr <- redundant.intervals[S4Vectors::subjectHits(redundant.intervals)]
as.data.frame(mergeConnectedRanges(x.gr=gr,overlap.hits=redundant.intervals)) %>%
dplyr::mutate(read_id = r)
}))
但我想知道是否有更快的方法。请注意,实际具有此类相交间隔的读取部分相对较小。
使用 Bioconductor 存储库中的 GenomicRanges
包,只需几行代码即可完成任务:
library(GenomicRanges)
makeGRangesListFromDataFrame(df, split.field = "read_id") |>
reduce(with.revmap = TRUE) |>
as.data.frame()
group group_name seqnames start end width strand revmap
1 1 R10 chr5 12243635 12243764 130 * 1
2 1 R10 chr5 12244009 12244291 283 * 2
3 1 R10 chr5 12253879 12254107 229 * 3
4 1 R10 chr5 12254395 12254501 107 * 4
5 1 R10 chr5 12254506 12254515 10 * 5
6 1 R10 chr5 12255142 12255535 394 * 6, 7
7 1 R10 chr5 12258719 12258764 46 * 8
8 2 R9 chr10 12255935 12257002 1068 * 1
9 2 R9 chr10 12257004 12258512 1509 * 2
由于 GenomeRanges
包不在 CRAN 上,请参阅插图 Installing and Managing Bioconductor Packages 或 运行
install.packages("BiocManager")
BiocManager::install("GenomicRanges")
数据
df <- data.frame(seqnames = c(rep("chr10", 2), rep("chr5", 8)),
start = c(12255935, 12257004, 12243635, 12244009, 12253879, 12254395, 12254506, 12255142, 12255229, 12258719),
end = c(12257002, 12258512, 12243764, 12244291, 12254107, 12254501, 12254515, 12255535, 12255312, 12258764),
read_id = c(rep("R9", 2), rep("R10", 8)),
stringsAsFactors = FALSE)
作为
我有一个 data.frame
的线性间隔(映射的 RNA-seq 读数的基因组坐标),例如:
df <- data.frame(seqnames = c(rep("chr10",2),rep("chr5",8)),
start = c(12255935,12257004,12243635,12244009,12253879,12254395,12254506,12255142,12255229,12258719),
end = c(12257002,12258512,12243764,12244291,12254107,12254501,12254515,12255535,12255312,12258764),
read_id = c(rep("R9",2),rep("R10",8)),
stringsAsFactors = F)
对于某些读段,存在包含在同一读段的其他段中或与其他段相交的段,我想合并它们。在上面 read_id = "R10"
的示例中,区间:chr5 12255229 12255312
包含在区间 chr5 12255142 12255535
.
对于单个读取 data.frame
,我使用此过程:
#defining helper functions
clusterHits <- function(overlap.hits)
{
overlap.hits <- GenomicRanges::union(overlap.hits,t(overlap.hits))
query.hits <- S4Vectors::queryHits(overlap.hits)
search.hits <- S4Vectors::subjectHits(overlap.hits)
cluster.ids <- seq_len(S4Vectors::queryLength(overlap.hits))
while(TRUE){
hit <- S4Vectors::Hits(query.hits,cluster.ids[search.hits],S4Vectors::queryLength(overlap.hits),S4Vectors::subjectLength(overlap.hits))
tmp.cluster.ids <- pmin(cluster.ids,S4Vectors::selectHits(hit,"first"))
if(identical(tmp.cluster.ids,cluster.ids))
break
cluster.ids <- tmp.cluster.ids
}
unname(S4Vectors::splitAsList(seq_len(S4Vectors::queryLength(overlap.hits)),cluster.ids))
}
mergeConnectedRanges <- function(x.gr,overlap.hits)
{
cluster.ids <- clusterHits(overlap.hits)
merged.gr <- range(IRanges::extractList(x.gr,cluster.ids))
merged.gr <- unlist(merged.gr)
S4Vectors::mcols(merged.gr)$merged.idx <- cluster.ids
return(merged.gr)
}
#Now separate R10 and merge its intervals
df1 <- dplyr::filter(df, read_id == "R10")
gr <- GenomicRanges::GRanges(dplyr::select(df1,seqnames,start,end))
redundant.intervals <- GenomicRanges::findOverlaps(gr,ignore.strand=T)
query.gr <- redundant.intervals[S4Vectors::queryHits(redundant.intervals)]
subject.gr <- redundant.intervals[S4Vectors::subjectHits(redundant.intervals)]
as.data.frame(mergeConnectedRanges(x.gr=gr,overlap.hits=redundant.intervals))
给出:
seqnames start end width strand merged.idx
1 chr5 12243635 12243764 130 * 1
2 chr5 12244009 12244291 283 * 2
3 chr5 12253879 12254107 229 * 3
4 chr5 12254395 12254501 107 * 4
5 chr5 12254506 12254515 10 * 5
6 chr5 12255142 12255535 394 * 6, 7
7 chr5 12258719 12258764 46 * 8
因此 merged.idx
显示 df1
中的区间 6 和 7 已合并。
我正在寻找一种跨数千次读取的快速方法。显而易见的方法是在 df
:
do.call
library(dplyr)
do.call(rbind, lapply(unique(df$read_id), function(r){
read.df <- dplyr::filter(df, read_id == r)
gr <- GenomicRanges::GRanges(dplyr::select(read.df,seqnames,start,end))
redundant.intervals <- GenomicRanges::findOverlaps(gr,ignore.strand=T)
query.gr <- redundant.intervals[S4Vectors::queryHits(redundant.intervals)]
subject.gr <- redundant.intervals[S4Vectors::subjectHits(redundant.intervals)]
as.data.frame(mergeConnectedRanges(x.gr=gr,overlap.hits=redundant.intervals)) %>%
dplyr::mutate(read_id = r)
}))
但我想知道是否有更快的方法。请注意,实际具有此类相交间隔的读取部分相对较小。
使用 Bioconductor 存储库中的 GenomicRanges
包,只需几行代码即可完成任务:
library(GenomicRanges)
makeGRangesListFromDataFrame(df, split.field = "read_id") |>
reduce(with.revmap = TRUE) |>
as.data.frame()
group group_name seqnames start end width strand revmap 1 1 R10 chr5 12243635 12243764 130 * 1 2 1 R10 chr5 12244009 12244291 283 * 2 3 1 R10 chr5 12253879 12254107 229 * 3 4 1 R10 chr5 12254395 12254501 107 * 4 5 1 R10 chr5 12254506 12254515 10 * 5 6 1 R10 chr5 12255142 12255535 394 * 6, 7 7 1 R10 chr5 12258719 12258764 46 * 8 8 2 R9 chr10 12255935 12257002 1068 * 1 9 2 R9 chr10 12257004 12258512 1509 * 2
由于 GenomeRanges
包不在 CRAN 上,请参阅插图 Installing and Managing Bioconductor Packages 或 运行
install.packages("BiocManager")
BiocManager::install("GenomicRanges")
数据
df <- data.frame(seqnames = c(rep("chr10", 2), rep("chr5", 8)),
start = c(12255935, 12257004, 12243635, 12244009, 12253879, 12254395, 12254506, 12255142, 12255229, 12258719),
end = c(12257002, 12258512, 12243764, 12244291, 12254107, 12254501, 12254515, 12255535, 12255312, 12258764),
read_id = c(rep("R9", 2), rep("R10", 8)),
stringsAsFactors = FALSE)
作为