查找点到范围的重叠
Find point-to-range overlaps
我有一个数据框 df1:
df1 <- read.table(text=" Chr06 79641
Chr06 82862
Chr06 387314
Chr06 656098
Chr06 678491
Chr06 1018696", header=FALSE, stringsAsFactors=FALSE)
我想检查 df1 中的每一行是否包含在 df2 的范围内。 df2 中的 column2 是范围的开始,column3 是范围的结束。范围之间(行之间)没有重叠。 df2 中的数据按 Column1 和 column2 排序。我为此写了一个循环,但我对此并不满意,因为如果我在 df1 中有几千行,它会运行很长时间。我想找到一种更有效的方法来完成这项工作(最好不要循环)。谢谢。
df2 数据框:
df2 <- read.table(text=" Chr05 799 870
Chr06 77914 77942
Chr06 78233 78269
Chr06 78719 78836
Chr06 79720 87043
Chr06 87223 87305
Chr06 380020 380060
Chr06 387314 387371
Chr06 654907 654988
Chr06 657929 658057
Chr06 677198 677229
Chr06 679555 680170
Chr06 1015425 1015475
Chr06 1018676 1018736
Chr06 1020564 1020592", header=FALSE, stringsAsFactors=FALSE)
我的脚本:
df1$V3 <- FALSE
for (i in 1:dim(df1)[1]) {
for (j in 1:dim(df2)[1]) {
if ((df1[i,1] == df2[j,1]) && (df1[i,2]>=df2[j,2])
&& (df1[i,2]<=df2[j,3])) {
df1[i,3]<-TRUE
break;
}
}
}
df1
预期结果显示为df1。
您可以使用 sapply
:
sapply(1:nrow(df1), function(x) any(df1[x,2] >= df2$V2 &
df1[x,2] <= df2$V3 &
df1[x, 1] == df2$V1))
[1] FALSE TRUE TRUE FALSE FALSE TRUE
#Convert to Granges objects
gr1 <- GRanges(seqnames = df1$V1,
ranges = IRanges(df1$V2, df1$V2))
gr2 <- GRanges(seqnames = df2$V1,
ranges = IRanges(df2$V2, df2$V3))
#Subset gr1
subsetByOverlaps(gr1, gr2)
# GRanges object with 3 ranges and 0 metadata columns:
# seqnames ranges strand
# <Rle> <IRanges> <Rle>
# [1] Chr06 [ 82862, 82862] *
# [2] Chr06 [ 387314, 387314] *
# [3] Chr06 [1018696, 1018696] *
# -------
# seqinfo: 1 sequence from an unspecified genome; no seqlengths
#Or we can use merge
mergeByOverlaps(gr1, gr2)
# DataFrame with 3 rows and 2 columns
# gr1 gr2
# <GRanges> <GRanges>
# 1 Chr06:*:[ 82862, 82862] Chr06:*:[ 79720, 87043]
# 2 Chr06:*:[ 387314, 387314] Chr06:*:[ 387314, 387371]
# 3 Chr06:*:[1018696, 1018696] Chr06:*:[1018676, 1018736]
此外,查看 bedtools:
Collectively, the bedtools utilities are a swiss-army knife of tools for a wide-range of genomics analysis tasks. The most widely-used tools enable genome arithmetic: that is, set theory on the genome. For example, bedtools allows one to intersect, merge, count, complement, and shuffle genomic intervals from multiple files in widely-used genomic file formats such as BAM, BED, GFF/GTF, VCF. While each individual tool is designed to do a relatively simple task (e.g., intersect two interval files), quite sophisticated analyses can be conducted by combining multiple bedtools operations on the UNIX command line.
这里有一个 data.table
解决方案作为 GenomicRanges
的替代方案:
library(data.table)
dt1 <- data.table(df1)[, V3 := V2]
dt2 <- data.table(df2, key = c("V2", "V3"))
foverlaps(dt1, dt2)[V1 == i.V1][, -c(4, 6), with = F]
# V1 V2 V3 i.V3
# 1: Chr06 79720 87043 82862
# 2: Chr06 387314 387371 387314
# 3: Chr06 1018676 1018736 1018696
我有一个数据框 df1:
df1 <- read.table(text=" Chr06 79641
Chr06 82862
Chr06 387314
Chr06 656098
Chr06 678491
Chr06 1018696", header=FALSE, stringsAsFactors=FALSE)
我想检查 df1 中的每一行是否包含在 df2 的范围内。 df2 中的 column2 是范围的开始,column3 是范围的结束。范围之间(行之间)没有重叠。 df2 中的数据按 Column1 和 column2 排序。我为此写了一个循环,但我对此并不满意,因为如果我在 df1 中有几千行,它会运行很长时间。我想找到一种更有效的方法来完成这项工作(最好不要循环)。谢谢。 df2 数据框:
df2 <- read.table(text=" Chr05 799 870
Chr06 77914 77942
Chr06 78233 78269
Chr06 78719 78836
Chr06 79720 87043
Chr06 87223 87305
Chr06 380020 380060
Chr06 387314 387371
Chr06 654907 654988
Chr06 657929 658057
Chr06 677198 677229
Chr06 679555 680170
Chr06 1015425 1015475
Chr06 1018676 1018736
Chr06 1020564 1020592", header=FALSE, stringsAsFactors=FALSE)
我的脚本:
df1$V3 <- FALSE
for (i in 1:dim(df1)[1]) {
for (j in 1:dim(df2)[1]) {
if ((df1[i,1] == df2[j,1]) && (df1[i,2]>=df2[j,2])
&& (df1[i,2]<=df2[j,3])) {
df1[i,3]<-TRUE
break;
}
}
}
df1
预期结果显示为df1。
您可以使用 sapply
:
sapply(1:nrow(df1), function(x) any(df1[x,2] >= df2$V2 &
df1[x,2] <= df2$V3 &
df1[x, 1] == df2$V1))
[1] FALSE TRUE TRUE FALSE FALSE TRUE
#Convert to Granges objects
gr1 <- GRanges(seqnames = df1$V1,
ranges = IRanges(df1$V2, df1$V2))
gr2 <- GRanges(seqnames = df2$V1,
ranges = IRanges(df2$V2, df2$V3))
#Subset gr1
subsetByOverlaps(gr1, gr2)
# GRanges object with 3 ranges and 0 metadata columns:
# seqnames ranges strand
# <Rle> <IRanges> <Rle>
# [1] Chr06 [ 82862, 82862] *
# [2] Chr06 [ 387314, 387314] *
# [3] Chr06 [1018696, 1018696] *
# -------
# seqinfo: 1 sequence from an unspecified genome; no seqlengths
#Or we can use merge
mergeByOverlaps(gr1, gr2)
# DataFrame with 3 rows and 2 columns
# gr1 gr2
# <GRanges> <GRanges>
# 1 Chr06:*:[ 82862, 82862] Chr06:*:[ 79720, 87043]
# 2 Chr06:*:[ 387314, 387314] Chr06:*:[ 387314, 387371]
# 3 Chr06:*:[1018696, 1018696] Chr06:*:[1018676, 1018736]
此外,查看 bedtools:
Collectively, the bedtools utilities are a swiss-army knife of tools for a wide-range of genomics analysis tasks. The most widely-used tools enable genome arithmetic: that is, set theory on the genome. For example, bedtools allows one to intersect, merge, count, complement, and shuffle genomic intervals from multiple files in widely-used genomic file formats such as BAM, BED, GFF/GTF, VCF. While each individual tool is designed to do a relatively simple task (e.g., intersect two interval files), quite sophisticated analyses can be conducted by combining multiple bedtools operations on the UNIX command line.
这里有一个 data.table
解决方案作为 GenomicRanges
的替代方案:
library(data.table)
dt1 <- data.table(df1)[, V3 := V2]
dt2 <- data.table(df2, key = c("V2", "V3"))
foverlaps(dt1, dt2)[V1 == i.V1][, -c(4, 6), with = F]
# V1 V2 V3 i.V3
# 1: Chr06 79720 87043 82862
# 2: Chr06 387314 387371 387314
# 3: Chr06 1018676 1018736 1018696