如何将 data.frame 对象强制转换为 R 中的基因组范围对象?
How to coerce data.frame objects to Genomic Ranges objects in R?
我在 R 中有几个床文件作为 data.frame 个对象。现在我想按元素查找至少两个床文件之间的重叠。
为了澄清我的问题,我需要在第一床文件(已经在数据框对象中)中逐行进行测试,因此只将数据框的一行作为查询,然后将其交给区间间隔树持有第二个床文件的树(但需要先强制 GRanges 对象)。
我的数据看起来像:
idx chrom start End name score p-value
1 chr1 32727 32817 MACS_peak_1 8.69 1.150748e-11
2 chr1 52489 52552 MACS_peak_2 4.26 2.347418e-11
3 chr1 65527 65590 MACS_peak_3 4.19 2.386635e-11
4 chr1 65773 65904 MACS_peak_4 2.02 4.950495e-11
5 chr1 66001 66117 MACS_peak_5 5.66 1.766784e-11
6 chr1 115700 115769 MACS_peak_6 10.30 9.708738e-12
7 chr1 136389 136452 MACS_peak_7 4.26 2.347418e-11
8 chr1 235352 235415 MACS_peak_8 4.26 2.347418e-11
9 chr1 235636 235700 MACS_peak_9 5.66 1.766784e-11
10 chr1 432895 432958 MACS_peak_10 4.26 2.347418e-11
f1 <- function(bed.1, bed.2){
query<- GRanges()
subject = bed.2
for(i in 1: length(bed.1)){
query<-bed.1[i]
o <- GenomicRanges::findOverlaps(query, subject, minoverlap = 2L, algorithm="intervaltree")
hitfrom_<-query[queryHits(o)]
hitTo_<-subject[subjectHits(o)]
pint <-pintersect(hitfrom_, hitTo_)
return(pint)
}
}
这是我的代码,如何迭代 bed.1 中的 GRanges 对象集并调用 findOverlap() 函数来查找重叠的 GRanges 在哪里。这段代码没有给我想要的结果。有人帮帮我吗??谢谢
我猜你不需要逐行操作。
text1 <- "idx chrom start End name score p-value
1 chr1 32727 32817 MACS_peak_1 8.69 1.150748e-11
2 chr1 52489 52552 MACS_peak_2 4.26 2.347418e-11
3 chr1 65527 65590 MACS_peak_3 4.19 2.386635e-11
4 chr1 65773 65904 MACS_peak_4 2.02 4.950495e-11
5 chr1 66001 66117 MACS_peak_5 5.66 1.766784e-11
6 chr1 115700 115769 MACS_peak_6 10.30 9.708738e-12
7 chr1 136389 136452 MACS_peak_7 4.26 2.347418e-11
8 chr1 235352 235415 MACS_peak_8 4.26 2.347418e-11
9 chr1 235636 235700 MACS_peak_9 5.66 1.766784e-11
10 chr1 432895 432958 MACS_peak_10 4.26 2.347418e-11"
bed1 <- read.table(text=text1, head=T, as.is=T)
library(GenomicRanges)
bed1.gr <- GRanges(bed1$chrom, IRanges(bed1$start, bed1$End))
bed2 <- data.frame(chr=c("chr1", "chr1"),
start=c(30000, 130000),
end=c(60000, 200000), stringsAsFactors = FALSE)
bed2.gr <- GRanges(bed2$chr, IRanges(bed2$start, bed2$end))
op <- findOverlaps(bed1.gr, bed2.gr)
op.df <- data.frame(que=queryHits(op), sub=subjectHits(op),
stringsAsFactors = FALSE)
bed1$que <- 1:nrow(bed1)
bed2$sub <- 1:nrow(bed2)
bed.n <- merge(bed1, op.df, by="que", all=T)
bed.n <- merge(bed.n, bed2, by="sub", all=T)
bed.n$que <- NULL
bed.n$sub <- NULL
bed.n
# idx chrom start.x End name score p.value chr start.y end
# 1 1 chr1 32727 32817 MACS_peak_1 8.69 1.150748e-11 chr1 30000 6e+04
# 2 2 chr1 52489 52552 MACS_peak_2 4.26 2.347418e-11 chr1 30000 6e+04
# 3 7 chr1 136389 136452 MACS_peak_7 4.26 2.347418e-11 chr1 130000 2e+05
# 4 5 chr1 66001 66117 MACS_peak_5 5.66 1.766784e-11 <NA> NA NA
# 5 6 chr1 115700 115769 MACS_peak_6 10.30 9.708738e-12 <NA> NA NA
# 6 3 chr1 65527 65590 MACS_peak_3 4.19 2.386635e-11 <NA> NA NA
# 7 4 chr1 65773 65904 MACS_peak_4 2.02 4.950495e-11 <NA> NA NA
# 8 9 chr1 235636 235700 MACS_peak_9 5.66 1.766784e-11 <NA> NA NA
# 9 10 chr1 432895 432958 MACS_peak_10 4.26 2.347418e-11 <NA> NA NA
# 10 8 chr1 235352 235415 MACS_peak_8 4.26 2.347418e-11 <NA> NA NA
有一个名为 makeGRangesFromDataFrame
的函数就是为了这个目的:
makeGRangesFromDataFrame(df,
keep.extra.columns=FALSE,
ignore.strand=FALSE,
seqinfo=NULL,
seqnames.field=c("seqnames", "seqname",
"chromosome", "chrom",
"chr", "chromosome_name",
"seqid"),
start.field="start",
end.field=c("end", "stop"),
strand.field="strand",
starts.in.df.are.0based=FALSE)
如果您的起始列不是上面的默认名称,则必须使用 start.field="Start"
或您的列的任何名称来调用它。 end.field
、strand.field
等
相同
让我们考虑以下可重现的示例:
a <- GRanges(
seqnames=Rle(c("chr1", "chr2", "chr3", "chr4"), c(3, 2, 1, 2)),
ranges=IRanges(seq(1, by=9, len=8), seq(7, by=9, len=8)),
rangeName=letters[seq(1:8)], score=sample(1:20, 8, replace = FALSE))
b <- GRanges(
seqnames=Rle(c("chr1", "chr2", "chr3","chr4"), c(4, 3, 1, 1)),
ranges=IRanges(seq(2, by=5, len=9), seq(4, by=5, len=9)),
rangeName=letters[seq(1:9)], score=sample(1:20, 9, replace = FALSE))
然后,两个 GRanges 对象按元素重叠:
ov <- as(findOverlaps(a,b), "List")
ov
returns 将命中索引向量重叠为 compressedIntegerlist 对象。
假设 bed.1
和 bed.2
都是数据帧,那么这将有效:
library(GenomicRanges)
colnames(bed.1)[c(2,4)] <- c("seqnames", "end") #GRanges insists on these column names
colnames(bed.2) <- colnames(bed.1)
gr1 <- GRanges(bed.1) #will work with the right column names
gr2 <- GRanges(bed.2)
#if you want the set of ranges overlapped by at least one range from each:
int <- intersect(gr1, gr2) #GRanges output
df.int = as.data.frame(int) #data frame of new ranges
另一种方法是直接从床文件本身开始,使用rtracklayer的导入功能将它们直接导入到一个GRanges对象中:
source("https://bioconductor.org/biocLite.R")
biocLite("rtracklayer")
library(rtracklayer)
import("example.bed", format="bed")
这为您提供了 GRanges 对象,而无需通过中间数据框。
我在 R 中有几个床文件作为 data.frame 个对象。现在我想按元素查找至少两个床文件之间的重叠。
为了澄清我的问题,我需要在第一床文件(已经在数据框对象中)中逐行进行测试,因此只将数据框的一行作为查询,然后将其交给区间间隔树持有第二个床文件的树(但需要先强制 GRanges 对象)。
我的数据看起来像:
idx chrom start End name score p-value
1 chr1 32727 32817 MACS_peak_1 8.69 1.150748e-11
2 chr1 52489 52552 MACS_peak_2 4.26 2.347418e-11
3 chr1 65527 65590 MACS_peak_3 4.19 2.386635e-11
4 chr1 65773 65904 MACS_peak_4 2.02 4.950495e-11
5 chr1 66001 66117 MACS_peak_5 5.66 1.766784e-11
6 chr1 115700 115769 MACS_peak_6 10.30 9.708738e-12
7 chr1 136389 136452 MACS_peak_7 4.26 2.347418e-11
8 chr1 235352 235415 MACS_peak_8 4.26 2.347418e-11
9 chr1 235636 235700 MACS_peak_9 5.66 1.766784e-11
10 chr1 432895 432958 MACS_peak_10 4.26 2.347418e-11
f1 <- function(bed.1, bed.2){
query<- GRanges()
subject = bed.2
for(i in 1: length(bed.1)){
query<-bed.1[i]
o <- GenomicRanges::findOverlaps(query, subject, minoverlap = 2L, algorithm="intervaltree")
hitfrom_<-query[queryHits(o)]
hitTo_<-subject[subjectHits(o)]
pint <-pintersect(hitfrom_, hitTo_)
return(pint)
}
}
这是我的代码,如何迭代 bed.1 中的 GRanges 对象集并调用 findOverlap() 函数来查找重叠的 GRanges 在哪里。这段代码没有给我想要的结果。有人帮帮我吗??谢谢
我猜你不需要逐行操作。
text1 <- "idx chrom start End name score p-value
1 chr1 32727 32817 MACS_peak_1 8.69 1.150748e-11
2 chr1 52489 52552 MACS_peak_2 4.26 2.347418e-11
3 chr1 65527 65590 MACS_peak_3 4.19 2.386635e-11
4 chr1 65773 65904 MACS_peak_4 2.02 4.950495e-11
5 chr1 66001 66117 MACS_peak_5 5.66 1.766784e-11
6 chr1 115700 115769 MACS_peak_6 10.30 9.708738e-12
7 chr1 136389 136452 MACS_peak_7 4.26 2.347418e-11
8 chr1 235352 235415 MACS_peak_8 4.26 2.347418e-11
9 chr1 235636 235700 MACS_peak_9 5.66 1.766784e-11
10 chr1 432895 432958 MACS_peak_10 4.26 2.347418e-11"
bed1 <- read.table(text=text1, head=T, as.is=T)
library(GenomicRanges)
bed1.gr <- GRanges(bed1$chrom, IRanges(bed1$start, bed1$End))
bed2 <- data.frame(chr=c("chr1", "chr1"),
start=c(30000, 130000),
end=c(60000, 200000), stringsAsFactors = FALSE)
bed2.gr <- GRanges(bed2$chr, IRanges(bed2$start, bed2$end))
op <- findOverlaps(bed1.gr, bed2.gr)
op.df <- data.frame(que=queryHits(op), sub=subjectHits(op),
stringsAsFactors = FALSE)
bed1$que <- 1:nrow(bed1)
bed2$sub <- 1:nrow(bed2)
bed.n <- merge(bed1, op.df, by="que", all=T)
bed.n <- merge(bed.n, bed2, by="sub", all=T)
bed.n$que <- NULL
bed.n$sub <- NULL
bed.n
# idx chrom start.x End name score p.value chr start.y end
# 1 1 chr1 32727 32817 MACS_peak_1 8.69 1.150748e-11 chr1 30000 6e+04
# 2 2 chr1 52489 52552 MACS_peak_2 4.26 2.347418e-11 chr1 30000 6e+04
# 3 7 chr1 136389 136452 MACS_peak_7 4.26 2.347418e-11 chr1 130000 2e+05
# 4 5 chr1 66001 66117 MACS_peak_5 5.66 1.766784e-11 <NA> NA NA
# 5 6 chr1 115700 115769 MACS_peak_6 10.30 9.708738e-12 <NA> NA NA
# 6 3 chr1 65527 65590 MACS_peak_3 4.19 2.386635e-11 <NA> NA NA
# 7 4 chr1 65773 65904 MACS_peak_4 2.02 4.950495e-11 <NA> NA NA
# 8 9 chr1 235636 235700 MACS_peak_9 5.66 1.766784e-11 <NA> NA NA
# 9 10 chr1 432895 432958 MACS_peak_10 4.26 2.347418e-11 <NA> NA NA
# 10 8 chr1 235352 235415 MACS_peak_8 4.26 2.347418e-11 <NA> NA NA
有一个名为 makeGRangesFromDataFrame
的函数就是为了这个目的:
makeGRangesFromDataFrame(df,
keep.extra.columns=FALSE,
ignore.strand=FALSE,
seqinfo=NULL,
seqnames.field=c("seqnames", "seqname",
"chromosome", "chrom",
"chr", "chromosome_name",
"seqid"),
start.field="start",
end.field=c("end", "stop"),
strand.field="strand",
starts.in.df.are.0based=FALSE)
如果您的起始列不是上面的默认名称,则必须使用 start.field="Start"
或您的列的任何名称来调用它。 end.field
、strand.field
等
让我们考虑以下可重现的示例:
a <- GRanges(
seqnames=Rle(c("chr1", "chr2", "chr3", "chr4"), c(3, 2, 1, 2)),
ranges=IRanges(seq(1, by=9, len=8), seq(7, by=9, len=8)),
rangeName=letters[seq(1:8)], score=sample(1:20, 8, replace = FALSE))
b <- GRanges(
seqnames=Rle(c("chr1", "chr2", "chr3","chr4"), c(4, 3, 1, 1)),
ranges=IRanges(seq(2, by=5, len=9), seq(4, by=5, len=9)),
rangeName=letters[seq(1:9)], score=sample(1:20, 9, replace = FALSE))
然后,两个 GRanges 对象按元素重叠:
ov <- as(findOverlaps(a,b), "List")
ov
returns 将命中索引向量重叠为 compressedIntegerlist 对象。
假设 bed.1
和 bed.2
都是数据帧,那么这将有效:
library(GenomicRanges)
colnames(bed.1)[c(2,4)] <- c("seqnames", "end") #GRanges insists on these column names
colnames(bed.2) <- colnames(bed.1)
gr1 <- GRanges(bed.1) #will work with the right column names
gr2 <- GRanges(bed.2)
#if you want the set of ranges overlapped by at least one range from each:
int <- intersect(gr1, gr2) #GRanges output
df.int = as.data.frame(int) #data frame of new ranges
另一种方法是直接从床文件本身开始,使用rtracklayer的导入功能将它们直接导入到一个GRanges对象中:
source("https://bioconductor.org/biocLite.R")
biocLite("rtracklayer")
library(rtracklayer)
import("example.bed", format="bed")
这为您提供了 GRanges 对象,而无需通过中间数据框。