使用 GenomicRanges 的唯一坐标
Unique coordinates using GenomicRanges
我如何通过使用 GenomicRanges 比较两个数据集来找到独特的(不重叠的基因组坐标)?
数据集 1 =
chr start end CNA
1 170900001 171500001 loss
1 11840001 19420001 loss
1 60300001 62700001 gain
1 25520001 25820001 gain
数据集2 =
chr start end CNA
1 170940001 171500001 gain
1 60300001 62700001 gain
1 25520001 25840001 gain
1 119860001 123040001 loss
1 171500001 171580001 gain
1 79240001 84420001 gain
预期输出
chr start end CNA
1 170940001 171500001 gain
1 119860001 123040001 loss
1 171500001 171580001 gain
1 79240001 84420001 gain
试试这个:
require("GenomicRanges" )
#data
x1 <- read.table(text="chr start end CNA
1 170900001 171500001 loss
1 11840001 19420001 loss
1 60300001 62700001 gain
1 25520001 25820001 gain",header=TRUE)
x2 <- read.table(text="chr start end CNA
1 170940001 171500001 loss
1 60300001 62700001 gain
1 25520001 25840001 gain
1 119860001 123040001 loss
1 171500001 171580001 gain
1 79240001 84420001 gain",header=TRUE)
g1 <- GRanges(seqnames=paste0("chr",x1$chr),
IRanges(start=x1$start,
end=x1$end)
)
g2 <- GRanges(seqnames=paste0("chr",x2$chr),
IRanges(start=x2$start,
end=x2$end)
)
#result
setdiff(g1,g2)
#output
# GRanges object with 2 ranges and 0 metadata columns:
# seqnames ranges strand
# <Rle> <IRanges> <Rle>
# [1] chr1 [ 11840001, 19420001] *
# [2] chr1 [170900001, 170940000] *
# -------
# seqinfo: 1 sequence from an unspecified genome; no seqlengths
编辑:
基于 CNA 和 setdiff 的子集,然后合并 - c()
。虽然,与预期输出不匹配...
#result
g1_loss <- setdiff(g1[g1@elementMetadata$CNA=="loss"],
g2[g2@elementMetadata$CNA=="loss"])
g1_loss@elementMetadata$CNA <- "loss"
g2_loss <- setdiff(g2[g2@elementMetadata$CNA=="loss"],
g1[g1@elementMetadata$CNA=="loss"])
g2_loss@elementMetadata$CNA <- "loss"
g1_gain <- setdiff(g1[g1@elementMetadata$CNA=="gain"],
g2[g2@elementMetadata$CNA=="gain"])
g1_gain@elementMetadata$CNA <- "gain"
g2_gain <- setdiff(g2[g2@elementMetadata$CNA=="gain"],
g1[g1@elementMetadata$CNA=="gain"])
g2_gain@elementMetadata$CNA <- "gain"
#merge
c(g1_gain,g2_gain,g1_loss,g1_gain)
#output
# GRanges object with 5 ranges and 1 metadata column:
# seqnames ranges strand | CNA
# <Rle> <IRanges> <Rle> | <character>
# [1] chr1 [ 25820002, 25840001] * | gain
# [2] chr1 [ 79240001, 84420001] * | gain
# [3] chr1 [170940001, 171580001] * | gain
# [4] chr1 [ 11840001, 19420001] * | loss
# [5] chr1 [170900001, 171500001] * | loss
# -------
# seqinfo: 1 sequence from an unspecified genome; no seqlengths
我如何通过使用 GenomicRanges 比较两个数据集来找到独特的(不重叠的基因组坐标)?
数据集 1 =
chr start end CNA
1 170900001 171500001 loss
1 11840001 19420001 loss
1 60300001 62700001 gain
1 25520001 25820001 gain
数据集2 =
chr start end CNA
1 170940001 171500001 gain
1 60300001 62700001 gain
1 25520001 25840001 gain
1 119860001 123040001 loss
1 171500001 171580001 gain
1 79240001 84420001 gain
预期输出
chr start end CNA
1 170940001 171500001 gain
1 119860001 123040001 loss
1 171500001 171580001 gain
1 79240001 84420001 gain
试试这个:
require("GenomicRanges" )
#data
x1 <- read.table(text="chr start end CNA
1 170900001 171500001 loss
1 11840001 19420001 loss
1 60300001 62700001 gain
1 25520001 25820001 gain",header=TRUE)
x2 <- read.table(text="chr start end CNA
1 170940001 171500001 loss
1 60300001 62700001 gain
1 25520001 25840001 gain
1 119860001 123040001 loss
1 171500001 171580001 gain
1 79240001 84420001 gain",header=TRUE)
g1 <- GRanges(seqnames=paste0("chr",x1$chr),
IRanges(start=x1$start,
end=x1$end)
)
g2 <- GRanges(seqnames=paste0("chr",x2$chr),
IRanges(start=x2$start,
end=x2$end)
)
#result
setdiff(g1,g2)
#output
# GRanges object with 2 ranges and 0 metadata columns:
# seqnames ranges strand
# <Rle> <IRanges> <Rle>
# [1] chr1 [ 11840001, 19420001] *
# [2] chr1 [170900001, 170940000] *
# -------
# seqinfo: 1 sequence from an unspecified genome; no seqlengths
编辑:
基于 CNA 和 setdiff 的子集,然后合并 - c()
。虽然,与预期输出不匹配...
#result
g1_loss <- setdiff(g1[g1@elementMetadata$CNA=="loss"],
g2[g2@elementMetadata$CNA=="loss"])
g1_loss@elementMetadata$CNA <- "loss"
g2_loss <- setdiff(g2[g2@elementMetadata$CNA=="loss"],
g1[g1@elementMetadata$CNA=="loss"])
g2_loss@elementMetadata$CNA <- "loss"
g1_gain <- setdiff(g1[g1@elementMetadata$CNA=="gain"],
g2[g2@elementMetadata$CNA=="gain"])
g1_gain@elementMetadata$CNA <- "gain"
g2_gain <- setdiff(g2[g2@elementMetadata$CNA=="gain"],
g1[g1@elementMetadata$CNA=="gain"])
g2_gain@elementMetadata$CNA <- "gain"
#merge
c(g1_gain,g2_gain,g1_loss,g1_gain)
#output
# GRanges object with 5 ranges and 1 metadata column:
# seqnames ranges strand | CNA
# <Rle> <IRanges> <Rle> | <character>
# [1] chr1 [ 25820002, 25840001] * | gain
# [2] chr1 [ 79240001, 84420001] * | gain
# [3] chr1 [170940001, 171580001] * | gain
# [4] chr1 [ 11840001, 19420001] * | loss
# [5] chr1 [170900001, 171500001] * | loss
# -------
# seqinfo: 1 sequence from an unspecified genome; no seqlengths