使用 GenomicRanges 的唯一坐标

Unique coordinates using GenomicRanges

我如何通过使用 GenomicRanges 比较两个数据集来找到独特的(不重叠的基因组坐标)?

数据集 1 =

chr start         end       CNA
1   170900001   171500001   loss
1   11840001    19420001    loss
1   60300001    62700001    gain
1   25520001    25820001    gain

数据集2 =

chr  start       end        CNA
1   170940001   171500001   gain
1   60300001    62700001    gain
1   25520001    25840001    gain
1   119860001   123040001   loss
1   171500001   171580001   gain
1   79240001    84420001    gain

预期输出

     chr     start       end        CNA
     1     170940001 171500001  gain
     1    119860001  123040001    loss
     1    171500001  171580001    gain
     1    79240001   84420001     gain

试试这个:

require("GenomicRanges"        )

#data
x1 <- read.table(text="chr start         end       CNA
1   170900001   171500001   loss
1   11840001    19420001    loss
1   60300001    62700001    gain
1   25520001    25820001    gain",header=TRUE)

x2 <- read.table(text="chr  start       end        CNA
1   170940001   171500001   loss
1   60300001    62700001    gain
1   25520001    25840001    gain
1   119860001   123040001   loss
1   171500001   171580001   gain
1   79240001    84420001    gain",header=TRUE)

g1 <-  GRanges(seqnames=paste0("chr",x1$chr),
               IRanges(start=x1$start,
                       end=x1$end)
               )

g2 <-  GRanges(seqnames=paste0("chr",x2$chr),
               IRanges(start=x2$start,
                       end=x2$end)
               )

#result
setdiff(g1,g2)

#output
# GRanges object with 2 ranges and 0 metadata columns:
#       seqnames                 ranges strand
#          <Rle>              <IRanges>  <Rle>
#   [1]     chr1 [ 11840001,  19420001]      *
#   [2]     chr1 [170900001, 170940000]      *
#   -------
#   seqinfo: 1 sequence from an unspecified genome; no seqlengths

编辑: 基于 CNA 和 setdiff 的子集,然后合并 - c()。虽然,与预期输出不匹配...

#result
g1_loss <- setdiff(g1[g1@elementMetadata$CNA=="loss"],
                   g2[g2@elementMetadata$CNA=="loss"])
g1_loss@elementMetadata$CNA <- "loss"

g2_loss <- setdiff(g2[g2@elementMetadata$CNA=="loss"],
                   g1[g1@elementMetadata$CNA=="loss"])
g2_loss@elementMetadata$CNA <- "loss"

g1_gain <- setdiff(g1[g1@elementMetadata$CNA=="gain"],
                   g2[g2@elementMetadata$CNA=="gain"])
g1_gain@elementMetadata$CNA <- "gain"

g2_gain <- setdiff(g2[g2@elementMetadata$CNA=="gain"],
                   g1[g1@elementMetadata$CNA=="gain"])
g2_gain@elementMetadata$CNA <- "gain"

#merge
c(g1_gain,g2_gain,g1_loss,g1_gain)

#output
# GRanges object with 5 ranges and 1 metadata column:
#     seqnames                 ranges strand |         CNA
#        <Rle>              <IRanges>  <Rle> | <character>
# [1]     chr1 [ 25820002,  25840001]      * |        gain
# [2]     chr1 [ 79240001,  84420001]      * |        gain
# [3]     chr1 [170940001, 171580001]      * |        gain
# [4]     chr1 [ 11840001,  19420001]      * |        loss
# [5]     chr1 [170900001, 171500001]      * |        loss
# -------
#   seqinfo: 1 sequence from an unspecified genome; no seqlengths