分别获取密码子内同义和非同义核苷酸位置的范围

Get ranges for synonymous and non-synonymous nucleotide positions within a codon separately

我有GRanges对象(所有基因外显子的坐标); coding_pos 定义特定外显子中密码子的起始位置(1 表示外显子中的第一个核苷酸也是密码子中的第一个核苷酸,依此类推)。

grTargetGene 本身看起来像这样

> grTargetGene

GRanges object with 11 ranges and 7 metadata columns:
   seqnames                 ranges strand |     ensembl_ids   gene_biotype prev_exons_length coding_pos
      <Rle>              <IRanges>  <Rle> |     <character>    <character>         <numeric>  <numeric>
   [1]     chr2 [148602722, 148602776]      + | ENSG00000121989 protein_coding       0           1
   [2]     chr2 [148653870, 148654077]      + | ENSG00000121989 protein_coding       55          2
   [3]     chr2 [148657027, 148657136]      + | ENSG00000121989 protein_coding       263         3
   [4]     chr2 [148657313, 148657467]      + | ENSG00000121989 protein_coding       373         2
   [5]     chr2 [148672760, 148672903]      + | ENSG00000121989 protein_coding       528         1
   [6]     chr2 [148674852, 148674995]      + | ENSG00000121989 protein_coding       672         1
   [7]     chr2 [148676016, 148676161]      + | ENSG00000121989 protein_coding       816         1
   [8]     chr2 [148677799, 148677913]      + | ENSG00000121989 protein_coding       962         3
   [9]     chr2 [148680542, 148680680]      + | ENSG00000121989 protein_coding       1077        1
  [10]     chr2 [148683600, 148683730]      + | ENSG00000121989 protein_coding       1216        2
  [11]     chr2 [148684649, 148684843]      + | ENSG00000121989 protein_coding       1347        1
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

我有兴趣分别查看每个密码子和 [3] 中 [1,2] 位置的坐标。换句话说,我想要 2 个不同的 GRanges 对象,它们看起来大致像这样(这里只是开始)

> grTargetGene_Nonsynonym

GRanges object with X ranges and 7 metadata columns:
   seqnames                 ranges strand |     ensembl_ids   gene_biotype 
      <Rle>              <IRanges>  <Rle> |     <character>    <character> 
   [1]     chr2 [148602722, 148602723]      + | ENSG00000121989 protein_coding
   [2]     chr2 [148602725, 148602726]      + | ENSG00000121989 protein_coding
   [3]     chr2 [148602728, 148602729]      + | ENSG00000121989 protein_coding
   [4]     chr2 [148602731, 148602732]      + | ENSG00000121989 protein_coding



> grTargetGene_Synonym

GRanges object with X ranges and 7 metadata columns:
   seqnames                 ranges strand |     ensembl_ids   gene_biotype 
      <Rle>              <IRanges>  <Rle> |     <character>    <character> 
   [1]     chr2 [148602724, 148602724]      + | ENSG00000121989 protein_coding
   [2]     chr2 [148602727, 148602727]      + | ENSG00000121989 protein_coding
   [3]     chr2 [148602730, 148602730]      + | ENSG00000121989 protein_coding
   [4]     chr2 [148602733, 148602733]      + | ENSG00000121989 protein_coding

我打算通过根据 coding_posstrand 为每个外显子创建一组 granges 的循环来完成它,但我怀疑有更聪明的方法,甚至可能是函数已经可以做到了,但我找不到简单的解决方案。

重要提示:我不需要序列本身(在这种情况下,最简单的方法是先提取 DNA,然后处理序列),但我不需要这样做,我只需要我想要的位置用于与某些功能重叠。

> library("GenomicRanges")
> dput(grTargetGene)

new("GRanges"
, seqnames = new("Rle"
, values = structure(1L, .Label = "chr2", class = "factor")
, lengths = 6L
, elementMetadata = NULL
, metadata = list()
)
, ranges = new("IRanges"
, start = c(148602722L, 148653870L, 148657027L, 148657313L, 148672760L, 
148674852L)
, width = c(55L, 208L, 110L, 155L, 144L, 144L)
, NAMES = NULL
, elementType = "integer"
, elementMetadata = NULL
, metadata = list()
)
, strand = new("Rle"
, values = structure(1L, .Label = c("+", "-", "*"), class = "factor")
, lengths = 6L
, elementMetadata = NULL
, metadata = list()
)
, elementMetadata = new("DataFrame"
, rownames = NULL
, nrows = 6L
, listData = structure(list(ensembl_ids =
c("ENSG00000121989","ENSG00000121989", 
"ENSG00000121989", "ENSG00000121989", "ENSG00000121989", "ENSG00000121989"
), gene_biotype = c("protein_coding", "protein_coding", "protein_coding", 
"protein_coding", "protein_coding", "protein_coding"), cds_length =   
c(1542,1542, 1542, 1542, 1542, 1542), gene_start_position = c(148602086L, 
148602086L, 148602086L, 148602086L, 148602086L, 148602086L), 
gene_end_position = c(148688393L, 148688393L, 148688393L, 
148688393L, 148688393L, 148688393L), prev_exons_length = c(0, 
55, 263, 373, 528, 672), coding_pos = c(1, 2, 3, 2, 1, 1)), .Names =  
c("ensembl_ids", "gene_biotype", "cds_length", "gene_start_position",
"gene_end_position", 
"prev_exons_length", "coding_pos"))
, elementType = "ANY"
, elementMetadata = NULL
, metadata = list()
)
, seqinfo = new("Seqinfo"
, seqnames = "chr2"
, seqlengths = NA_integer_
, is_circular = NA
, genome = NA_character_
)
, metadata = list()
)

下面的怎么样:

grl <- lapply(list(Nonsym = c(1, 2), Sym = c(3, 3)), function(x) {
    ranges(grTargetGene) <- IRanges(
        start = start(grTargetGene) + x[1] - 1,
        end = start(grTargetGene) + x[2] - 1)
    return(grTargetGene) })
grl
#$Nonsym
#GRanges object with 6 ranges and 7 metadata columns:
#      seqnames              ranges strand |     ensembl_ids   gene_biotype
#         <Rle>           <IRanges>  <Rle> |     <character>    <character>
#  [1]     chr2 148602722-148602723      + | ENSG00000121989 protein_coding
#  [2]     chr2 148653870-148653871      + | ENSG00000121989 protein_coding
#  [3]     chr2 148657027-148657028      + | ENSG00000121989 protein_coding
#  [4]     chr2 148657313-148657314      + | ENSG00000121989 protein_coding
#  [5]     chr2 148672760-148672761      + | ENSG00000121989 protein_coding
#  [6]     chr2 148674852-148674853      + | ENSG00000121989 protein_coding
#      cds_length gene_start_position gene_end_position prev_exons_length
#       <numeric>           <integer>         <integer>         <numeric>
#  [1]       1542           148602086         148688393                 0
#  [2]       1542           148602086         148688393                55
#  [3]       1542           148602086         148688393               263
#  [4]       1542           148602086         148688393               373
#  [5]       1542           148602086         148688393               528
#  [6]       1542           148602086         148688393               672
#      coding_pos
#       <numeric>
#  [1]          1
#  [2]          2
#  [3]          3
#  [4]          2
#  [5]          1
#  [6]          1
#  -------
#  seqinfo: 1 sequence from an unspecified genome; no seqlengths
#
#$Sym
#GRanges object with 6 ranges and 7 metadata columns:
#      seqnames    ranges strand |     ensembl_ids   gene_biotype cds_length
#         <Rle> <IRanges>  <Rle> |     <character>    <character>  <numeric>
#  [1]     chr2 148602724      + | ENSG00000121989 protein_coding       1542
#  [2]     chr2 148653872      + | ENSG00000121989 protein_coding       1542
#  [3]     chr2 148657029      + | ENSG00000121989 protein_coding       1542
#  [4]     chr2 148657315      + | ENSG00000121989 protein_coding       1542
#  [5]     chr2 148672762      + | ENSG00000121989 protein_coding       1542
#  [6]     chr2 148674854      + | ENSG00000121989 protein_coding       1542
#      gene_start_position gene_end_position prev_exons_length coding_pos
#                <integer>         <integer>         <numeric>  <numeric>
#  [1]           148602086         148688393                 0          1
#  [2]           148602086         148688393                55          2
#  [3]           148602086         148688393               263          3
#  [4]           148602086         148688393               373          2
#  [5]           148602086         148688393               528          1
#  [6]           148602086         148688393               672          1
#  -------
#  seqinfo: 1 sequence from an unspecified genome; no seqlengths

grl 包含两个 GRangeslist,一个基于位置 1 和 2 的范围,另一个基于位置 3 的范围。

我创建了一个可以解释链并允许处理长度不能被 3 整除(甚至可能小于 3)的外显子的函数

CodonPosition_separation = function(grTargetGene) {
    grTargetGene = sort(grTargetGene)
    grTargetGene$prev_exons_length = c(0,width(grTargetGene)[1:length(grTargetGene)-1])
    if (length(grTargetGene) >1) {
        for (l in 2:length(grTargetGene)) {
          grTargetGene$prev_exons_length[l] = grTargetGene$prev_exons_length[l]+grTargetGene$prev_exons_length[l-1]
        }
      }
  grTargetGene$coding_pos =  grTargetGene$prev_exons_length%%3+1
  grTargetGene_N =  GRanges()
  grTargetGene_S =  GRanges()
  for (l in 1:length(grTargetGene)) {
    for (obj in c("start_nonsyn","start_syn", "end_nonsyn", "end_syn","gr_nonsyn","gr_syn")) {if(exists(obj)) {rm(obj)}}
    if (as.character(strand(grTargetGene)[1]) =="+"){
      start_ns = start(grTargetGene[l])+1-grTargetGene$coding_pos[l]
      end_ns = end(grTargetGene[l])
      if (start_ns <=end_ns) {
        start_nonsyn = seq(from = start(grTargetGene[l])+1-grTargetGene$coding_pos[l],to = end(grTargetGene[l]), by=3)
        end_nonsyn = seq(from = start(grTargetGene[l])+2-grTargetGene$coding_pos[l],to = end(grTargetGene[l]), by=3)
      }
      start_s =start(grTargetGene[l])+3-grTargetGene$coding_pos[l]
      end_s = end(grTargetGene[l])
      if (start_s <=end_s) {
        start_syn = seq(from = start(grTargetGene[l])+3-grTargetGene$coding_pos[l],to = end(grTargetGene[l]), by=3)
        end_syn = start_syn
      }

    } else {
      start_ns = end(grTargetGene[l])-1+grTargetGene$coding_pos[l]
      end_ns = start(grTargetGene[l])
      if (start_ns >=end_ns) {
        start_nonsyn = seq(from = end(grTargetGene[l])-1+grTargetGene$coding_pos[l],to = start(grTargetGene[l]), by=-3)
        end_nonsyn = seq(from = end(grTargetGene[l])-2+grTargetGene$coding_pos[l],to = start(grTargetGene[l]), by=-3)
      }
      start_s =end(grTargetGene[l])-3+grTargetGene$coding_pos[l]
      end_s = start(grTargetGene[l])
      if (start_ns >=end_ns) {
        start_syn = seq(from = end(grTargetGene[l])-3+grTargetGene$coding_pos[l],to = start(grTargetGene[l]), by=-3)
        end_syn = start_syn
      }
    }
    if (exists("start_nonsyn")) {
      length_nonsyn = length(start_nonsyn)+ length(end_nonsyn)
      gr_nonsyn = GRanges(
        seqnames = rep(seqnames(grTargetGene[l]), length_nonsyn),
        strand = rep(strand(grTargetGene[l]), length_nonsyn),
        ranges = IRanges(start = c(start_nonsyn, end_nonsyn), end = c(start_nonsyn, end_nonsyn))
      )
      gr_nonsyn = intersect(gr_nonsyn,grTargetGene[l])
      grTargetGene_N = append(grTargetGene_N, gr_nonsyn)
      } 
    if (exists("start_syn")) {
      length_syn = length(start_syn)
      gr_syn = GRanges(
        seqnames = rep(seqnames(grTargetGene[l]), length_syn),
        strand = rep(strand(grTargetGene[l]), length_syn),
        ranges = IRanges(start = start_syn, end = end_syn)
      )
      gr_syn = intersect(gr_syn,grTargetGene[l])
      grTargetGene_S = append(grTargetGene_S, gr_syn)
    }
  }
  return(list("grTargetGene_S"=grTargetGene_S,"grTargetGene_N"=grTargetGene_N))
}

效果很好:

> CodonPosition_separation(grTargetGene)
$grTargetGene_S
GRanges object with 514 ranges and 0 metadata columns:
        seqnames                 ranges strand
           <Rle>              <IRanges>  <Rle>
    [1]     chr2 [148602724, 148602724]      +
    [2]     chr2 [148602727, 148602727]      +
    [3]     chr2 [148602730, 148602730]      +
    [4]     chr2 [148602733, 148602733]      +
    [5]     chr2 [148602736, 148602736]      +
    ...      ...                    ...    ...
  [510]     chr2 [148684831, 148684831]      +
  [511]     chr2 [148684834, 148684834]      +
  [512]     chr2 [148684837, 148684837]      +
  [513]     chr2 [148684840, 148684840]      +
  [514]     chr2 [148684843, 148684843]      +
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

$grTargetGene_N
GRanges object with 517 ranges and 0 metadata columns:
        seqnames                 ranges strand
           <Rle>              <IRanges>  <Rle>
    [1]     chr2 [148602722, 148602723]      +
    [2]     chr2 [148602725, 148602726]      +
    [3]     chr2 [148602728, 148602729]      +
    [4]     chr2 [148602731, 148602732]      +
    [5]     chr2 [148602734, 148602735]      +
    ...      ...                    ...    ...
  [513]     chr2 [148684829, 148684830]      +
  [514]     chr2 [148684832, 148684833]      +
  [515]     chr2 [148684835, 148684836]      +
  [516]     chr2 [148684838, 148684839]      +
  [517]     chr2 [148684841, 148684842]      +
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths