列出序列(DNA)的所有突变

Making a list of all mutations of a sequence (DNA)

我有一个 DNA 序列,我想找到它的所有实例,或者在 DNA 序列读数列表中找到它的任何可能突变。我正在使用 grepl 来执行此操作,因为在我使用它的实例中它比 matchPattern 更快。我使用 parLapply 将我的突变向量提供给 grepl 函数。但我感兴趣的是制作一种自动生成我的序列突变向量的简单方法。最初我输入了每个突变,但这留下了人为错误的空间,如果序列被加长,就需要输入更多的突变。此外,我当前的代码只允许 1 个突变,一些序列应该允许比其他序列更多的突变。我不是在找人为我写一个循环,而是给我一个解释任何字符串的建议。

现在,我有一种半自动化的方法来生成突变。它现在无需我全部输入即可生成向量,但仅适用于 8 个核苷酸长的序列。必须有更好的方法来为任何长度的任何核苷酸序列生成载体。

这是我的代码:

#My sequence of interest
seq1 <- "GGCGACTG"
lenseq1 <- nchar(seq1)

#A vector of the length of the sequence I wish to create all mutations of
mutsinseq1 <- rep(seq1, 5*lenseq1+4*(lenseq1-1)+1)

#The possible substitutions, insertions, and deletions to the sequence of interest
possnuc <- c("A","T","C","G","")
lenpossnuc <- length(possnuc)

#changing all elements of the vector except for the first
#the first 8 if statements are nucleotide substitutions or deletions
#the other if statements allow for inserts between nucleotides
for(i in 2:length(mutsinseq1)){
  if(i<7){
    mutsinseq1[i] <- paste(possnuc[i-1],substr(seq1,2,lenseq1),sep = "") 
  } else if(i<12){
    mutsinseq1[i] <- paste(substr(seq1,1,1),possnuc[i-6],substr(seq1,3,lenseq1),sep = "")
  } else if(i<17){
    mutsinseq1[i] <- paste(substr(seq1,1,2),possnuc[i-11],substr(seq1,4,lenseq1),sep = "")
  } else if(i<22){
    mutsinseq1[i] <- paste(substr(seq1,1,3),possnuc[i-16],substr(seq1,5,lenseq1),sep = "")
  } else if(i<27){
    mutsinseq1[i] <- paste(substr(seq1,1,4),possnuc[i-21],substr(seq1,6,lenseq1),sep = "")
  } else if(i<32){
    mutsinseq1[i] <- paste(substr(seq1,1,5),possnuc[i-26],substr(seq1,7,lenseq1),sep = "")
  } else if(i<37){
    mutsinseq1[i] <- paste(substr(seq1,1,6),possnuc[i-31],substr(seq1,8,lenseq1),sep = "")
  } else if(i<42){
    mutsinseq1[i] <- paste(substr(seq1,1,7),possnuc[i-36],sep = "")
  } else if(i<46){
    mutsinseq1[i] <- paste(substr(seq1,1,1),possnuc[i-41],substr(seq1,2,lenseq1),sep = "")
  } else if(i<50){
    mutsinseq1[i] <- paste(substr(seq1,1,2),possnuc[i-45],substr(seq1,3,lenseq1),sep = "")
  } else if(i<54){
    mutsinseq1[i] <- paste(substr(seq1,1,3),possnuc[i-49],substr(seq1,4,lenseq1),sep = "")
  } else if(i<58){
    mutsinseq1[i] <- paste(substr(seq1,1,4),possnuc[i-53],substr(seq1,5,lenseq1),sep = "")
  } else if(i<62){
    mutsinseq1[i] <- paste(substr(seq1,1,5),possnuc[i-57],substr(seq1,6,lenseq1),sep = "")
  } else if(i<66){
    mutsinseq1[i] <- paste(substr(seq1,1,6),possnuc[i-61],substr(seq1,7,lenseq1),sep = "")
  } else{
    mutsinseq1[i] <- paste(substr(seq1,1,7),possnuc[i-65],substr(seq1,8,lenseq1),sep = "")
  }
}

#getting rid of duplicate mutations
mutsinseq1 <- mutsinseq1[-which(duplicated(mutsinseq1))]

以下是我希望生成的(由我当前的代码生成):

mutsinseq1
[1] "GGCGACTG"  "AGCGACTG"  "TGCGACTG"  "CGCGACTG"  "GCGACTG"   "GACGACTG"  "GTCGACTG"  "GCCGACTG"  "GGAGACTG"  "GGTGACTG"  "GGGGACTG"  "GGGACTG"   "GGCAACTG" 
[14] "GGCTACTG"  "GGCCACTG"  "GGCACTG"   "GGCGTCTG"  "GGCGCCTG"  "GGCGGCTG"  "GGCGCTG"   "GGCGAATG"  "GGCGATTG"  "GGCGAGTG"  "GGCGATG"   "GGCGACAG"  "GGCGACCG" 
[27] "GGCGACGG"  "GGCGACG"   "GGCGACTA"  "GGCGACTT"  "GGCGACTC"  "GGCGACT"   "GAGCGACTG" "GTGCGACTG" "GCGCGACTG" "GGGCGACTG" "GGACGACTG" "GGTCGACTG" "GGCCGACTG"
[40] "GGCAGACTG" "GGCTGACTG" "GGCGGACTG" "GGCGAACTG" "GGCGTACTG" "GGCGCACTG" "GGCGATCTG" "GGCGACCTG" "GGCGAGCTG" "GGCGACATG" "GGCGACTTG" "GGCGACGTG" "GGCGACTAG"
[53] "GGCGACTCG" "GGCGACTGG"

我该如何解决这个问题?

在其他语言中,您可能会使用一系列嵌套循环来执行此操作,但在 R 中,有一些不错的组合函数。这是做你想做的事情的整体功能:

library(stringr)
library(purrr)
library(dplyr)

mutate_sequence <- function(string, num = 1, nucleotides = c("A","T","C","G","_")) {
  l_str <- str_length(string)

  choices <- cross(list(
    cols = combn(seq_len(l_str), num, simplify = F),
    muts = cross(rerun(num, nucleotides)) %>% map(unlist)
  ))

  choice_matrix <- 
    map_dfr(choices, as_tibble, .id = "rows") %>% 
    mutate(rows = as.numeric(rows))

  seq_matrix <- str_split(rep(string, max(choice_matrix$rows)), "", simplify = T)

  seq_matrix[as.matrix(choice_matrix[,1:2])] <- str_to_lower(choice_matrix$muts)
  apply(seq_matrix, 1, paste, collapse = "")
}

我使用了一些包来让事情变得更容易一些,但它们都可以翻译成基础 R。

这是示例输出:

mutate_sequence("ATCG", num = 2)
  [1] "aaCG" "aTaG" "aTCa" "AaaG" "AaCa" "ATaa" "taCG" "tTaG" "tTCa" "AtaG" "AtCa" "ATta" "caCG" "cTaG"
 [15] "cTCa" "AcaG" "AcCa" "ATca" "gaCG" "gTaG" "gTCa" "AgaG" "AgCa" "ATga" "_aCG" "_TaG" "_TCa" "A_aG"
 [29] "A_Ca" "AT_a" "atCG" "aTtG" "aTCt" "AatG" "AaCt" "ATat" "ttCG" "tTtG" "tTCt" "AttG" "AtCt" "ATtt"
 [43] "ctCG" "cTtG" "cTCt" "ActG" "AcCt" "ATct" "gtCG" "gTtG" "gTCt" "AgtG" "AgCt" "ATgt" "_tCG" "_TtG"
 [57] "_TCt" "A_tG" "A_Ct" "AT_t" "acCG" "aTcG" "aTCc" "AacG" "AaCc" "ATac" "tcCG" "tTcG" "tTCc" "AtcG"
 [71] "AtCc" "ATtc" "ccCG" "cTcG" "cTCc" "AccG" "AcCc" "ATcc" "gcCG" "gTcG" "gTCc" "AgcG" "AgCc" "ATgc"
 [85] "_cCG" "_TcG" "_TCc" "A_cG" "A_Cc" "AT_c" "agCG" "aTgG" "aTCg" "AagG" "AaCg" "ATag" "tgCG" "tTgG"
 [99] "tTCg" "AtgG" "AtCg" "ATtg" "cgCG" "cTgG" "cTCg" "AcgG" "AcCg" "ATcg" "ggCG" "gTgG" "gTCg" "AggG"
[113] "AgCg" "ATgg" "_gCG" "_TgG" "_TCg" "A_gG" "A_Cg" "AT_g" "a_CG" "aT_G" "aTC_" "Aa_G" "AaC_" "ATa_"
[127] "t_CG" "tT_G" "tTC_" "At_G" "AtC_" "ATt_" "c_CG" "cT_G" "cTC_" "Ac_G" "AcC_" "ATc_" "g_CG" "gT_G"
[141] "gTC_" "Ag_G" "AgC_" "ATg_" "__CG" "_T_G" "_TC_" "A__G" "A_C_" "AT__"

我将突变设为小写或“_”以使其在何处显而易见,但您可以轻松更改它以将它们恢复为 "normal" 序列。

所以每一行都做了一些事情:

l_str <- str_length(string)

获取字符串中的字符数。

combn(seq_len(l_str), num, simplify = F)

1) 这是沿序列(索引)的所有可能位置组合,一次取 num,用于突变数。

rerun(num, nucleotides)

2) 这会重复您的核苷酸向量 num 次,并使其成为一个列表。 cross(rerun(num, nucleotides)) 然后为您提供该列表中的每个组合作为列表,因此您将采用每个可能的核苷酸组合,并重复。 cross(rerun(num, nucleotides)) %>% map(unlist) 将列表的最深层折叠成向量。

因此,最后两个块为您提供了所有可能的位置选择,然后是所有可能的替换组合。然后我们需要这些成对的所有可能组合!

  choices <- cross(list(
    cols = combn(seq_len(l_str), num, simplify = F),
    muts = cross(rerun(num, nucleotides)) %>% map(unlist)
  ))

对于上面的输出,这意味着:

[[1]]
[[1]]$`cols`
[1] 1 2

[[1]]$muts
[1] "A" "A"


[[2]]
[[2]]$`cols`
[1] 1 2

[[2]]$muts
[1] "T" "A"
...

所以首先对于位置1/2,它给我们A/AT/A, G/A, C/A, _/A,等等。然后每个组合再次为位置1/3,然后是位置1/4,然后是2/ 3,然后2/4,然后3/4.

现在你有了这么长的列表,让我们把它变成更好的东西。首先,我们用 colsmuts 将每个元素制作成一个数据帧,然后将它们全部绑定到一个单独的数据帧中,每个元素的标识符称为 rows:

map_dfr(choices, as_tibble, .id = "rows")
# A tibble: 50 x 3
   rows   cols muts 
   <chr> <int> <chr>
 1 1         1 A    
 2 1         2 A    
 3 2         1 T    
 4 2         2 A    
 5 3         1 C    
 6 3         2 A    
 7 4         1 G    
 8 4         2 A    
 9 5         1 _    
10 5         2 A
# ... with 40 more rows

这给了我们一个长数据框。每个 rows 是一个输出字符串, cols 告诉我们字符串中的哪个位置将被替换。 muts 是将出现在这些位置的字符。为了稍后进行子集化,我们将使用 mutate(...).

rows 转换为数字
seq_matrix <- str_split(rep(string, max(choice_matrix$rows)), "", simplify = T)

现在我们使用您的原始字符串并重复多次 choice_matrix 告诉我们我们将有变异的序列。然后我们获取该向量,并沿字符边界拆分每个向量:

      [,1] [,2] [,3] [,4]
 [1,] "A"  "T"  "C"  "G" 
 [2,] "A"  "T"  "C"  "G" 
 [3,] "A"  "T"  "C"  "G" 
 [4,] "A"  "T"  "C"  "G" 
 [5,] "A"  "T"  "C"  "G" 
 [6,] "A"  "T"  "C"  "G" 
 ...

现在我们有了一个大矩阵,R 在这些大矩阵上的运算速度很快。我们本可以用矩阵运算完成所有其他步骤,但这似乎比使用这个列表组合函数要多。

seq_matrix[as.matrix(choice_matrix[,1:2])] <- str_to_lower(choice_matrix$muts)

这根据choice_matrix中的rowscols来标识每个位置。然后它将 muts 中的适当值放入其中。这也是您可以删除 str_to_lower 以防止它们变为小写的地方。您需要更改 nucleotides 的默认参数,使 "_" 变为 "".

       [,1] [,2] [,3] [,4]
  [1,] "a"  "a"  "C"  "G" 
  [2,] "a"  "T"  "a"  "G" 
  [3,] "a"  "T"  "C"  "a" 
  [4,] "A"  "a"  "a"  "G" 
  [5,] "A"  "a"  "C"  "a" 
  [6,] "A"  "T"  "a"  "a" 
  ...

所以第 1 行在位置 1 和 2 得到 "A" 和 "A"。然后第 2 行在位置 1 和 3 得到 "A" 和 "A",等等。现在我们只需要 apply 跨越每一行(这就是 apply(..., 1, ...) 中的 1 所做的)一个将每一行组合成一个字符串的函数。那将是 paste(..., collapse = "").

这将使您快速获得巨大的产出。如果你对你原来的 8 个核苷酸序列进行 3 次突变,你会得到 7000 的输出。4 次突变是 43750。而且每次都会慢得多,大约需要 5 秒才能 运行 我桌面上的 4 次突变。您可以预先计算输出长度,即 choose(l_str, num) * length(nucleotides)^num.


再次更新:

为了处理插入和删除,我们只需要字符矩阵为每个可能的插入都有一个槽。这是那个版本:

mutate_sequence <- function(string, num = 1, nucleotides = c("A","T","C","G","")) {
  if (num < 1) {return(string)}

  l_str <- str_length(string)
  l_pos <- (num + 1)*(l_str - 1) + 1

  choices <- cross(list(
    cols = combn(seq_len(l_pos), num, simplify = F),
    muts = cross(rerun(num, nucleotides)) %>% map(unlist)
  ))

  choice_matrix <- 
    map_dfr(choices, as_data_frame, .id = "rows") %>% 
    mutate(rows = as.numeric(rows))

  blanks <- character(l_pos)
  orig_pos <- (seq_len(l_str) - 1) * (num+1) + 1
  blanks[orig_pos] <- str_split(string, "", simplify = T)

  seq_matrix <- matrix(
    rep(blanks, max(choice_matrix$rows)), 
    ncol = l_pos, byrow = T
    )

  seq_matrix[as.matrix(choice_matrix[,1:2])] <- str_to_lower(choice_matrix$muts)
  sequences <- apply(seq_matrix, 1, paste, collapse = "")
  sequences[!duplicated(str_to_upper(sequences))]
}

这与上述函数的版本基本相同,但首先您创建一个空白向量,其中有足够的点用于每次插入。对于每个原始核苷酸,您需要在其后插入一个额外的点,最后一个点除外。这适用于 l_pos <- (num + 1)*(l_str - 1) + 1 个职位。 character(l_pos) 给你空白,然后你用 (seq_len(l_str) - 1) * (num+1) + 1.

处的原始核苷酸填充空白

例如,具有两个突变的 ATCG 变为 "A" "" "" "T" "" "" "C" "" "" "G"。函数的其余部分工作相同,只是将每个可能的核苷酸(或删除)放在每个可能的位置。

paste 将其全部组合在一起之前的输出如下所示:

     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] "a"  "a"  ""   "T"  ""   ""   "C"  ""   ""   "G"  
[2,] "a"  ""   "a"  "T"  ""   ""   "C"  ""   ""   "G"  
[3,] "a"  ""   ""   "a"  ""   ""   "C"  ""   ""   "G"  
[4,] "a"  ""   ""   "T"  "a"  ""   "C"  ""   ""   "G"  
[5,] "a"  ""   ""   "T"  ""   "a"  "C"  ""   ""   "G" 
...  

然后在 paste 每一行之后,我们可以用 duplicated 检查重复并排除那些。我们也可以去掉小写字母突变,改用 unique(sequences)。现在输出比以前短多了,前55 of 278:

  [1] "aaTCG"  "aaCG"   "aTaCG"  "aTaG"   "aTCaG"  "aTCa"   "AaaTCG" "AaaCG"  "AaTaCG" "AaTaG"  "AaTCaG"
 [12] "AaTCa"  "AaaG"   "AaCaG"  "AaCa"   "ATaaCG" "ATaaG"  "ATaCaG" "ATaCa"  "ATaa"   "ATCaaG" "ATCaa" 
 [23] "taTCG"  "taCG"   "tTaCG"  "tTaG"   "tTCaG"  "tTCa"   "AtaTCG" "AtTaCG" "AtTaG"  "AtTCaG" "AtTCa" 
 [34] "ATta"   "ATCtaG" "ATCta"  "caTCG"  "caCG"   "cTaCG"  "cTaG"   "cTCaG"  "cTCa"   "AcaTCG" "AcaCG" 
 [45] "AcTaCG" "AcTaG"  "AcTCaG" "AcTCa"  "AcaG"   "AcCaG"  "AcCa"   "ATcaCG" "ATcCaG" "ATcCa"  "gaTCG"
...

EDITED 完全修改 third 时间以更好地解决问题!顺便说一句,这里的关键解决方案(以三个辅助函数的形式)不需要 Biostrings 包。

据我了解,一个短的 DNA 查询序列要与大量参考 DNA 序列进行匹配。这里的转折是要在参考 DNA 序列中搜索 DNA 查询序列上以插入或删除形式出现的任意数量的变异。

Biostrings 包中的函数 vmatchPattern() 可以识别给定模式与一组参考序列中任意数量的 不匹配 的匹配。此外,vmatchPattern() 可以识别给定模式与可能的 插入或删除 (indel) 的匹配。但是,与 matchPattern() 不同的是,vmatchPattern() 不能 同时进行。

此处寻求的解决方案是生成 DNA 查询序列的生成变体,然后可以将其传递给搜索函数,例如 grepl() 或此处建议的 vmatchPattern().

此处发布的修订解决方案包括三个功能。 makeDel() 将生成具有任意数量删除的短序列的所有可能变体。伴随函数 makeIns() 将生成短序列的变体,插入指定为 symbol 中的 IUPAC 符号。 makeSub() 将使用 symbol 中 IUPAC 代码指定的碱基进行所需的替换。这种方法生成其他碱基的所有可能组合,允许字符串直接用于模式匹配函数,包括 vmatchPaterrn.

如果要使用它,这确保包 Biostrings 可用。此代码适用于 3.60 及更高版本的 R。

  if (!require("Biostrings")) {
    if (!requireNamespace("BiocManager", quietly = TRUE))
        install.packages("BiocManager")
    BiocManager::install("Biostrings")
  }
  library(Biostrings)

现在一些测试数据。将使用原始查询序列 "GGCGACTG" 因为 "query" 和 200 到 400 个核苷酸之间的 1000 个随机序列将用作参考集。

  seq1 <- "GGCGACTG"

  set.seed(1234)
  ref <- replicate(1e3,
    sample(c("A", "C", "G", "T"), sample(200:400, 1), replace = TRUE),
    simplify = FALSE)
  ref <- sapply(ref, paste, collapse = "")
  ref <- DNAStringSet(ref)

在继续解决问题之前,让我们先看看单独使用模式可以找到什么。

# how times does the pattern occur? 
  table(vcountPattern(seq1, ref)) # just 3 times
>   0   1 
> 997   3 

# how many times allowing for one mismatch?
# once in 96 sequences and twice in three sequences
  n <- vcountPattern(seq1, ref, max.mismatch = 1)
  table(n)
> n
>   0   1   2 
> 901  96   3 

# examine the matched sequences
  m <- vmatchPattern(seq1, ref, max.mismatch = 1) # find the patterns
  sel <- which.max(n) # select the first one with 2 matches
  Views(ref[[sel]], m[[sel]]) # examine the matches
>   Views on a 384-letter DNAString subject
> subject: TCGCGTCGCACTTCTGCTAACACAGC...GCCCAGTCGACTGCTGCTCGGATTGC
> views:
>     start end width
> [1]   104 111     8 [GGCGACCG]
> [2]   364 371     8 [GTCGACTG] 

这是生成变体的三个辅助函数。参数 seq 可以是字符串,例如 "GGGCGACTG" 或 DNAString 对象。参数 n 是一个整数,指定删除、插入或替换的上限。这些函数将创建具有 0、1、...、n 个删除、插入或替换的变体。如果 n 设置为 0,函数将 return 原始序列。 makeIns()makeSub() 的参数 symbol 应该是单个 IUPAC 字符,以指定将插入或替换哪些碱基。 "N" 的默认值指定了所有可能的基数("A"、"C"、"G" 和 "T")。

makeDel()combn()标识删除位置。 makeIns()makeSub() 的逻辑有点复杂,因为需要允许插入彼此相邻并且需要创建所有组合。这里我选择不是在查询序列的开头或结尾添加插入。

所有函数 return 适合在 vmatchPattern()grep() 中使用的字符向量。

要在 DNA 字符串中创建删除:

  ##
  ## makeDel - create 1:n deletions in a character string (DNA sequence)
  ##  return a character vector of all possible variants
  ##
  makeDel <- function(seq, n) {
  # accept only a single value for 'seq'
    cseq <- as.character(seq)
    cseq <- unlist(strsplit(cseq[1], ""))
    nseq <- length(cseq)

  # simple argument checks
    if (!is(n, "numeric")) stop("'n' must be an integer")
    if (n == 0) return(paste(cseq, collapse = ""))
    if (n >= nseq) stop("too many deletions for ", nseq, " letters")

  # create all possible combinations to be dropped in 'index'
    index <- lapply(seq_len(n), function(j) combn(nseq, j, simplify = FALSE))
    index <- unlist(index, recursive = FALSE)

  # drop base in each possible position and reassemble
    ans <- lapply(index, function(idx) cseq[-idx])
    ans <- sapply(ans, paste, collapse = "")
    ans <- unique(ans) # remove duplicates
    return(ans)
  }

要在 DNA 字符串中创建插入:

  ##
  ## makeIns - create 1:n insertions into DNA string (character vector)
  ##   where each insertion is one of a given IUPAC-specified symbol
  ##   return a character vector of all possible variants
  ##
  makeIns <- function(seq, n, symbol = "N") {
  # IUPAC codes for ambiguous bases
    iupac <- c(N = "ACGT", A = "A", C = "C", G = "G", T = "T", M = "AC", R = "AG",
      W = "AT", S = "CG", Y = "CT", K = "GT", V = "ACG", H = "ACT",
      D = "AGT", B = "CGT")

 # only accept single value for 'seq'
    cseq <- as.character(seq)
    cseq <- unlist(strsplit(cseq[1], ""))
    nseq <- length(cseq)

 # simple argument checks
    if (!is(n, "numeric")) stop("'n' must be an integer")
    symbol <- toupper(symbol)
    if (nchar(symbol) != 1 | !symbol %in% names(iupac))
      stop("'symbol' must be a single valid IUPAC symbol")
    if (n == 0) return(paste(cseq, collapse = ""))
    if (n >= nseq) stop("seems like too many insertions for ", nseq, " letters")

  # which bases are to be inserted?
    ACGT <- strsplit(iupac[symbol], "")[[1]]

  # create all possible combinations of positions for the insertion 
    ipos <- seq_len(nseq - 1) # insert after this position
    index <- lapply(1:n, function(i) do.call(expand.grid, rep(list(ipos), i)))
    index <- lapply(index, function(v) split(v, seq_len(nrow(v))))
    index <- unlist(index, recursive = FALSE)
    index <- lapply(index, unlist)
    index <- lapply(index, sort)

  # place the required number of insertions after each position in index
    res <- lapply(index, function(idx) {
      tally <- rle(idx)$lengths
      breaks <- diff(c(0, idx, nseq))
      NN <- Map(base::rep, symbol, tally)
      spl <- split(cseq, rep(seq_along(breaks), breaks))
      sel <- head(seq_along(spl), -1)
      spl[sel] <- Map(base::c, spl[sel], NN)
      ans <- unlist(spl)
      if (length(ACGT) > 1) { # replicate and replace with appropriate bases
        sites <- grep(symbol, ans)
        nsites <- length(sites)
        nsymbol <- length(ACGT)

        bases <- expand.grid(rep(list(ACGT), nsites), stringsAsFactors = FALSE)
        bases <- as.matrix(bases)
        nvars <- nrow(bases)

        ans <- do.call(rbind, rep(list(ans), nvars))
        ans[, sites] <- bases
        ans <- split(ans, seq_len(nvars))
        ans <- lapply(ans, paste, collapse = "")
      }
      else
        ans <- paste(ans, collapse = "")
      return(ans)
    })
    res <- unlist(res)
    res <- unique(res)
    return(res)
  }

要在 DNA 字符串中创建替换:

  ##
  ## makeSub - create an arbitrary number of substitutions in each 1:n positions
  ##   with the IUPAC bases specified by 'symbol'
  ##   return a character vector with all possible variants
  ##
  makeSub <- function(seq, n, symbol = "N")
  {
  # IUPAC codes for ambiguous bases
    iupac <- c(N = "ACGT", A = "A", C = "C", G = "G", T = "T", M = "AC", R = "AG",
      W = "AT", S = "CG", Y = "CT", K = "GT", V = "ACG", H = "ACT",
      D = "AGT", B = "CGT")

  # accept only a single value for 'seq'
    cseq <- as.character(seq)
    cseq <- unlist(strsplit(cseq[1], ""))
    nseq <- length(cseq)

  # simple argument checks
    if (!is(n, "numeric")) stop("'n' must be an integer")
    symbol <- toupper(symbol)
    if (nchar(symbol) != 1 | !symbol %in% names(iupac))
      stop("'symbol' must be a single valid IUPAC symbol")
    if (n == 0) return(paste(cseq, collapse = ""))
    if (n > nseq) stop("too many substitutions for ", nseq, " bases")

  # which bases are to be used for the substitution?
    ACGT <- strsplit(iupac[symbol], "")[[1]]

  # create all possible combinations of positions to be changed in 'index'
    index <- lapply(seq_len(n), function(j) combn(nseq, j, simplify = FALSE))
    index <- unlist(index, recursive = FALSE)

  # for each numeric vector in index, create as many variants as
  # alternative bases are needed, collect in 'ans'
    ans <- lapply(index, function(idx) {
      bases <- lapply(cseq[idx], function(v) setdiff(ACGT, v))
      bases <- bases[sapply(bases, length) > 0] # defensive 
      bases <- expand.grid(bases, stringsAsFactors = FALSE)
      bases <- as.matrix(bases)
      nvars <- nrow(bases)

      vars <- do.call(rbind, rep(list(cseq), nvars))
      vars[ ,idx] <- bases
      if (!is.null(vars))
        return(split(vars, seq_len(nvars)))
    })
    ans <- unlist(ans, recursive = FALSE)
    ans <- sapply(ans, paste, collapse = "")
    ans <- unique(ans) # remove duplicates
    return(ans)
  }

输出示例:

  makeDel(seq1, 0)
> [1] "GGCGACTG"

  makeDel(seq1, 1)
> [1] "GCGACTG" "GGGACTG" "GGCACTG" "GGCGCTG" "GGCGATG" "GGCGACG" "GGCGACT"

  makeDel(seq1, 2)
>  [1] "GCGACTG" "GGGACTG" "GGCACTG" "GGCGCTG" "GGCGATG" "GGCGACG" "GGCGACT"
>  [8] "CGACTG"  "GGACTG"  "GCACTG"  "GCGCTG"  "GCGATG"  "GCGACG"  "GCGACT" 
> [15] "GGGCTG"  "GGGATG"  "GGGACG"  "GGGACT"  "GGCCTG"  "GGCATG"  "GGCACG" 
> [22] "GGCACT"  "GGCGTG"  "GGCGCG"  "GGCGCT"  "GGCGAG"  "GGCGAT"  "GGCGAC" 

  makeIns(seq1, 1) # default form
>  [1] "GAGCGACTG" "GCGCGACTG" "GGGCGACTG" "GTGCGACTG" "GGACGACTG" "GGCCGACTG"
>  [7] "GGTCGACTG" "GGCAGACTG" "GGCGGACTG" "GGCTGACTG" "GGCGAACTG" "GGCGCACTG"
> [13] "GGCGTACTG" "GGCGACCTG" "GGCGAGCTG" "GGCGATCTG" "GGCGACATG" "GGCGACGTG"
> [19] "GGCGACTTG" "GGCGACTAG" "GGCGACTCG" "GGCGACTGG"

  makeIns(seq1, 1, symbol = "Y") # inserting only "C" or "T"
>  [1] "GCGCGACTG" "GTGCGACTG" "GGCCGACTG" "GGTCGACTG" "GGCTGACTG" "GGCGCACTG"
>  [7] "GGCGTACTG" "GGCGACCTG" "GGCGATCTG" "GGCGACTTG" "GGCGACTCG"

  makeSub("AAA", 1)
> [1] "CAA" "GAA" "TAA" "ACA" "AGA" "ATA" "AAC" "AAG" "AAT"

  makeSub("AAA", 2)
>  [1] "CAA" "GAA" "TAA" "ACA" "AGA" "ATA" "AAC" "AAG" "AAT" "CCA" "GCA" "TCA"
> [13] "CGA" "GGA" "TGA" "CTA" "GTA" "TTA" "CAC" "GAC" "TAC" "CAG" "GAG" "TAG"
> [25] "CAT" "GAT" "TAT" "ACC" "AGC" "ATC" "ACG" "AGG" "ATG" "ACT" "AGT" "ATT"

这些函数可以与 vmatchPattern() 一起使用来创建变体和提取匹配项。一种建议的方法是 首先 使用 max.mismatch = 1 找到那些不匹配的序列。 Next,使用 vmatchPattern()fixed = FALSE 查找包含删除和插入的序列,max.mismatch.

的默认值为 0

或者,辅助函数生成的显式模式可以并行传递给grep进程运行!下面显示了 vmatchPattern 的用法,但 可能有理由使用不同的工具进行分析。 请参阅有关此主题的评论。

# first, allow mismatches to the original pattern
# the result is a "ByPos_MIndex" object of length 1000
  m1 <- vmatchPattern(seq1, ref, max.mismatch = 1) # as before...
  n1 <- elementNROWS(m1) # counts the number of elements (matches)
  which(n1 > 0) # which of the 1000 ref sequence had a match with 0 or 1 mismatches?
>  [1]  14  71  73  76  79  88  90 108 126 129 138 141 150 160 163 179 180 195 200
> [20] 205 212 225 227 239 241 246 247 255 276 277 280 299 310 335 338 345 347 357
> [39] 359 369 378 383 387 390 391 404 409 410 414 418 469 472 479 488 499 509 523
> [58] 531 533 567 571 574 580 588 590 591 594 601 634 636 646 654 667 679 685 694
> [77] 696 713 717 732 734 737 749 750 761 762 783 815 853 854 857 903 929 943 959
> [96] 969 981 986 998

# Second search each of the patterns with lapply
# generates seven lists of objects, each of length 10000
  pat2 <- makeDel(seq1, 1)
  m2 <- lapply(pat2, function(pat) vmatchPattern(pat, ref))

# generates 22 lists of objects, each of length 10000
  pat3 <- makeIns(seq1, 1)
  m3 <- lapply(pat3, function(pat) vmatchPattern(pat, ref))

m2m3 中的第二个和第三个结果是 "ByPos_MIndex" 个对象的列表。下面的示例从 m2 中提取匹配项的数量,并以 str() 的缩写形式显示这些匹配项。列表中的每个值标识与相应模式至少有一个匹配的参考序列。

  n2 <- lapply(m2, elementNROWS)
  str(sapply(n2, function(n) which(n > 0)))
> List of 7
>  $ : int [1:14] 14 138 179 335 369 391 567 679 713 734 ...
>  $ : int [1:18] 138 200 240 298 310 343 510 594 598 599 ...
>  $ : int [1:15] 21 26 45 60 260 497 541 600 607 642 ...
>  $ : int [1:17] 27 54 120 121 123 132 210 242 244 257 ...
>  $ : int [1:18] 15 33 110 126 154 419 528 539 546 606 ...
>  $ : int [1:12] 24 77 79 139 525 588 601 679 770 850 ...
>  $ : int [1:15] 179 345 378 414 469 571 574 580 591 713 ...

最后一个示例通过相同的机制检查 22 个 "ByPos_MIndex" 对象 (m3) 的第三个列表。显示22个变体中,有的匹配失败,有的匹配一次,有5个匹配两次。

    n3 <- lapply(m3, elementNROWS) # extract all counts
    names(n3) <- sprintf("pat_%02d", seq_along(n3)) # for convenience
    str(lapply(n3, function(n) which(n > 0)))
> List of 22
>  $ pat_01: int 679
>  $ pat_02: int 391
>  $ pat_03: int(0) 
>  $ pat_04: int 737
>  $ pat_05: int(0) 
>  $ pat_06: int(0) 
>  $ pat_07: int 108
>  $ pat_08: int 276
>  $ pat_09: int 439
>  $ pat_10: int [1:2] 764 773
>  $ pat_11: int(0) 
>  $ pat_12: int [1:2] 22 820
>  $ pat_13: int 795
>  $ pat_14: int [1:2] 914 981
>  $ pat_15: int(0) 
>  $ pat_16: int 112
>  $ pat_17: int 884
>  $ pat_18: int(0) 
>  $ pat_19: int [1:2] 345 378
>  $ pat_20: int [1:2] 571 854
>  $ pat_21: int 574
>  $ pat_22: int(0) 

不用说,为了提取序列信息,仍有大量数据争论。这可以使用 matchPattern 的帮助页面进行编码,并对 help("lowlevel-matching", package = "Biostrings").

中描述的模式匹配逻辑有一定的了解。

尽管 Biostrings 中的例程使用非常快速且非常节省内存的算法来处理大型序列。 Joe 似乎发现原始搜索在其他情况下更快。总有更多需要学习!