使用正则表达式和基于匹配部分的列名将字符串列拆分为字符串矩阵

Split column of strings into matrix of strings using regex with colnames based on part of match

我正在根据蛋白质编码序列的 FASTA 文件中的基因描述创建 tibble。这是我要处理的一些示例数据

# Example FASTA description lines: a sequence identifier followed by a series
# of "[tag=value]" annotations.
seqDescriptions <- c(
  "lcl|NC_003888.3_cds_NP_624362.1_1 [locus_tag=SCO0001 ][db_xref=GeneID:1095448] [protein=hypothetical protein] [protein_id=NP_624362.1] [location=446..1123] [gbkey=CDS]",
  "lcl|NC_003888.3_cds_NP_624363.1_2 [locus_tag=SCO0002] [db_xref=GeneID:1095447] [protein=hypothetical protein] [protein_id=NP_624363.1] [location=1252..3813] [gbkey=CDS]",
  "lcl|NC_003888.3_cds_NP_624364.1_3 [locus_tag=SCO0003] [db_xref=GeneID:1095446] [protein=DNA-binding protein] [protein_id=NP_624364.1] [location=3869..6220] [gbkey=CDS]",
  "lcl|NC_003888.3_cds_NP_631871.1_4 [locus_tag=SCO0004] [db_xref=GeneID:1095445] [protein=hypothetical protein] [protein_id=NP_631871.1] [location=6226..7173] [gbkey=CDS]"
)

我想把每条描述开头的一串非空白字符提取为一列,然后提取每个标签等号右侧的信息。标签是手动定义的:

# Tag names to pull out of the "[tag=value]" annotations, in output column order.
tagList <- c("locus_tag", "db_xref", "protein", "protein_id", "location", "gbkey")

我的目标是拥有一个像这样的 tibble

# A tibble: 4 x 7
name                                  locus_tag    db_xref       protein ...
<chr>                                 <chr>         <chr>        <chr>  ...
"lcl|NC_003888.3_cds_NP_624362.1_1"  "SCO0001"  "GeneID:1095448" "hypothetical protein" ...
"lcl|NC_003888.3_cds_NP_624363.1_2"  "SCO0002"  "GeneID:1095447" "hypothetical protein" ...

下面的代码有效,但我想

  1. 了解如何用 tidyr 的方式实现。
  2. 让列在构造时使用 tag 的值命名,而不是事后命名。
  3. 了解可以更直接地执行此操作的任何生物信息学工具,例如无需手动定义标签。

.

# Split each description into its FASTA identifier (the leading run of
# non-space characters) and the remaining "[tag=value]" annotations.
# Backslashes are doubled because "\S" and "\1" are not valid escapes in an
# R string literal.
fastaID <- sub("^(\\S+) .*", "\\1", seqDescriptions)
seqTags <- sub("^\\S+ (.*)", "\\1", seqDescriptions)

# Return the value of one tag for every element of `tags`.
# Values are trimmed of surrounding whitespace; elements that do not contain
# the tag yield NA instead of the unmodified input that sub() would return.
extractTag <- function(tag, tags) {
  # A ']' placed right after '^' in a bracket expression needs no escape.
  tagPattern <- paste0(".*\\[", tag, "=([^]]+).*")
  value <- trimws(sub(tagPattern, "\\1", tags))
  value[!grepl(tagPattern, tags)] <- NA_character_
  value
}

# Build every column under its tag name up front, so the tibble is created
# with its final column names rather than being renamed afterwards.
tagValues <- lapply(tagList, extractTag, tags = seqTags)
names(tagValues) <- tagList
dBase <- tibble::as_tibble(c(list(fasta_ID = fastaID), tagValues))

dBase

这是一种混合方法:

# add "name" for the first column
tagList <- c("name", "locus_tag", "db_xref", "protein", "protein_id", "location", "gbkey")
# The pattern is one capture group for the identifier (leading non-space
# characters, so no trailing blank is captured) followed by one "[...]" group
# per remaining tag. Backslashes are doubled because "\s" and "\[" are not
# valid escapes in an R string literal.
roughPattern <- paste0(
  "^(\\S+)",
  strrep("\\s*(\\[[^]]+\\])", length(tagList) - 1L)
)
rough <- strcapture(roughPattern, seqDescriptions,
                    proto = setNames(rep("", length(tagList)), tagList))
rough
#                                 name            locus_tag                  db_xref                        protein               protein_id              location       gbkey
# 1 lcl|NC_003888.3_cds_NP_624362.1_1  [locus_tag=SCO0001 ] [db_xref=GeneID:1095448] [protein=hypothetical protein] [protein_id=NP_624362.1]  [location=446..1123] [gbkey=CDS]
# 2 lcl|NC_003888.3_cds_NP_624363.1_2   [locus_tag=SCO0002] [db_xref=GeneID:1095447] [protein=hypothetical protein] [protein_id=NP_624363.1] [location=1252..3813] [gbkey=CDS]
# 3 lcl|NC_003888.3_cds_NP_624364.1_3   [locus_tag=SCO0003] [db_xref=GeneID:1095446]  [protein=DNA-binding protein] [protein_id=NP_624364.1] [location=3869..6220] [gbkey=CDS]
# 4 lcl|NC_003888.3_cds_NP_631871.1_4   [locus_tag=SCO0004] [db_xref=GeneID:1095445] [protein=hypothetical protein] [protein_id=NP_631871.1] [location=6226..7173] [gbkey=CDS]

这样,我们就可以清理它了:

library(dplyr)
# For every column except `name`, drop the leading "[tag=" and the closing "]"
# and then trim surrounding whitespace. The bracket is written "\\]" because
# "\]" is not a valid escape in an R string literal.
rough %>%
  mutate(across(-name, function(x) trimws(gsub("^[^=]*=|\\]$", "", x))))
#                                 name locus_tag        db_xref              protein  protein_id   location gbkey
# 1 lcl|NC_003888.3_cds_NP_624362.1_1    SCO0001 GeneID:1095448 hypothetical protein NP_624362.1  446..1123   CDS
# 2 lcl|NC_003888.3_cds_NP_624363.1_2    SCO0002 GeneID:1095447 hypothetical protein NP_624363.1 1252..3813   CDS
# 3 lcl|NC_003888.3_cds_NP_624364.1_3    SCO0003 GeneID:1095446  DNA-binding protein NP_624364.1 3869..6220   CDS
# 4 lcl|NC_003888.3_cds_NP_631871.1_4    SCO0004 GeneID:1095445 hypothetical protein NP_631871.1 6226..7173   CDS

利用tidyr::extract我们可以做到:

library(tidyr)

d <- data.frame(
  seqDescriptions = seqDescriptions
)

tagList <- c("locus_tag", "db_xref", "protein", "protein_id", "location", "gbkey")
# One "[tag=value]" group per tag. paste0() is vectorised over tagList, so no
# lapply()/unlist() is needed. The lazy "(.*?)" framed by "\\s*" keeps blanks
# next to the brackets out of the captured value (e.g. "[locus_tag=SCO0001 ]").
# Backslashes are doubled because "\[" and "\s" are not valid escapes in an
# R string literal.
regex_tag <- paste0("\\[", tagList, "=\\s*(.*?)\\s*\\]")
# Optional leading identifier, then the tag groups separated by optional blanks.
regex <- paste(c("^(\\S+)?", regex_tag), collapse = "\\s*")

d %>%
  extract(seqDescriptions, into = c("name", tagList), regex)
#>                                name locus_tag        db_xref
#> 1 lcl|NC_003888.3_cds_NP_624362.1_1  SCO0001  GeneID:1095448
#> 2 lcl|NC_003888.3_cds_NP_624363.1_2   SCO0002 GeneID:1095447
#> 3 lcl|NC_003888.3_cds_NP_624364.1_3   SCO0003 GeneID:1095446
#> 4 lcl|NC_003888.3_cds_NP_631871.1_4   SCO0004 GeneID:1095445
#>                protein  protein_id   location gbkey
#> 1 hypothetical protein NP_624362.1  446..1123   CDS
#> 2 hypothetical protein NP_624363.1 1252..3813   CDS
#> 3  DNA-binding protein NP_624364.1 3869..6220   CDS
#> 4 hypothetical protein NP_631871.1 6226..7173   CDS

在不使用太多正则表达式的情况下,我们可以这样做:

library(purrr) # map_df(); also re-exports %>%

# Read "tag:value" lines as a single DCF record and return the requested
# fields as a one-row character matrix. The text connection is closed on exit
# because read.dcf() does not close a connection that was handed to it.
read_tag_record <- function(lines, fields) {
  con <- textConnection(lines)
  on.exit(close(con), add = TRUE)
  read.dcf(con, fields = fields)
}

seqDescriptions %>%
  # Swap ':' and '=' so every "[tag=value]" becomes a DCF-style "tag:value"
  # while colons inside values (e.g. "GeneID:1095448") are protected as '='.
  chartr(":=", "=:", .) %>%
  # Split on the brackets rather than on spaces, so multi-word values such as
  # "hypothetical protein" stay in one piece.
  strsplit("\\s*[][]\\s*") %>%
  map_df(function(pieces) {
    # pieces[1] is the identifier; only the pieces holding ':' are tag lines
    # (this also drops the empty strings left between adjacent brackets).
    record <- read_tag_record(grep(":", pieces, value = TRUE), tagList)
    # Undo the ':'/'=' swap inside the values.
    cbind.data.frame(grp = pieces[1], chartr("=", ":", record))
  })

# Result (protein values are no longer cut at the first space):
#                                 grp locus_tag        db_xref              protein  protein_id   location gbkey
# 1 lcl|NC_003888.3_cds_NP_624362.1_1   SCO0001 GeneID:1095448 hypothetical protein NP_624362.1  446..1123   CDS
# 2 lcl|NC_003888.3_cds_NP_624363.1_2   SCO0002 GeneID:1095447 hypothetical protein NP_624363.1 1252..3813   CDS
# 3 lcl|NC_003888.3_cds_NP_624364.1_3   SCO0003 GeneID:1095446  DNA-binding protein NP_624364.1 3869..6220   CDS
# 4 lcl|NC_003888.3_cds_NP_631871.1_4   SCO0004 GeneID:1095445 hypothetical protein NP_631871.1 6226..7173   CDS