使用正则表达式和基于匹配部分的列名将字符串列拆分为字符串矩阵
Split column of strings into matrix of strings using regex with colnames based on part of match
我正在根据蛋白质编码序列的 FASTA 文件中的基因描述创建 tibble。这是我要处理的一些示例数据
# Example FASTA description lines: an identifier followed by "[tag=value]"
# fields. The first record deliberately has irregular spacing around "]".
seqDescriptions <- c(
  "lcl|NC_003888.3_cds_NP_624362.1_1 [locus_tag=SCO0001 ][db_xref=GeneID:1095448] [protein=hypothetical protein] [protein_id=NP_624362.1] [location=446..1123] [gbkey=CDS]",
  "lcl|NC_003888.3_cds_NP_624363.1_2 [locus_tag=SCO0002] [db_xref=GeneID:1095447] [protein=hypothetical protein] [protein_id=NP_624363.1] [location=1252..3813] [gbkey=CDS]",
  "lcl|NC_003888.3_cds_NP_624364.1_3 [locus_tag=SCO0003] [db_xref=GeneID:1095446] [protein=DNA-binding protein] [protein_id=NP_624364.1] [location=3869..6220] [gbkey=CDS]",
  "lcl|NC_003888.3_cds_NP_631871.1_4 [locus_tag=SCO0004] [db_xref=GeneID:1095445] [protein=hypothetical protein] [protein_id=NP_631871.1] [location=6226..7173] [gbkey=CDS]"
)
我想把每行开头那段连续的非空白字符提取为一列,然后提取每个标签等号右侧的信息。标签是手动定义的:
# Tags to extract from each description, in output-column order.
tagList <- c(
  "locus_tag", "db_xref", "protein",
  "protein_id", "location", "gbkey"
)
我的目标是拥有一个像这样的 tibble
# A tibble: 4 x 7
name locus_tag db_xref protein ...
<chr> <chr> <chr> <chr> ...
"lcl|NC_003888.3_cds_NP_624362.1_1" "SCO0001" "GeneID:1095448" "hypothetical protein" ...
"lcl|NC_003888.3_cds_NP_624363.1_2" "SCO0002" "GeneID:1095447" "hypothetical protein" ...
下面的代码可以工作,但我想:
- 了解如何用 `tidyr` 的方式实现;
- 让列在创建时就以 `tag` 的值命名,而不是事后再命名;
- 了解有没有能更直接完成此操作的生物信息学工具,例如无需手动定义标签。
# The leading run of non-space characters is the FASTA identifier; everything
# after the first space holds the bracketed tags. Backslashes are doubled
# because "\S" is not a valid escape in an R string literal and "\1" would be
# read as an octal character escape rather than a backreference.
fastaID <- sub("^(\\S+) .*", "\\1", seqDescriptions)
seqTags <- sub("^\\S+ (.*)", "\\1", seqDescriptions)
dBase <- tibble::tibble(fasta_ID = fastaID)
for (tag in tagList) {
  tagPattern <- paste0(".*\\[", tag, "=([^]]+).*") ## Don't need to escape ']' with '^'
  # trimws() drops blanks left inside the brackets, e.g. "[locus_tag=SCO0001 ]".
  # The new column is unnamed here; names are assigned after the loop.
  dBase <- tibble::add_column(dBase, trimws(sub(tagPattern, "\\1", seqTags)), .name_repair = "unique")
}
names(dBase) <- c("fasta_ID", tagList)
dBase
这是一种混合方法:
# add "name" for the first column
tagList <- c("name", "locus_tag", "db_xref", "protein", "protein_id", "location", "gbkey")
# One capture group for the leading identifier, then one "[...]" group per
# remaining tag. "^(\\S+)" stops at the first blank so the identifier is
# captured without a trailing space. Backslashes are doubled because the
# pattern is written as an R string literal.
roughPattern <- paste0(
  "^(\\S+)",
  strrep("\\s*(\\[[^]]+\\])", length(tagList) - 1L)
)
rough <- strcapture(
  roughPattern,
  seqDescriptions,
  proto = setNames(rep("", length(tagList)), tagList)
)
rough
# name locus_tag db_xref protein protein_id location gbkey
# 1 lcl|NC_003888.3_cds_NP_624362.1_1 [locus_tag=SCO0001 ] [db_xref=GeneID:1095448] [protein=hypothetical protein] [protein_id=NP_624362.1] [location=446..1123] [gbkey=CDS]
# 2 lcl|NC_003888.3_cds_NP_624363.1_2 [locus_tag=SCO0002] [db_xref=GeneID:1095447] [protein=hypothetical protein] [protein_id=NP_624363.1] [location=1252..3813] [gbkey=CDS]
# 3 lcl|NC_003888.3_cds_NP_624364.1_3 [locus_tag=SCO0003] [db_xref=GeneID:1095446] [protein=DNA-binding protein] [protein_id=NP_624364.1] [location=3869..6220] [gbkey=CDS]
# 4 lcl|NC_003888.3_cds_NP_631871.1_4 [locus_tag=SCO0004] [db_xref=GeneID:1095445] [protein=hypothetical protein] [protein_id=NP_631871.1] [location=6226..7173] [gbkey=CDS]
这样,我们就可以清理它了:
library(dplyr)
# Strip the leading "[tag=" and the closing "]" from every tag column, then
# trim surrounding blanks. "\\]" must be double-escaped inside an R string.
# `name` is only trimmed, since it carries no brackets.
rough %>%
  mutate(
    name = trimws(name),
    across(-name, ~ trimws(gsub("^[^=]*=|\\]$", "", .x)))
  )
# name locus_tag db_xref protein protein_id location gbkey
# 1 lcl|NC_003888.3_cds_NP_624362.1_1 SCO0001 GeneID:1095448 hypothetical protein NP_624362.1 446..1123 CDS
# 2 lcl|NC_003888.3_cds_NP_624363.1_2 SCO0002 GeneID:1095447 hypothetical protein NP_624363.1 1252..3813 CDS
# 3 lcl|NC_003888.3_cds_NP_624364.1_3 SCO0003 GeneID:1095446 DNA-binding protein NP_624364.1 3869..6220 CDS
# 4 lcl|NC_003888.3_cds_NP_631871.1_4 SCO0004 GeneID:1095445 hypothetical protein NP_631871.1 6226..7173 CDS
利用 `tidyr::extract`,我们可以这样做:
library(tidyr)

d <- data.frame(
  seqDescriptions = seqDescriptions
)
tagList <- c("locus_tag", "db_xref", "protein", "protein_id", "location", "gbkey")
# paste0() is vectorised over tagList, giving one "[tag=value]" sub-pattern per
# tag. The lazy "(.*?)" plus the surrounding "\\s*" keeps blanks next to the
# brackets out of the captured value. Backslashes are doubled because these
# are R string literals.
regex_tag <- paste0("\\[", tagList, "=\\s*(.*?)\\s*\\]")
# The first group is the identifier; tags may be separated by any whitespace.
regex <- paste(c("^(\\S+)?", regex_tag), collapse = "\\s*")
d %>%
  extract(seqDescriptions, into = c("name", tagList), regex)
#> name locus_tag db_xref
#> 1 lcl|NC_003888.3_cds_NP_624362.1_1 SCO0001 GeneID:1095448
#> 2 lcl|NC_003888.3_cds_NP_624363.1_2 SCO0002 GeneID:1095447
#> 3 lcl|NC_003888.3_cds_NP_624364.1_3 SCO0003 GeneID:1095446
#> 4 lcl|NC_003888.3_cds_NP_631871.1_4 SCO0004 GeneID:1095445
#> protein protein_id location gbkey
#> 1 hypothetical protein NP_624362.1 446..1123 CDS
#> 2 hypothetical protein NP_624363.1 1252..3813 CDS
#> 3 DNA-binding protein NP_624364.1 3869..6220 CDS
#> 4 hypothetical protein NP_631871.1 6226..7173 CDS
在不使用太多正则表达式的情况下,我们可以这样做:
library(purrr) # map_df()

# Parse the "key:value" pieces of one description as a single DCF record.
# parts[1] is the identifier; only pieces containing ':' are tag fields.
parse_tags <- function(parts) {
  con <- textConnection(grep(":", parts, value = TRUE))
  # Close the connection explicitly so long inputs do not run out of them.
  on.exit(close(con), add = TRUE)
  fields <- read.dcf(con, fields = tagList)
  # Undo the ':' <-> '=' swap inside values (e.g. "GeneID=1095448").
  cbind.data.frame(grp = parts[1], chartr("=", ":", fields))
}

seqDescriptions %>%
  # Swap ':' and '=' so tags read "key:value", while a literal ':' inside a
  # value is parked as '=' until parse_tags() restores it.
  chartr(":=", "=:", .) %>%
  # Split on the brackets (and adjacent blanks) rather than on spaces, so
  # multi-word values such as "hypothetical protein" stay in one piece.
  strsplit("\\s*[][]\\s*") %>%
  map_df(parse_tags)
#>                                 grp locus_tag        db_xref              protein  protein_id   location gbkey
#> 1 lcl|NC_003888.3_cds_NP_624362.1_1   SCO0001 GeneID:1095448 hypothetical protein NP_624362.1  446..1123   CDS
#> 2 lcl|NC_003888.3_cds_NP_624363.1_2   SCO0002 GeneID:1095447 hypothetical protein NP_624363.1 1252..3813   CDS
#> 3 lcl|NC_003888.3_cds_NP_624364.1_3   SCO0003 GeneID:1095446  DNA-binding protein NP_624364.1 3869..6220   CDS
#> 4 lcl|NC_003888.3_cds_NP_631871.1_4   SCO0004 GeneID:1095445 hypothetical protein NP_631871.1 6226..7173   CDS
我正在根据蛋白质编码序列的 FASTA 文件中的基因描述创建 tibble。这是我要处理的一些示例数据
# Example FASTA description lines: an identifier followed by "[tag=value]"
# fields. The first record deliberately has irregular spacing around "]".
seqDescriptions <- c(
  "lcl|NC_003888.3_cds_NP_624362.1_1 [locus_tag=SCO0001 ][db_xref=GeneID:1095448] [protein=hypothetical protein] [protein_id=NP_624362.1] [location=446..1123] [gbkey=CDS]",
  "lcl|NC_003888.3_cds_NP_624363.1_2 [locus_tag=SCO0002] [db_xref=GeneID:1095447] [protein=hypothetical protein] [protein_id=NP_624363.1] [location=1252..3813] [gbkey=CDS]",
  "lcl|NC_003888.3_cds_NP_624364.1_3 [locus_tag=SCO0003] [db_xref=GeneID:1095446] [protein=DNA-binding protein] [protein_id=NP_624364.1] [location=3869..6220] [gbkey=CDS]",
  "lcl|NC_003888.3_cds_NP_631871.1_4 [locus_tag=SCO0004] [db_xref=GeneID:1095445] [protein=hypothetical protein] [protein_id=NP_631871.1] [location=6226..7173] [gbkey=CDS]"
)
我想把每行开头那段连续的非空白字符提取为一列,然后提取每个标签等号右侧的信息。标签是手动定义的:
# Tags to extract from each description, in output-column order.
tagList <- c(
  "locus_tag", "db_xref", "protein",
  "protein_id", "location", "gbkey"
)
我的目标是拥有一个像这样的 tibble
# A tibble: 4 x 7
name locus_tag db_xref protein ...
<chr> <chr> <chr> <chr> ...
"lcl|NC_003888.3_cds_NP_624362.1_1" "SCO0001" "GeneID:1095448" "hypothetical protein" ...
"lcl|NC_003888.3_cds_NP_624363.1_2" "SCO0002" "GeneID:1095447" "hypothetical protein" ...
下面的代码可以工作,但我想:
- 了解如何用 `tidyr` 的方式实现;
- 让列在创建时就以 `tag` 的值命名,而不是事后再命名;
- 了解有没有能更直接完成此操作的生物信息学工具,例如无需手动定义标签。
# The leading run of non-space characters is the FASTA identifier; everything
# after the first space holds the bracketed tags. Backslashes are doubled
# because "\S" is not a valid escape in an R string literal and "\1" would be
# read as an octal character escape rather than a backreference.
fastaID <- sub("^(\\S+) .*", "\\1", seqDescriptions)
seqTags <- sub("^\\S+ (.*)", "\\1", seqDescriptions)
dBase <- tibble::tibble(fasta_ID = fastaID)
for (tag in tagList) {
  tagPattern <- paste0(".*\\[", tag, "=([^]]+).*") ## Don't need to escape ']' with '^'
  # trimws() drops blanks left inside the brackets, e.g. "[locus_tag=SCO0001 ]".
  # The new column is unnamed here; names are assigned after the loop.
  dBase <- tibble::add_column(dBase, trimws(sub(tagPattern, "\\1", seqTags)), .name_repair = "unique")
}
names(dBase) <- c("fasta_ID", tagList)
dBase
这是一种混合方法:
# add "name" for the first column
tagList <- c("name", "locus_tag", "db_xref", "protein", "protein_id", "location", "gbkey")
# One capture group for the leading identifier, then one "[...]" group per
# remaining tag. "^(\\S+)" stops at the first blank so the identifier is
# captured without a trailing space. Backslashes are doubled because the
# pattern is written as an R string literal.
roughPattern <- paste0(
  "^(\\S+)",
  strrep("\\s*(\\[[^]]+\\])", length(tagList) - 1L)
)
rough <- strcapture(
  roughPattern,
  seqDescriptions,
  proto = setNames(rep("", length(tagList)), tagList)
)
rough
# name locus_tag db_xref protein protein_id location gbkey
# 1 lcl|NC_003888.3_cds_NP_624362.1_1 [locus_tag=SCO0001 ] [db_xref=GeneID:1095448] [protein=hypothetical protein] [protein_id=NP_624362.1] [location=446..1123] [gbkey=CDS]
# 2 lcl|NC_003888.3_cds_NP_624363.1_2 [locus_tag=SCO0002] [db_xref=GeneID:1095447] [protein=hypothetical protein] [protein_id=NP_624363.1] [location=1252..3813] [gbkey=CDS]
# 3 lcl|NC_003888.3_cds_NP_624364.1_3 [locus_tag=SCO0003] [db_xref=GeneID:1095446] [protein=DNA-binding protein] [protein_id=NP_624364.1] [location=3869..6220] [gbkey=CDS]
# 4 lcl|NC_003888.3_cds_NP_631871.1_4 [locus_tag=SCO0004] [db_xref=GeneID:1095445] [protein=hypothetical protein] [protein_id=NP_631871.1] [location=6226..7173] [gbkey=CDS]
这样,我们就可以清理它了:
library(dplyr)
# Strip the leading "[tag=" and the closing "]" from every tag column, then
# trim surrounding blanks. "\\]" must be double-escaped inside an R string.
# `name` is only trimmed, since it carries no brackets.
rough %>%
  mutate(
    name = trimws(name),
    across(-name, ~ trimws(gsub("^[^=]*=|\\]$", "", .x)))
  )
# name locus_tag db_xref protein protein_id location gbkey
# 1 lcl|NC_003888.3_cds_NP_624362.1_1 SCO0001 GeneID:1095448 hypothetical protein NP_624362.1 446..1123 CDS
# 2 lcl|NC_003888.3_cds_NP_624363.1_2 SCO0002 GeneID:1095447 hypothetical protein NP_624363.1 1252..3813 CDS
# 3 lcl|NC_003888.3_cds_NP_624364.1_3 SCO0003 GeneID:1095446 DNA-binding protein NP_624364.1 3869..6220 CDS
# 4 lcl|NC_003888.3_cds_NP_631871.1_4 SCO0004 GeneID:1095445 hypothetical protein NP_631871.1 6226..7173 CDS
利用 `tidyr::extract`,我们可以这样做:
library(tidyr)

d <- data.frame(
  seqDescriptions = seqDescriptions
)
tagList <- c("locus_tag", "db_xref", "protein", "protein_id", "location", "gbkey")
# paste0() is vectorised over tagList, giving one "[tag=value]" sub-pattern per
# tag. The lazy "(.*?)" plus the surrounding "\\s*" keeps blanks next to the
# brackets out of the captured value. Backslashes are doubled because these
# are R string literals.
regex_tag <- paste0("\\[", tagList, "=\\s*(.*?)\\s*\\]")
# The first group is the identifier; tags may be separated by any whitespace.
regex <- paste(c("^(\\S+)?", regex_tag), collapse = "\\s*")
d %>%
  extract(seqDescriptions, into = c("name", tagList), regex)
#> name locus_tag db_xref
#> 1 lcl|NC_003888.3_cds_NP_624362.1_1 SCO0001 GeneID:1095448
#> 2 lcl|NC_003888.3_cds_NP_624363.1_2 SCO0002 GeneID:1095447
#> 3 lcl|NC_003888.3_cds_NP_624364.1_3 SCO0003 GeneID:1095446
#> 4 lcl|NC_003888.3_cds_NP_631871.1_4 SCO0004 GeneID:1095445
#> protein protein_id location gbkey
#> 1 hypothetical protein NP_624362.1 446..1123 CDS
#> 2 hypothetical protein NP_624363.1 1252..3813 CDS
#> 3 DNA-binding protein NP_624364.1 3869..6220 CDS
#> 4 hypothetical protein NP_631871.1 6226..7173 CDS
在不使用太多正则表达式的情况下,我们可以这样做:
library(purrr) # map_df()

# Parse the "key:value" pieces of one description as a single DCF record.
# parts[1] is the identifier; only pieces containing ':' are tag fields.
parse_tags <- function(parts) {
  con <- textConnection(grep(":", parts, value = TRUE))
  # Close the connection explicitly so long inputs do not run out of them.
  on.exit(close(con), add = TRUE)
  fields <- read.dcf(con, fields = tagList)
  # Undo the ':' <-> '=' swap inside values (e.g. "GeneID=1095448").
  cbind.data.frame(grp = parts[1], chartr("=", ":", fields))
}

seqDescriptions %>%
  # Swap ':' and '=' so tags read "key:value", while a literal ':' inside a
  # value is parked as '=' until parse_tags() restores it.
  chartr(":=", "=:", .) %>%
  # Split on the brackets (and adjacent blanks) rather than on spaces, so
  # multi-word values such as "hypothetical protein" stay in one piece.
  strsplit("\\s*[][]\\s*") %>%
  map_df(parse_tags)
#>                                 grp locus_tag        db_xref              protein  protein_id   location gbkey
#> 1 lcl|NC_003888.3_cds_NP_624362.1_1   SCO0001 GeneID:1095448 hypothetical protein NP_624362.1  446..1123   CDS
#> 2 lcl|NC_003888.3_cds_NP_624363.1_2   SCO0002 GeneID:1095447 hypothetical protein NP_624363.1 1252..3813   CDS
#> 3 lcl|NC_003888.3_cds_NP_624364.1_3   SCO0003 GeneID:1095446  DNA-binding protein NP_624364.1 3869..6220   CDS
#> 4 lcl|NC_003888.3_cds_NP_631871.1_4   SCO0004 GeneID:1095445 hypothetical protein NP_631871.1 6226..7173   CDS