通过子模式子集 DNAStringSet 并删除 R 中的子模式
subset a DNAStringSet by subpattern and remove subpattern in R
我只想对包含子字符串的行进行子集化,然后删除子字符串。我可以做第一部分,但我不知道如何删除子字符串
这是一个例子
library(Biostrings)
myseq <-DNAStringSet(c("CCCATGAAAGATCGGAAGAGCACACGTCTGAACCCATGAA", "CCCATGAACATAGATCC", "CCCGTACAGATCACGTG"))
names(myseq) <- letters[1:3]
myseq
A DNAStringSet instance of length 3
width seq names
[1] 40 CCCATGAAAGATCGGAAGAGCACACGTCTGAACCCATGAA a
[2] 17 CCCATGAACATAGATCC b
[3] 17 CCCGTACAGATCACGTG c
我要删除的序列是第一行的 AGATCGGAAGAGCACACGTCTGAA。
matchPattern("AGATCGGAAGAGCACACGTCTGAA", myseq[[1]])
Views on a 40-letter DNAString subject
subject: CCCATGAAAGATCGGAAGAGCACACGTCTGAACCCATGAA
views:
start end width
[1] 9 32 24 [AGATCGGAAGAGCACACGTCTGAA]
我对子集执行以下操作:
pat <- vmatchPattern("AGATCGGAAGAGCACACGTCTGAA", myseq)
myseq[ lapply(lapply(pat, isEmpty), function(x) x == FALSE) ]
A DNAStringSet instance of length 3
width seq names
[1] 40 CCCATGAAAGATCGGAAGAGCACACGTCTGAACCCATGAA a
[2] 0 b
[3] 0 c
输出应该是
A DNAStringSet instance of length 3
width seq names
[1] 11 CCCCCCATGAA a
[2] 0 b
[3] 0 c
我不熟悉bioinformatics包,但是如果你可以将数据转换成列表(我相信应该可以将列表转换成包中使用的格式),下面的方法可以使用:
1) 使用 stringr 库删除所需的模式
2) 计算新模式的长度
# load biostrings package
library(Biostrings)
# create sample dataset
myseq <-DNAStringSet(c("CCCATGAAAGATCGGAAGAGCACACGTCTGAACCCATGAA", "CCCATGAACATAGATCC", "CCCGTACAGATCACGTG"))
names(myseq) <- letters[1:3]
# remove sequences with no match
pat <- vmatchPattern("AGATCGGAAGAGCACACGTCTGAA", myseq)
data <- myseq[ lapply(lapply(pat, isEmpty), function(x) x == FALSE) ]
# load stringr library
library(stringr)
# replace the matched sequence
test <- lapply(test, str_replace, "AGATCGGAAGAGCACACGTCTGAA", "")
# put together the new sequence and its length
test <- mapply(c, lapply(test, nchar), test, SIMPLIFY = FALSE)
您可以使用 vcountPattern
计算 ifelse
语句中的匹配项,用 str_replace
和 non-matches 的输出替换匹配项为空字符串:
myseq2 <- DNAStringSet(
unlist(
lapply(
vcountPattern(
'AGATCGGAAGAGCACACGTCTGAA', myseq) > 0,
ifelse,
str_replace(
myseq,
'AGATCGGAAGAGCACACGTCTGAA',
''),
'')
)
)
names(myseq2) <- names(myseq)
myseq2
>A DNAStringSet instance of length 3
>width seq names
>[1] 16 CCCATGAACCCATGAA a
>[2] 0 b
>[3] 0 c
使用竖线表示法可读性更高:
lapply(vcountPattern('AGATCGGAAGAGCACACGTCTGAA', myseq) > 0, ifelse, str_replace(myseq, 'AGATCGGAAGAGCACACGTCTGAA', ''), '') %>%
unlist() %>%
DNAStringSet() -> myseq2
我只想对包含子字符串的行进行子集化,然后删除子字符串。我可以做第一部分,但我不知道如何删除子字符串
这是一个例子
library(Biostrings)
myseq <-DNAStringSet(c("CCCATGAAAGATCGGAAGAGCACACGTCTGAACCCATGAA", "CCCATGAACATAGATCC", "CCCGTACAGATCACGTG"))
names(myseq) <- letters[1:3]
myseq
A DNAStringSet instance of length 3
width seq names
[1] 40 CCCATGAAAGATCGGAAGAGCACACGTCTGAACCCATGAA a
[2] 17 CCCATGAACATAGATCC b
[3] 17 CCCGTACAGATCACGTG c
我要删除的序列是第一行的 AGATCGGAAGAGCACACGTCTGAA。
matchPattern("AGATCGGAAGAGCACACGTCTGAA", myseq[[1]])
Views on a 40-letter DNAString subject
subject: CCCATGAAAGATCGGAAGAGCACACGTCTGAACCCATGAA
views:
start end width
[1] 9 32 24 [AGATCGGAAGAGCACACGTCTGAA]
我对子集执行以下操作:
pat <- vmatchPattern("AGATCGGAAGAGCACACGTCTGAA", myseq)
myseq[ lapply(lapply(pat, isEmpty), function(x) x == FALSE) ]
A DNAStringSet instance of length 3
width seq names
[1] 40 CCCATGAAAGATCGGAAGAGCACACGTCTGAACCCATGAA a
[2] 0 b
[3] 0 c
输出应该是
A DNAStringSet instance of length 3
width seq names
[1] 11 CCCCCCATGAA a
[2] 0 b
[3] 0 c
我不熟悉bioinformatics包,但是如果你可以将数据转换成列表(我相信应该可以将列表转换成包中使用的格式),下面的方法可以使用:
1) 使用 stringr 库删除所需的模式 2) 计算新模式的长度
# load biostrings package
library(Biostrings)
# create sample dataset
myseq <-DNAStringSet(c("CCCATGAAAGATCGGAAGAGCACACGTCTGAACCCATGAA", "CCCATGAACATAGATCC", "CCCGTACAGATCACGTG"))
names(myseq) <- letters[1:3]
# remove sequences with no match
pat <- vmatchPattern("AGATCGGAAGAGCACACGTCTGAA", myseq)
data <- myseq[ lapply(lapply(pat, isEmpty), function(x) x == FALSE) ]
# load stringr library
library(stringr)
# replace the matched sequence
test <- lapply(test, str_replace, "AGATCGGAAGAGCACACGTCTGAA", "")
# put together the new sequence and its length
test <- mapply(c, lapply(test, nchar), test, SIMPLIFY = FALSE)
您可以使用 vcountPattern
计算 ifelse
语句中的匹配项,用 str_replace
和 non-matches 的输出替换匹配项为空字符串:
myseq2 <- DNAStringSet(
unlist(
lapply(
vcountPattern(
'AGATCGGAAGAGCACACGTCTGAA', myseq) > 0,
ifelse,
str_replace(
myseq,
'AGATCGGAAGAGCACACGTCTGAA',
''),
'')
)
)
names(myseq2) <- names(myseq)
myseq2
>A DNAStringSet instance of length 3
>width seq names
>[1] 16 CCCATGAACCCATGAA a
>[2] 0 b
>[3] 0 c
使用竖线表示法可读性更高:
lapply(vcountPattern('AGATCGGAAGAGCACACGTCTGAA', myseq) > 0, ifelse, str_replace(myseq, 'AGATCGGAAGAGCACACGTCTGAA', ''), '') %>%
unlist() %>%
DNAStringSet() -> myseq2