如何迭代函数中的条目以创建两个新的字符向量
How to iterate entries in a function to create two new character vectors
我正在努力将单个字符串输入分成一系列输入。用户给出了 FASTA 格式的序列列表(参见下面的示例)。我能够将输入分开
例如:
">Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
.>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC
"
[1] "Rosalind_6404CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG"
[2] "Rosalind_5959CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC"
但我正在努力寻找一种方法来创建一个函数,将“Rosalind_6404”从基因序列拆分为未知数量的 FASTA 序列,同时为拆分元素创建新的向量。
最终,结果将类似于:
.> "Rosalind_6404" "Rosalind5959"
.> "CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG","CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC"
我希望 convert_entries 函数允许我遍历 prepped_s 字符向量的所有元素,并将元素拆分为两个具有相同索引号的新向量。
s <- ">Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC"
split_s <- strsplit(s, ">")
ul_split_s<- unlist(split_s)
fixed_s <- gsub("\n","", ul_split_s)
prepped_s <- fixed_s[-1]
prepped_s
nchar(prepped_s[2])
print(prepped_s[2])
entry_tags <- list()
entry_seqs <- list()
entries <- length(prepped_s)
unlist(entries)
first <- prepped_s[1]
convert_entries <- function() {
for (i in entries) {
tag <- substr(prepped_s[i], start = 1, stop = 13)
entry_tags <- append(entry_tags, tag)
return(entry_tags)
}
}
entry_tags <- convert_entries()
print(entry_tags)
请尽一切可能帮助,谢谢!
一个选项 tidyverse
library(dplyr)
library(tidyr)
library(stringr)
tibble(col1 = s) %>%
separate_rows(col1, sep="\n") %>%
group_by(grp = cumsum(str_detect(col1, '^>'))) %>%
summarise(prefix = first(col1),
col1 = str_c(col1[-1], collapse=""), .groups = 'drop') %>%
select(-grp)
-输出
# A tibble: 2 x 2
prefix col1
<chr> <chr>
1 >Rosalind_6404 CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG
2 >Rosalind_5959 CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC
在基础 R 中你可以这样做:
t(gsub('\n', '', regmatches(s, gregexec("([A-Z][a-z_0-9]+)\n([A-Z\n]+)", s))[[1]][-1,]))
[,1] [,2]
[1,] "Rosalind_6404" "CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG"
[2,] "Rosalind_5959" "CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC"
注意:我转置了矩阵,以便您可以查看结果。忽略t
函数
的使用
另一个基础 R 解决方案:
read.table(text=sub('\n', ' ', gsub('(\D)\n', '\1', unlist(strsplit(s, '>')))))
V1 V2
1 Rosalind_6404 CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG
2 Rosalind_5959 CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC
甚至
proto <- data.frame(name = character(), value = character())
new_s <- gsub('\n', '', unlist(strsplit(s, '>')))
strcapture("([A-Z][a-z_0-9]+)([A-Z]+)", grep('\w', new_s, value = T), proto)
name value
1 Rosalind_6404 CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG
2 Rosalind_5959 CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC
library(seqinr)
# example fasta file
write(">Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC", "myFile.fasta")
# read the fasta file
x <- read.fasta("myFile.fasta", as.string = TRUE, forceDNAtolower = FALSE)
# get the names
names(x)
# [1] "Rosalind_6404" "Rosalind_5959"
# get the seq
x$Rosalind_6404
# [1] "CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG"
# attr(,"name")
# [1] "Rosalind_6404"
# attr(,"Annot")
# [1] ">Rosalind_6404"
# attr(,"class")
# [1] "SeqFastadna"
我正在努力将单个字符串输入分成一系列输入。用户给出了 FASTA 格式的序列列表(参见下面的示例)。我能够将输入分开
例如:
">Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
.>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC
"
[1] "Rosalind_6404CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG"
[2] "Rosalind_5959CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC"
但我正在努力寻找一种方法来创建一个函数,将“Rosalind_6404”从基因序列拆分为未知数量的 FASTA 序列,同时为拆分元素创建新的向量。 最终,结果将类似于:
.> "Rosalind_6404" "Rosalind5959"
.> "CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG","CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC"
我希望 convert_entries 函数允许我遍历 prepped_s 字符向量的所有元素,并将元素拆分为两个具有相同索引号的新向量。
s <- ">Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC"
split_s <- strsplit(s, ">")
ul_split_s<- unlist(split_s)
fixed_s <- gsub("\n","", ul_split_s)
prepped_s <- fixed_s[-1]
prepped_s
nchar(prepped_s[2])
print(prepped_s[2])
entry_tags <- list()
entry_seqs <- list()
entries <- length(prepped_s)
unlist(entries)
first <- prepped_s[1]
convert_entries <- function() {
for (i in entries) {
tag <- substr(prepped_s[i], start = 1, stop = 13)
entry_tags <- append(entry_tags, tag)
return(entry_tags)
}
}
entry_tags <- convert_entries()
print(entry_tags)
请尽一切可能帮助,谢谢!
一个选项 tidyverse
library(dplyr)
library(tidyr)
library(stringr)
tibble(col1 = s) %>%
separate_rows(col1, sep="\n") %>%
group_by(grp = cumsum(str_detect(col1, '^>'))) %>%
summarise(prefix = first(col1),
col1 = str_c(col1[-1], collapse=""), .groups = 'drop') %>%
select(-grp)
-输出
# A tibble: 2 x 2
prefix col1
<chr> <chr>
1 >Rosalind_6404 CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG
2 >Rosalind_5959 CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC
在基础 R 中你可以这样做:
t(gsub('\n', '', regmatches(s, gregexec("([A-Z][a-z_0-9]+)\n([A-Z\n]+)", s))[[1]][-1,]))
[,1] [,2]
[1,] "Rosalind_6404" "CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG"
[2,] "Rosalind_5959" "CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC"
注意:我转置了矩阵,以便您可以查看结果。忽略t
函数
另一个基础 R 解决方案:
read.table(text=sub('\n', ' ', gsub('(\D)\n', '\1', unlist(strsplit(s, '>')))))
V1 V2
1 Rosalind_6404 CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG
2 Rosalind_5959 CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC
甚至
proto <- data.frame(name = character(), value = character())
new_s <- gsub('\n', '', unlist(strsplit(s, '>')))
strcapture("([A-Z][a-z_0-9]+)([A-Z]+)", grep('\w', new_s, value = T), proto)
name value
1 Rosalind_6404 CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG
2 Rosalind_5959 CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC
library(seqinr)
# example fasta file
write(">Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC", "myFile.fasta")
# read the fasta file
x <- read.fasta("myFile.fasta", as.string = TRUE, forceDNAtolower = FALSE)
# get the names
names(x)
# [1] "Rosalind_6404" "Rosalind_5959"
# get the seq
x$Rosalind_6404
# [1] "CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG"
# attr(,"name")
# [1] "Rosalind_6404"
# attr(,"Annot")
# [1] ">Rosalind_6404"
# attr(,"class")
# [1] "SeqFastadna"