如何修复 R 中的寡核苷酸频率错误
How to fix oligonucleotideFrequency error in R
我对包 "seqinr" 的 read.fasta 功能有疑问。当我将它与 lapply 一起使用时,它不会创建所需的向量。
此外,当我对手动构建的向量使用函数计数时,结果是 table 个零。
这是我的代码:
library("seqinr")
library(MASS)
#GETTING THE FILES AFTER FRAGMENTS OF 500
files <- list.files(path="/Users/CamilaMV/Desktop/TESIS/", pattern=".fna500mer..split", full.names=T, recursive=FALSE)
files
# SOLO ESTA TOMANDO EL PRIMER ARCHIVO
#READING THE DIFFERENT FASTA FILES
ncrna <- lapply(files, function(x) { read.fasta(x,seqonly = T) })
seqs<-list()
for(i in seq_along(ncrna))
{
seqs[i]<-list(ncrna[[i]])
}
len1<-length(seqs[[1]])
frags1<-list()
for(j in 1:len1)
{
frags1[j]<-list(seqs[[1]][[j]])
}
frags1
#COUNTING TRETRANUCLEOTIDES FOR EACH FRAGMENT
tetra_frag1<-list()
# seq_along(frags1)
#frags1[[1]]
for(l in seq_along(frags1))
{
#tetra[i]<-list(count(ncra[[i]],4))
tetra_frag1[l]<-oligonucleotideFrequency(frags1[[l]],4)
}
我以前做的时候,计数功能可以,但现在不能正常工作了。
然后,我决定使用 oligonucletideFrequency 函数,但它给了我以下错误:
错误 (函数 (类, fdef, mtable) :
无法为签名“"character"”
找到函数“oligonucleotideFrequency”的继承方法
但是当我用is.character(frags1[[1]])作为测试时,结果是真的
我想获得一个具有寡核苷酸频率的矩阵来执行 PCA。
我想要一个最终的 table,其中列是四核苷酸的 256 种组合,行是片段的名称(例如 frag1、frag2、...),如下所示:
aaaa aaac ...
f1 3 5
f2 4 6
f3 5 7
...
非常感谢您的帮助。
我可以解决第一个问题和其他问题。最后,我有一个包含 4 个函数的 R 脚本,这些函数生成了一个 RGB 向量列表。
# GETTING LIBRARIES
library("seqinr")
library("ade4")
library("Biostrings")
## funcion 1
Processing_fragments<-function(PATH_FILES){
#GETTING THE FILES AFTER FRAGMENTS OF 500
files <- list.files(path=PATH_FILES, pattern=".fna500mer", full.names=T, recursive=FALSE)
#GETTING THE FILES READING AS FASTA
ncrna <- lapply(files, function(x) { read.fasta(x,seqonly = T) })
fragmentsGeno1<-list()
for(k in seq_along(ncrna[1]))
{
for(l in 1:10484)
{
fragmentsGeno1[l]<-ncrna[[k]][[l]]
}
}
fragmentsGeno2<-list()
for(k in seq_along(ncrna[2]))
{
for(l in 1:length(ncrna[[2]]))
{
fragmentsGeno2[l]<-ncrna[[k]][[l]]
}
}
#GETTING ALL FRAGMENTS
allFragments<-c(fragmentsGeno1,fragmentsGeno2)
return(allFragments)
}
## funcion 2
Getting_frequency_account<-function(allFragments,kmer){
#CONVERTING LOS FRAGMENTOS DE CADA FILE A OBJETOS DE DNAString
DNA_String_Set_list_ALL<-list()
for(i in seq_along(allFragments))
{
DNA_String_Set_list_ALL[i]<-DNAStringSet(allFragments[[i]])
}
# counting oligonucleotide
countGenome1_Tetra<-lapply(DNA_String_Set_list_ALL,function(x) {oligonucleotideFrequency((x),kmer, as.prob = T) })
# MATRIX FOR THE PCA
#names columns
col_names<-dimnames(countGenome1_Tetra[[1]])
col_names<-col_names[[2]]
#names rows
frag_names<-c(paste("frag",c(1:length(allFragments)),sep=""))
#matrix for PCA
matrix_PCA<-matrix(unlist(countGenome1_Tetra),nrow = length(allFragments),ncol=256,byrow = T,dimnames=list(frag_names,col_names))
return(matrix_PCA)
}
# View(matrix_PCA)
## funcion 3
Getting_first_three_components<-function(matrix_PCA){
######## PCA with prcomp#########
prcomp_All<-prcomp(matrix_PCA)
#obtaing the sum of varianza of the first three components
Var<-prcomp_All$sdev^2 / sum(prcomp_All$sdev^2)
Varianza_3_first_comp<-Var[1:3]
Varianza_3_first_comp_Porcent<-Varianza_3_first_comp*100
Suma_total<-sum(Varianza_3_first_comp_Porcent)
## obteniendo eigen of first three components
loadings_prcomp<-prcomp_All$x
#dim(loadings_prcomp)
First_three_components<-loadings_prcomp[,c(1,2,3)]
return(First_three_components)
}
#funcion 4
Generating_hex_color_codes<-function(First_three_components){
# getting min and max
min<-min(First_three_components)
max<-max(First_three_components)
# getting ranges
range_2_color<-c(min,max)
range_RGB_color<-c(0,1)
#making linear regression
lm.out<-lm(range_RGB_color~range_2_color)
#getting slope and intercept
slope<-lm.out$coefficients[2]
intercept<-lm.out$coefficients[1]
#normalizing pca results to RGB
new_Matriz<-(First_three_components*slope)+intercept
new_Matriz<-as.matrix(new_Matriz)
#using funcion rgb to generate matrix of hex color code
#hex_Color_Matriz<-t(mapply(rgb, split(new_Matriz[,1], new_Matriz[,2],new_Matriz[,3],maxColorValue=255)))
hex_Color_Vector<-vector()
# list de cada r,g,b de cada fragmento
rgb_List_Each_Fragment<-list()
row_Final<-length(new_Matriz[,1])
columns_Final<-length(new_Matriz[1,])
for(i in 1:row_Final){
for(j in 1:columns_Final){
red<-new_Matriz[i,1]
green<-new_Matriz[i,2]
blue<-new_Matriz[i,3]
hex_Color_Vector[i]<-rgb(red,green,blue,maxColorValue = 1)
rgb_List_Each_Fragment[i]<-list(c(red,green,blue))
}
}
return(rgb_List_Each_Fragment)
}
# Calling all the funcionts in order
allFragments<-Processing_fragments("/Users/CamilaMV/Desktop/TESIS")
matrix_PCA<-Getting_frequency_account(allFragments,4)
First_three_components<-Getting_first_three_components(matrix_PCA)
Hex_color_list<-Generating_hex_color_codes(First_three_components)
我对包 "seqinr" 的 read.fasta 功能有疑问。当我将它与 lapply 一起使用时,它不会创建所需的向量。
此外,当我对手动构建的向量使用函数计数时,结果是 table 个零。
这是我的代码:
library("seqinr")
library(MASS)
#GETTING THE FILES AFTER FRAGMENTS OF 500
files <- list.files(path="/Users/CamilaMV/Desktop/TESIS/", pattern=".fna500mer..split", full.names=T, recursive=FALSE)
files
# SOLO ESTA TOMANDO EL PRIMER ARCHIVO
#READING THE DIFFERENT FASTA FILES
ncrna <- lapply(files, function(x) { read.fasta(x,seqonly = T) })
seqs<-list()
for(i in seq_along(ncrna))
{
seqs[i]<-list(ncrna[[i]])
}
len1<-length(seqs[[1]])
frags1<-list()
for(j in 1:len1)
{
frags1[j]<-list(seqs[[1]][[j]])
}
frags1
#COUNTING TRETRANUCLEOTIDES FOR EACH FRAGMENT
tetra_frag1<-list()
# seq_along(frags1)
#frags1[[1]]
for(l in seq_along(frags1))
{
#tetra[i]<-list(count(ncra[[i]],4))
tetra_frag1[l]<-oligonucleotideFrequency(frags1[[l]],4)
}
我以前做的时候,计数功能可以,但现在不能正常工作了。
然后,我决定使用 oligonucletideFrequency 函数,但它给了我以下错误:
错误 (函数 (类, fdef, mtable) : 无法为签名“"character"”
找到函数“oligonucleotideFrequency”的继承方法但是当我用is.character(frags1[[1]])作为测试时,结果是真的
我想获得一个具有寡核苷酸频率的矩阵来执行 PCA。
我想要一个最终的 table,其中列是四核苷酸的 256 种组合,行是片段的名称(例如 frag1、frag2、...),如下所示:
aaaa aaac ... f1 3 5 f2 4 6 f3 5 7 ...
非常感谢您的帮助。
我可以解决第一个问题和其他问题。最后,我有一个包含 4 个函数的 R 脚本,这些函数生成了一个 RGB 向量列表。
# GETTING LIBRARIES
library("seqinr")
library("ade4")
library("Biostrings")
## funcion 1
Processing_fragments<-function(PATH_FILES){
#GETTING THE FILES AFTER FRAGMENTS OF 500
files <- list.files(path=PATH_FILES, pattern=".fna500mer", full.names=T, recursive=FALSE)
#GETTING THE FILES READING AS FASTA
ncrna <- lapply(files, function(x) { read.fasta(x,seqonly = T) })
fragmentsGeno1<-list()
for(k in seq_along(ncrna[1]))
{
for(l in 1:10484)
{
fragmentsGeno1[l]<-ncrna[[k]][[l]]
}
}
fragmentsGeno2<-list()
for(k in seq_along(ncrna[2]))
{
for(l in 1:length(ncrna[[2]]))
{
fragmentsGeno2[l]<-ncrna[[k]][[l]]
}
}
#GETTING ALL FRAGMENTS
allFragments<-c(fragmentsGeno1,fragmentsGeno2)
return(allFragments)
}
## funcion 2
Getting_frequency_account<-function(allFragments,kmer){
#CONVERTING LOS FRAGMENTOS DE CADA FILE A OBJETOS DE DNAString
DNA_String_Set_list_ALL<-list()
for(i in seq_along(allFragments))
{
DNA_String_Set_list_ALL[i]<-DNAStringSet(allFragments[[i]])
}
# counting oligonucleotide
countGenome1_Tetra<-lapply(DNA_String_Set_list_ALL,function(x) {oligonucleotideFrequency((x),kmer, as.prob = T) })
# MATRIX FOR THE PCA
#names columns
col_names<-dimnames(countGenome1_Tetra[[1]])
col_names<-col_names[[2]]
#names rows
frag_names<-c(paste("frag",c(1:length(allFragments)),sep=""))
#matrix for PCA
matrix_PCA<-matrix(unlist(countGenome1_Tetra),nrow = length(allFragments),ncol=256,byrow = T,dimnames=list(frag_names,col_names))
return(matrix_PCA)
}
# View(matrix_PCA)
## funcion 3
Getting_first_three_components<-function(matrix_PCA){
######## PCA with prcomp#########
prcomp_All<-prcomp(matrix_PCA)
#obtaing the sum of varianza of the first three components
Var<-prcomp_All$sdev^2 / sum(prcomp_All$sdev^2)
Varianza_3_first_comp<-Var[1:3]
Varianza_3_first_comp_Porcent<-Varianza_3_first_comp*100
Suma_total<-sum(Varianza_3_first_comp_Porcent)
## obteniendo eigen of first three components
loadings_prcomp<-prcomp_All$x
#dim(loadings_prcomp)
First_three_components<-loadings_prcomp[,c(1,2,3)]
return(First_three_components)
}
#funcion 4
Generating_hex_color_codes<-function(First_three_components){
# getting min and max
min<-min(First_three_components)
max<-max(First_three_components)
# getting ranges
range_2_color<-c(min,max)
range_RGB_color<-c(0,1)
#making linear regression
lm.out<-lm(range_RGB_color~range_2_color)
#getting slope and intercept
slope<-lm.out$coefficients[2]
intercept<-lm.out$coefficients[1]
#normalizing pca results to RGB
new_Matriz<-(First_three_components*slope)+intercept
new_Matriz<-as.matrix(new_Matriz)
#using funcion rgb to generate matrix of hex color code
#hex_Color_Matriz<-t(mapply(rgb, split(new_Matriz[,1], new_Matriz[,2],new_Matriz[,3],maxColorValue=255)))
hex_Color_Vector<-vector()
# list de cada r,g,b de cada fragmento
rgb_List_Each_Fragment<-list()
row_Final<-length(new_Matriz[,1])
columns_Final<-length(new_Matriz[1,])
for(i in 1:row_Final){
for(j in 1:columns_Final){
red<-new_Matriz[i,1]
green<-new_Matriz[i,2]
blue<-new_Matriz[i,3]
hex_Color_Vector[i]<-rgb(red,green,blue,maxColorValue = 1)
rgb_List_Each_Fragment[i]<-list(c(red,green,blue))
}
}
return(rgb_List_Each_Fragment)
}
# Calling all the funcionts in order
allFragments<-Processing_fragments("/Users/CamilaMV/Desktop/TESIS")
matrix_PCA<-Getting_frequency_account(allFragments,4)
First_three_components<-Getting_first_three_components(matrix_PCA)
Hex_color_list<-Generating_hex_color_codes(First_three_components)