使用 python 和 Biopython 加入不同的 FASTA 文件
Join distinct FASTA files using python and Biopython
我必须创建一个软件来选择多个 fasta 文件并创建另一个包含所有序列的软件。为此,我完成了以下代码:
import sys,random
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import AlignIO
# Use: python join_fasta.py infile1.fasta infile2.fasta outfile.fasta
infile1 = sys.argv[1] #Name of the input file
seq1 = list(SeqIO.parse(infile1,"fasta")) #Create a list with all the sequence records
print "Input the first fasta file = ", infile1
infile2 = sys.argv[2] #Name of the input file
seq2 = list(SeqIO.parse(infile2,"fasta")) #Create a list with all the sequence records
print "Input the second fasta file = ", infile2
totseq1 = len(seq1) #Total number of sequences in the input file
print "Number of sequences in the first file = ", totseq1
totseq2 = len(seq2) #Total number of sequences in the input file
print "Number of sequences in the first file = ", totseq2
outfile = sys.argv[3] #Name of the output file
print "Output fasta file = ", outfile
totseq3 = len(seq1) + len(seq2) #Total number of sequences in the output file
print "Number of sequences in the concatenated file = ", totseq3
output = [infile1, infile2]
SeqIO.write(output, outfile, "fasta")
exit()
它return一个错误:
Input the first fasta file = ITS.fasta
Input the second fasta file = banco_de_dados_ITS.fasta
Number of sequences in the first file = 114
Number of sequences in the first file = 20
Output fasta file = ITS_combined.fasta
Number of sequences in the concatenated file = 134
Traceback (most recent call last):
File "/Users/phdias/Documents/Softwares_PH/join_fasta.py", line 35, in <module>
SeqIO.write(output, outfile, "fasta")
File "/usr/local/lib/python2.7/site-packages/Bio/SeqIO/__init__.py", line 535, in write
fp.write(format_function(record))
File "/usr/local/lib/python2.7/site-packages/Bio/SeqIO/FastaIO.py", line 345, in as_fasta
id = _clean(record.id)
AttributeError: 'str' object has no attribute 'id'
我尝试了很多都没有成功解决这个问题
代码被重写:
import sys,random
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import AlignIO
# Use: python join_fasta.py infile1.fasta infile2.fasta outfile.fasta
infile1 = sys.argv[1] #Name of the input file
seq1 = list(SeqIO.parse(infile1,"fasta")) #Create a list with all the sequence records
print "Input the first fasta file = ", infile1
infile2 = sys.argv[2] #Name of the input file
seq2 = list(SeqIO.parse(infile2,"fasta")) #Create a list with all the sequence records
print "Input the second fasta file = ", infile2
totseq1 = len(seq1) #Total number of sequences in the input file
print "Number of sequences in the first file = ", totseq1
totseq2 = len(seq2) #Total number of sequences in the input file
print "Number of sequences in the second file = ", totseq2
outfile = sys.argv[3] #Name of the output file
print "Output fasta file = ", outfile
totseq3 = len(seq1) + len(seq2) #Total number of sequences in the output file
print "Number of sequences in the concatenated file = ", totseq3
from itertools import chain
output = list(chain(seq1, seq2))
SeqIO.write(output, outfile, "fasta")
exit()
现在找不到第二个文件,输出文件只包含第一个文件的序列:
Input the first fasta file = ITS.fasta
Input the second fasta file = banco_de_dados_ITS.fasta
Number of sequences in the first file = 114
Number of sequences in the second file = 0
Output fasta file = its_c.fasta
Number of sequences in the concatenated file = 114
我认为问题出在这里:
output = [infile1, infile2]
您正在使用 str
类型的文件路径,而不是使用 seq1
和 seq2
,并且您还需要将它们连接成一个序列,如下所示:
from itertools import chain
output = chain(seq1, seq2)
这样 output
是来自 seq1
和 seq2
的 SeqRecord
项的生成器
虽然 BioPython 是一个很棒的工具,但如果您使用 pandas,事情会变得更容易。
import pandas as pd
def fasta_reader(file):
fasta_df = pd.read_csv(file, sep='>', lineterminator='>', header=None)
fasta_df[['Accession', 'Sequence']] = fasta_df[0].str.split('\n', 1, \
expand=True)
fasta_df.drop(0, axis=1, inplace=True)
fasta_df['Sequence'] = fasta_df['Sequence'].replace('\n', '', regex=True)
return fasta_df
如果你的fasta文件被命名为test1.fa
,test2.fa
,你可以使用pd.concat
加入
df = pd.concat(fasta_reader(i) for i in ['test1.fa', 'test2.fa'])
然后您还可以将合并后的文件导出为fasta。但是首先需要在accessions前加>
# Exporting to fa
# adding '>' for accessions
df['Accession'] = '>' + df['Accession']
df.to_csv('combined.fa', sep='\n', index=None, header=None)
我必须创建一个软件来选择多个 fasta 文件并创建另一个包含所有序列的软件。为此,我完成了以下代码:
import sys,random
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import AlignIO
# Use: python join_fasta.py infile1.fasta infile2.fasta outfile.fasta
infile1 = sys.argv[1] #Name of the input file
seq1 = list(SeqIO.parse(infile1,"fasta")) #Create a list with all the sequence records
print "Input the first fasta file = ", infile1
infile2 = sys.argv[2] #Name of the input file
seq2 = list(SeqIO.parse(infile2,"fasta")) #Create a list with all the sequence records
print "Input the second fasta file = ", infile2
totseq1 = len(seq1) #Total number of sequences in the input file
print "Number of sequences in the first file = ", totseq1
totseq2 = len(seq2) #Total number of sequences in the input file
print "Number of sequences in the first file = ", totseq2
outfile = sys.argv[3] #Name of the output file
print "Output fasta file = ", outfile
totseq3 = len(seq1) + len(seq2) #Total number of sequences in the output file
print "Number of sequences in the concatenated file = ", totseq3
output = [infile1, infile2]
SeqIO.write(output, outfile, "fasta")
exit()
它return一个错误:
Input the first fasta file = ITS.fasta
Input the second fasta file = banco_de_dados_ITS.fasta
Number of sequences in the first file = 114
Number of sequences in the first file = 20
Output fasta file = ITS_combined.fasta
Number of sequences in the concatenated file = 134
Traceback (most recent call last):
File "/Users/phdias/Documents/Softwares_PH/join_fasta.py", line 35, in <module>
SeqIO.write(output, outfile, "fasta")
File "/usr/local/lib/python2.7/site-packages/Bio/SeqIO/__init__.py", line 535, in write
fp.write(format_function(record))
File "/usr/local/lib/python2.7/site-packages/Bio/SeqIO/FastaIO.py", line 345, in as_fasta
id = _clean(record.id)
AttributeError: 'str' object has no attribute 'id'
我尝试了很多都没有成功解决这个问题
代码被重写:
import sys,random
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import AlignIO
# Use: python join_fasta.py infile1.fasta infile2.fasta outfile.fasta
infile1 = sys.argv[1] #Name of the input file
seq1 = list(SeqIO.parse(infile1,"fasta")) #Create a list with all the sequence records
print "Input the first fasta file = ", infile1
infile2 = sys.argv[2] #Name of the input file
seq2 = list(SeqIO.parse(infile2,"fasta")) #Create a list with all the sequence records
print "Input the second fasta file = ", infile2
totseq1 = len(seq1) #Total number of sequences in the input file
print "Number of sequences in the first file = ", totseq1
totseq2 = len(seq2) #Total number of sequences in the input file
print "Number of sequences in the second file = ", totseq2
outfile = sys.argv[3] #Name of the output file
print "Output fasta file = ", outfile
totseq3 = len(seq1) + len(seq2) #Total number of sequences in the output file
print "Number of sequences in the concatenated file = ", totseq3
from itertools import chain
output = list(chain(seq1, seq2))
SeqIO.write(output, outfile, "fasta")
exit()
现在找不到第二个文件,输出文件只包含第一个文件的序列:
Input the first fasta file = ITS.fasta
Input the second fasta file = banco_de_dados_ITS.fasta
Number of sequences in the first file = 114
Number of sequences in the second file = 0
Output fasta file = its_c.fasta
Number of sequences in the concatenated file = 114
我认为问题出在这里:
output = [infile1, infile2]
您正在使用 str
类型的文件路径,而不是使用 seq1
和 seq2
,并且您还需要将它们连接成一个序列,如下所示:
from itertools import chain
output = chain(seq1, seq2)
这样 output
是来自 seq1
和 seq2
SeqRecord
项的生成器
虽然 BioPython 是一个很棒的工具,但如果您使用 pandas,事情会变得更容易。
import pandas as pd
def fasta_reader(file):
fasta_df = pd.read_csv(file, sep='>', lineterminator='>', header=None)
fasta_df[['Accession', 'Sequence']] = fasta_df[0].str.split('\n', 1, \
expand=True)
fasta_df.drop(0, axis=1, inplace=True)
fasta_df['Sequence'] = fasta_df['Sequence'].replace('\n', '', regex=True)
return fasta_df
如果你的fasta文件被命名为test1.fa
,test2.fa
,你可以使用pd.concat
加入
df = pd.concat(fasta_reader(i) for i in ['test1.fa', 'test2.fa'])
然后您还可以将合并后的文件导出为fasta。但是首先需要在accessions前加>
# Exporting to fa
# adding '>' for accessions
df['Accession'] = '>' + df['Accession']
df.to_csv('combined.fa', sep='\n', index=None, header=None)