使用 Biopython 将多序列 fasta 蛋白质文件拆分为多个文件
Splitting a multisequence fasta protein file into many files using Biopython
def batch_iterator(iterator, batch_size):
    """Yield successive lists of at most ``batch_size`` entries.

    Intended for large inputs (for example the record iterator returned by
    ``SeqIO.parse``): only one batch is held in memory at a time.

    Args:
        iterator: Any iterable or iterator of entries.
        batch_size: Maximum number of entries per yielded list.

    Yields:
        Lists of entries. Every list has ``batch_size`` entries except
        possibly the last one, which holds whatever is left over. Nothing
        is yielded for an empty input or a ``batch_size`` below 1.
    """
    # Accept plain iterables (lists, tuples, ...) as well as iterators.
    iterator = iter(iterator)
    while True:
        batch = []
        while len(batch) < batch_size:
            try:
                # NOTE: the snippet as originally posted assigned
                # `iterator.__next__` (the bound method itself, without
                # calling it). Batches then filled up with method objects,
                # which is what made SeqIO.write fail with
                # "AttributeError: 'function' object has no attribute 'id'".
                entry = next(iterator)
            except StopIteration:
                # End of input
                break
            batch.append(entry)
        if not batch:
            # Input exhausted (or batch_size < 1): stop instead of looping.
            return
        yield batch
from Bio import SeqIO
# Raw string literal: in a normal string "\U" starts a \UXXXXXXXX escape
# (a SyntaxError here) and "\f" would be turned into a form feed.
input_path = r"C:\Users\IDEAPAD\Desktop\fypsplit\protein.fasta"

# Keep the input open for as long as batches are being read from it, and
# close it afterwards even if writing fails.
with open(input_path) as fasta_handle:
    record_iter = SeqIO.parse(fasta_handle, "fasta")
    for i, batch in enumerate(batch_iterator(record_iter, 1000)):
        # Output files are numbered from 1: group_1.fasta, group_2.fasta, ...
        filename = "group_%i.fasta" % (i + 1)
        with open(filename, "w") as handle:
            count = SeqIO.write(batch, handle, "fasta")
        print("Wrote %i records to %s" % (count, filename))
我正在尝试使用 Biopython 拆分一个 fasta 文件。在这个例子中,我想把它拆分成 7 个文件。但是运行时遇到了错误:AttributeError: 'function' object has no attribute 'id'。
有人可以帮助我吗?提前谢谢你
这一行抛出了AttributeError
count = SeqIO.write(batch, handle, "fasta")
因为 SeqIO.write 需要一个由 SeqRecord 对象组成的可迭代对象或列表。但是,您的 batch_iterator 生成的却是由方法(method)对象组成的列表。
为什么会是方法?因为您在这里漏掉了函数调用的括号:
entry = iterator.__next__
应该是
entry = iterator.__next__()
这样修改之后,代码就可以正常运行了。
我用一个包含 11 条序列的文件做了测试(为了便于测试,将批量大小从 1000 改为 4),得到了以下结果:
Wrote 4 records to group_1.fasta
Wrote 4 records to group_2.fasta
Wrote 3 records to group_3.fasta
使用可用模块:more-itertools
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 31 08:30:32 2020
@author: Pietro
"""
from Bio import SeqIO
from more_itertools import ichunked
# Raw string literal with a single pair of quotes: the doubled quotes were a
# syntax error, and in a normal string "\U" starts a \UXXXXXXXX escape while
# "\f" would be turned into a form feed.
fasta_file = r"C:\Users\IDEAPAD\Desktop\fypsplit\protein.fasta"

# Keep the input open while chunks are being consumed, and close it afterwards.
with open(fasta_file) as fasta_handle:
    seqA = record_iter = SeqIO.parse(fasta_handle, "fasta")
    group = 1
    # Each chunk holds up to 1000 records and is written to its own file,
    # named after the input path plus "_group_<n>".
    for chunk in ichunked(seqA, 1000):
        with open(fasta_file + '_group_' + str(group), "w+") as file_write:
            SeqIO.write(chunk, file_write, "fasta")
        group += 1
def batch_iterator(iterator, batch_size):
    """Yield successive lists of at most ``batch_size`` entries.

    Intended for large inputs (for example the record iterator returned by
    ``SeqIO.parse``): only one batch is held in memory at a time.

    Args:
        iterator: Any iterable or iterator of entries.
        batch_size: Maximum number of entries per yielded list.

    Yields:
        Lists of entries. Every list has ``batch_size`` entries except
        possibly the last one, which holds whatever is left over. Nothing
        is yielded for an empty input or a ``batch_size`` below 1.
    """
    # Accept plain iterables (lists, tuples, ...) as well as iterators.
    iterator = iter(iterator)
    while True:
        batch = []
        while len(batch) < batch_size:
            try:
                # NOTE: the snippet as originally posted assigned
                # `iterator.__next__` (the bound method itself, without
                # calling it). Batches then filled up with method objects,
                # which is what made SeqIO.write fail with
                # "AttributeError: 'function' object has no attribute 'id'".
                entry = next(iterator)
            except StopIteration:
                # End of input
                break
            batch.append(entry)
        if not batch:
            # Input exhausted (or batch_size < 1): stop instead of looping.
            return
        yield batch
from Bio import SeqIO
# Raw string literal: in a normal string "\U" starts a \UXXXXXXXX escape
# (a SyntaxError here) and "\f" would be turned into a form feed.
input_path = r"C:\Users\IDEAPAD\Desktop\fypsplit\protein.fasta"

# Keep the input open for as long as batches are being read from it, and
# close it afterwards even if writing fails.
with open(input_path) as fasta_handle:
    record_iter = SeqIO.parse(fasta_handle, "fasta")
    for i, batch in enumerate(batch_iterator(record_iter, 1000)):
        # Output files are numbered from 1: group_1.fasta, group_2.fasta, ...
        filename = "group_%i.fasta" % (i + 1)
        with open(filename, "w") as handle:
            count = SeqIO.write(batch, handle, "fasta")
        print("Wrote %i records to %s" % (count, filename))
我正在尝试使用 Biopython 拆分一个 fasta 文件。在这个例子中,我想把它拆分成 7 个文件。但是运行时遇到了错误:AttributeError: 'function' object has no attribute 'id'。
有人可以帮助我吗?提前谢谢你
这一行抛出了AttributeError
count = SeqIO.write(batch, handle, "fasta")
因为 SeqIO.write 需要一个由 SeqRecord 对象组成的可迭代对象或列表。但是,您的 batch_iterator 生成的却是由方法(method)对象组成的列表。
为什么会是方法?因为您在这里漏掉了函数调用的括号:
entry = iterator.__next__
应该是
entry = iterator.__next__()
这样修改之后,代码就可以正常运行了。
我用一个包含 11 条序列的文件做了测试(为了便于测试,将批量大小从 1000 改为 4),得到了以下结果:
Wrote 4 records to group_1.fasta
Wrote 4 records to group_2.fasta
Wrote 3 records to group_3.fasta
使用可用模块:more-itertools
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 31 08:30:32 2020
@author: Pietro
"""
from Bio import SeqIO
from more_itertools import ichunked
# Raw string literal with a single pair of quotes: the doubled quotes were a
# syntax error, and in a normal string "\U" starts a \UXXXXXXXX escape while
# "\f" would be turned into a form feed.
fasta_file = r"C:\Users\IDEAPAD\Desktop\fypsplit\protein.fasta"

# Keep the input open while chunks are being consumed, and close it afterwards.
with open(fasta_file) as fasta_handle:
    seqA = record_iter = SeqIO.parse(fasta_handle, "fasta")
    group = 1
    # Each chunk holds up to 1000 records and is written to its own file,
    # named after the input path plus "_group_<n>".
    for chunk in ichunked(seqA, 1000):
        with open(fasta_file + '_group_' + str(group), "w+") as file_write:
            SeqIO.write(chunk, file_write, "fasta")
        group += 1