KeyError: 'm' when building a codon alignment using Bio.codonalign
KeyError: 'm' when building a codon alignment using Bio.codonalign
我正在尝试使用 Bio.codonalign,根据蛋白质比对对两个基因序列进行密码子比对。官方示例在此处给出(在 'build' 函数下):https://biopython.org/DIST/docs/api/Bio.codonalign-module.html。我已经尝试了该示例,它可以正常运行。
现在,我想要从 FASTA 文件中获取序列(ap_20 具有对齐的蛋白质,ug_20 具有未对齐的基因)。以下是我的代码。
# Import packages
from Bio.Alphabet import generic_dna, generic_protein
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.codonalign import build
# Load every record from the gene file (ug_20.fasta) and from the protein
# file (ap_20.fasta) into lists so they can be indexed below
genes = list(SeqIO.parse("ug_20.fasta", "fasta"))
proteins = list(SeqIO.parse("ap_20.fasta", "fasta"))
# Re-wrap the first two records of each file with an explicit alphabet.
# Each gene record is given the same id as its protein ("pro1" / "pro2").
# NOTE: str(record.seq) copies the sequence text as-is, so the letter case
# found in the FASTA files is carried over unchanged.
seq1 = SeqRecord(Seq(str(genes[0].seq), alphabet=generic_dna), id="pro1")
seq2 = SeqRecord(Seq(str(genes[1].seq), alphabet=generic_dna), id="pro2")
pro1 = SeqRecord(Seq(str(proteins[0].seq), alphabet=generic_protein), id="pro1")
pro2 = SeqRecord(Seq(str(proteins[1].seq), alphabet=generic_protein), id="pro2")
# Combine the two protein records into a single alignment object
aln = MultipleSeqAlignment([pro1, pro2])
print(aln)
# Build the codon alignment from the protein alignment and the two gene records
codon_aln = build(aln, [seq1, seq2])
print(codon_aln)
aln
可以正常构建,但最后的 build
步骤失败了。我收到以下错误。我不确定 KeyError: 'm'
是什么意思,但我知道我所有的蛋白质序列都以字母 'm' 开头。我用“...”替换了部分文件路径以保持简短。
Traceback (most recent call last):
File "/Users/.../tempCodeRunnerFile.py", line 30, in <module>
codon_aln = build(aln, [seq1, seq2])
File "/Users/.../anaconda3/lib/python3.6/site-packages/Bio/codonalign/__init__.py", line 168, in build
anchor_len=anchor_len)
File "/Users/.../anaconda3/lib/python3.6/site-packages/Bio/codonalign/__init__.py", line 261, in _check_corr
pro_re += aa2re[aa]
KeyError: 'm'
您没有提供(部分)输入文件 ug_20.fasta
和 ap_20.fasta
,这让我更难调试,但我可以使用以下代码触发类似的错误:
>>> from Bio.Alphabet import generic_dna, generic_protein
>>> from Bio.Seq import Seq
>>> from Bio.SeqRecord import SeqRecord
>>> from Bio.Align import MultipleSeqAlignment
>>> from Bio.codonalign import build
>>> seq1 = SeqRecord(Seq('ATGTCTCGT', alphabet=generic_dna), id='pro1')
>>> seq2 = SeqRecord(Seq('ATGCGT', alphabet=generic_dna), id='pro2')
>>> pro1 = SeqRecord(Seq('MSR', alphabet=generic_protein), id='pro1')
>>> pro2 = SeqRecord(Seq('m-R', alphabet=generic_protein), id='pro2')
>>> aln = MultipleSeqAlignment([pro1, pro2])
>>> codon_aln = build(aln, [seq1, seq2])
>>> print(codon_aln)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-25-da1b827fb67e> in <module>()
9 pro2 = SeqRecord(Seq('m-R', alphabet=generic_protein), id='pro2')
10 aln = MultipleSeqAlignment([pro1, pro2])
---> 11 codon_aln = build(aln, [seq1, seq2])
12 print(codon_aln)
1 frames
/usr/local/lib/python3.6/dist-packages/Bio/codonalign/__init__.py in _check_corr(pro, nucl, gap_char, codon_table, complete_protein, anchor_len)
259 for aa in pro.seq:
260 if aa != gap_char:
--> 261 pro_re += aa2re[aa]
262
263 nucl_seq = str(nucl.seq.upper().ungap(gap_char))
KeyError: 'm'
这是默认的 Bio.codonalign.build 示例,只有一处更改:在 pro2
中,我将 'M-R'
更改为 'm-R'
。这说明您的某个蛋白质序列包含小写字符,而 Bio.codonalign.build()
似乎要求使用大写字符。您可以像这样将蛋白质序列转换为大写字母:
# Upper-case each protein sequence before re-wrapping it, so that no
# lowercase residue letters (such as 'm') are passed on to build()
pro1 = SeqRecord(Seq(str(proteins[0].seq.upper()), alphabet=generic_protein), id="pro1")
pro2 = SeqRecord(Seq(str(proteins[1].seq.upper()), alphabet=generic_protein), id="pro2")
我正在尝试使用 Bio.codonalign,根据蛋白质比对对两个基因序列进行密码子比对。官方示例在此处给出(在 'build' 函数下):https://biopython.org/DIST/docs/api/Bio.codonalign-module.html。我已经尝试了该示例,它可以正常运行。
现在,我想要从 FASTA 文件中获取序列(ap_20 具有对齐的蛋白质,ug_20 具有未对齐的基因)。以下是我的代码。
# Import packages
from Bio.Alphabet import generic_dna, generic_protein
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.codonalign import build
# Load every record from the gene file (ug_20.fasta) and from the protein
# file (ap_20.fasta) into lists so they can be indexed below
genes = list(SeqIO.parse("ug_20.fasta", "fasta"))
proteins = list(SeqIO.parse("ap_20.fasta", "fasta"))
# Re-wrap the first two records of each file with an explicit alphabet.
# Each gene record is given the same id as its protein ("pro1" / "pro2").
# NOTE: str(record.seq) copies the sequence text as-is, so the letter case
# found in the FASTA files is carried over unchanged.
seq1 = SeqRecord(Seq(str(genes[0].seq), alphabet=generic_dna), id="pro1")
seq2 = SeqRecord(Seq(str(genes[1].seq), alphabet=generic_dna), id="pro2")
pro1 = SeqRecord(Seq(str(proteins[0].seq), alphabet=generic_protein), id="pro1")
pro2 = SeqRecord(Seq(str(proteins[1].seq), alphabet=generic_protein), id="pro2")
# Combine the two protein records into a single alignment object
aln = MultipleSeqAlignment([pro1, pro2])
print(aln)
# Build the codon alignment from the protein alignment and the two gene records
codon_aln = build(aln, [seq1, seq2])
print(codon_aln)
aln
可以正常构建,但最后的 build
步骤失败了。我收到以下错误。我不确定 KeyError: 'm'
是什么意思,但我知道我所有的蛋白质序列都以字母 'm' 开头。我用“...”替换了部分文件路径以保持简短。
Traceback (most recent call last):
File "/Users/.../tempCodeRunnerFile.py", line 30, in <module>
codon_aln = build(aln, [seq1, seq2])
File "/Users/.../anaconda3/lib/python3.6/site-packages/Bio/codonalign/__init__.py", line 168, in build
anchor_len=anchor_len)
File "/Users/.../anaconda3/lib/python3.6/site-packages/Bio/codonalign/__init__.py", line 261, in _check_corr
pro_re += aa2re[aa]
KeyError: 'm'
您没有提供(部分)输入文件 ug_20.fasta
和 ap_20.fasta
,这让我更难调试,但我可以使用以下代码触发类似的错误:
>>> from Bio.Alphabet import generic_dna, generic_protein
>>> from Bio.Seq import Seq
>>> from Bio.SeqRecord import SeqRecord
>>> from Bio.Align import MultipleSeqAlignment
>>> from Bio.codonalign import build
>>> seq1 = SeqRecord(Seq('ATGTCTCGT', alphabet=generic_dna), id='pro1')
>>> seq2 = SeqRecord(Seq('ATGCGT', alphabet=generic_dna), id='pro2')
>>> pro1 = SeqRecord(Seq('MSR', alphabet=generic_protein), id='pro1')
>>> pro2 = SeqRecord(Seq('m-R', alphabet=generic_protein), id='pro2')
>>> aln = MultipleSeqAlignment([pro1, pro2])
>>> codon_aln = build(aln, [seq1, seq2])
>>> print(codon_aln)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-25-da1b827fb67e> in <module>()
9 pro2 = SeqRecord(Seq('m-R', alphabet=generic_protein), id='pro2')
10 aln = MultipleSeqAlignment([pro1, pro2])
---> 11 codon_aln = build(aln, [seq1, seq2])
12 print(codon_aln)
1 frames
/usr/local/lib/python3.6/dist-packages/Bio/codonalign/__init__.py in _check_corr(pro, nucl, gap_char, codon_table, complete_protein, anchor_len)
259 for aa in pro.seq:
260 if aa != gap_char:
--> 261 pro_re += aa2re[aa]
262
263 nucl_seq = str(nucl.seq.upper().ungap(gap_char))
KeyError: 'm'
这是默认的 Bio.codonalign.build 示例,只有一处更改:在 pro2
中,我将 'M-R'
更改为 'm-R'
。这说明您的某个蛋白质序列包含小写字符,而 Bio.codonalign.build()
似乎要求使用大写字符。您可以像这样将蛋白质序列转换为大写字母:
# Upper-case each protein sequence before re-wrapping it, so that no
# lowercase residue letters (such as 'm') are passed on to build()
pro1 = SeqRecord(Seq(str(proteins[0].seq.upper()), alphabet=generic_protein), id="pro1")
pro2 = SeqRecord(Seq(str(proteins[1].seq.upper()), alphabet=generic_protein), id="pro2")