反转录
Reverse Transcription
目标是根据完整的 mRNA 序列和氨基酸序列,找出 mRNA 中的编码序列,然后把结果全部整理成密码子格式。
我觉得我已经找到了每个氨基酸可能对应的密码子列表,只是不确定如何系统地将它们与给定的 mRNA 序列进行匹配。以下是我目前的进展。
import pandas

# Protein (one-letter amino-acid codes) and the full mRNA that encodes it.
xAA = 'MDFFASGLPLVTEETPSGEAGSEEDDEVVAMIKELLDTRIRPTVQEDGGDVIYKGFEDGIVQLKLQGSCTSCPSSIITLKNGIQNMLQFYIPEVEGVEQVMDDESDEKEANSP'
xmRNA = 'GUUCCCCGGCCUCUCUUGGUCAGGGUGACGCAGUAGCCUGCAAACCUCGGCGCGUAGGCCACCGCACUUAUCCGCAGCAGGACCGCCCGCAGCCGGUAGGGUGGGCUCUUCCCAGUGCCCGCCCAGCUACCGGCCAGCCUGCGGCUGCGCAGAUCUUUCGUGGUUCUGUCAGGGAGACCCUUAGGCACUCCGGACUAAGAUGGCGGCGACGGCCAGGCGGGGCUGGGGAGCUGCGGCUGUUGCCGCCGGGCUGCGCAGGCGGUUCUGUCAUAUGUUGAAGAAUCCAUACACCAUUAAGAAACAGCCUCUGCAUCAGUUUGUACAAAGACCACUUUUCCCACUACCUGCAGCCUUUUAUCACCCAGGCAGUUAUUUAGGAUUGAAGGAGUAAAAAGUGUCUUCUUUGGACCAGAUUUCAUCACUGUCACAAAGGAAAAUGAAGAAUUAGACUGGAAUUUACUGAAACCAGAUAUUUAUGCAACAAUCAUGGACUUCUUUGCAUCUGGCUUACCCCUGGUUACUGAGGAAACACCUUCAGGAGAAGCAGGAUCUGAAGAAGAUGAUGAAGUUGUGGCAAUGAUUAAGGAAUUGUUAGAUACUAGAAUACGGCCAACUGUGCAGGAAGAUGGAGGGGAUGUAAUCUACAAAGGCUUUGAAGAUGGCAUUGUACAGCUGAAACUCCAGGGUUCUUGUACCAGCUGCCCUAGUUCAAUCAUUACUCUGAAAAAUGGAAUUCAGAACAUGCUGCAGUUUUAUAUUCCGGAGGUAGAAGGCGUAGAACAGGUUAUGGAUGAUGAAUCAGAUGAAAAAGAAGCAAACUCACCUUAAAAUAAUCUGGAUUUUCUUUGGGCAUAACAGUCAGACUUGUUGAUAAUAUAUAUCAAGUUUUUAUUAUUAAUAUGCUGAGGAACUUGAAGAUUAAUAAAAUAUGCUCUUCAGAGAAUGAUAUAUAAAA'
# Codon table as two parallel lists: d['mRNA'][k] codes for d['AA'][k];
# '_' marks a stop codon.
# NOTE(review): entry 34 ('AUA') is listed as 'M' here, whereas the standard
# genetic code assigns AUA to isoleucine ('I'). It is left untouched so that the
# output shown with this snippet stays reproducible.
d = {'mRNA': ['UUU','UUC','UUA','UUG','UCU','UCC','UCA','UCG','UAU','UAC','UAA','UAG','UGU','UGC','UGA','UGG','CUU','CUC','CUA','CUG','CCU','CCC','CCA','CCG','CAU','CAC','CAA','CAG','CGU','CGC','CGA','CGG','AUU','AUC','AUA','AUG','ACU','ACC','ACA','ACG','AAU','AAC','AAA','AAG','AGU','AGC','AGA','AGG','GUU','GUC','GUA','GUG','GCU','GCC','GCA','GCG','GAU','GAC','GAA','GAG','GGU','GGC','GGA','GGG'], 'AA': ['F','F','L','L','S','S','S','S','Y','Y','_','_','C','C','_','W','L','L','L','L','P','P','P','P','H','H','Q','Q','R','R','R','R','I','I','M','M','T','T','T','T','N','N','K','K','S','S','R','R','V','V','V','V','A','A','A','A','D','D','E','E','G','G','G','G']}
AA = pandas.DataFrame(data=d)

# For each residue of the protein, list every codon that could encode it.
for residue in xAA:
    codons = list(AA.mRNA.loc[AA['AA'] == residue])
    print(codons)
这是输出:
['AUA', 'AUG']
['GAU', 'GAC']
['UUU', 'UUC']
['UUU', 'UUC']
['GCU', 'GCC', 'GCA', 'GCG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['GGU', 'GGC', 'GGA', 'GGG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['CCU', 'CCC', 'CCA', 'CCG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['GUU', 'GUC', 'GUA', 'GUG']
['ACU', 'ACC', 'ACA', 'ACG']
['GAA', 'GAG']
['GAA', 'GAG']
['ACU', 'ACC', 'ACA', 'ACG']
['CCU', 'CCC', 'CCA', 'CCG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['GGU', 'GGC', 'GGA', 'GGG']
['GAA', 'GAG']
['GCU', 'GCC', 'GCA', 'GCG']
['GGU', 'GGC', 'GGA', 'GGG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['GAA', 'GAG']
['GAA', 'GAG']
['GAU', 'GAC']
['GAU', 'GAC']
['GAA', 'GAG']
['GUU', 'GUC', 'GUA', 'GUG']
['GUU', 'GUC', 'GUA', 'GUG']
['GCU', 'GCC', 'GCA', 'GCG']
['AUA', 'AUG']
['AUU', 'AUC']
['AAA', 'AAG']
['GAA', 'GAG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['GAU', 'GAC']
['ACU', 'ACC', 'ACA', 'ACG']
['CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG']
['AUU', 'AUC']
['CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG']
['CCU', 'CCC', 'CCA', 'CCG']
['ACU', 'ACC', 'ACA', 'ACG']
['GUU', 'GUC', 'GUA', 'GUG']
['CAA', 'CAG']
['GAA', 'GAG']
['GAU', 'GAC']
['GGU', 'GGC', 'GGA', 'GGG']
['GGU', 'GGC', 'GGA', 'GGG']
['GAU', 'GAC']
['GUU', 'GUC', 'GUA', 'GUG']
['AUU', 'AUC']
['UAU', 'UAC']
['AAA', 'AAG']
['GGU', 'GGC', 'GGA', 'GGG']
['UUU', 'UUC']
['GAA', 'GAG']
['GAU', 'GAC']
['GGU', 'GGC', 'GGA', 'GGG']
['AUU', 'AUC']
['GUU', 'GUC', 'GUA', 'GUG']
['CAA', 'CAG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['AAA', 'AAG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['CAA', 'CAG']
['GGU', 'GGC', 'GGA', 'GGG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['UGU', 'UGC']
['ACU', 'ACC', 'ACA', 'ACG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['UGU', 'UGC']
['CCU', 'CCC', 'CCA', 'CCG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['AUU', 'AUC']
['AUU', 'AUC']
['ACU', 'ACC', 'ACA', 'ACG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['AAA', 'AAG']
['AAU', 'AAC']
['GGU', 'GGC', 'GGA', 'GGG']
['AUU', 'AUC']
['CAA', 'CAG']
['AAU', 'AAC']
['AUA', 'AUG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['CAA', 'CAG']
['UUU', 'UUC']
['UAU', 'UAC']
['AUU', 'AUC']
['CCU', 'CCC', 'CCA', 'CCG']
['GAA', 'GAG']
['GUU', 'GUC', 'GUA', 'GUG']
['GAA', 'GAG']
['GGU', 'GGC', 'GGA', 'GGG']
['GUU', 'GUC', 'GUA', 'GUG']
['GAA', 'GAG']
['CAA', 'CAG']
['GUU', 'GUC', 'GUA', 'GUG']
['AUA', 'AUG']
['GAU', 'GAC']
['GAU', 'GAC']
['GAA', 'GAG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['GAU', 'GAC']
['GAA', 'GAG']
['AAA', 'AAG']
['GAA', 'GAG']
['GCU', 'GCC', 'GCA', 'GCG']
['AAU', 'AAC']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['CCU', 'CCC', 'CCA', 'CCG']
如果我添加此处显示的 for c 循环,我会得到
# Append every candidate codon of every residue to one growing string.
# Relies on xAA, xmRNA and the AA codon DataFrame already being defined.
codingseq = ""
for i in xAA:
    codons = list(AA.mRNA.loc[AA['AA'] == i])
    for c in codons:
        # NOTE(review): the position returned by find() is discarded, so this
        # lookup has no influence on what gets appended.
        xmRNA.find(c)
        codingseq += c
这给出了每个组合,有没有一种方法可以进行比较分析,找出其中哪一个最像完整的 mRNA 序列?
AUAAUG
AUAAUGGAUGAC
AUAAUGGAUGACUUUUUC
AUAAUGGAUGACUUUUUCUUUUUC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGGGAUGAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGGGAUGACGUUGUCGUAGUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGGGAUGACGUUGUCGUAGUGAUUAUC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGGGAUGACGUUGUCGUAGUGAUUAUCUAUUAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGGGAUGACGUUGUCGUAGUGAUUAUCUAUUACAAAAAG
请注意,由于超出了字符数限制,因此并未显示所有结果。
任何对此的帮助都会很棒!
这会在 xmRNA 中找到与 xAA 匹配的密码子序列。注意,d["AA"] 数据在索引 34 处做了更正(把 "M" 改为 "I"),以便与 http://web.expasy.org/ 站点上使用的翻译保持一致。我没有用过 pandas,所以这里只用了普通的 Python。
我只是把它当作一个简单的测试,用来找出 xAA 在 xmRNA 中的位置(以及所使用的密码子)。即使序列很长,它也应该足够快(即使是长度为 100000 的 RNA 也应该几乎瞬间完成)。
# Protein (one-letter amino-acid codes) and the full mRNA that encodes it.
xAA = 'MDFFASGLPLVTEETPSGEAGSEEDDEVVAMIKELLDTRIRPTVQEDGGDVIYKGFEDGIVQLKLQGSCTSCPSSIITLKNGIQNMLQFYIPEVEGVEQVMDDESDEKEANSP'
xmRNA = 'GUUCCCCGGCCUCUCUUGGUCAGGGUGACGCAGUAGCCUGCAAACCUCGGCGCGUAGGCCACCGCACUUAUCCGCAGCAGGACCGCCCGCAGCCGGUAGGGUGGGCUCUUCCCAGUGCCCGCCCAGCUACCGGCCAGCCUGCGGCUGCGCAGAUCUUUCGUGGUUCUGUCAGGGAGACCCUUAGGCACUCCGGACUAAGAUGGCGGCGACGGCCAGGCGGGGCUGGGGAGCUGCGGCUGUUGCCGCCGGGCUGCGCAGGCGGUUCUGUCAUAUGUUGAAGAAUCCAUACACCAUUAAGAAACAGCCUCUGCAUCAGUUUGUACAAAGACCACUUUUCCCACUACCUGCAGCCUUUUAUCACCCAGGCAGUUAUUUAGGAUUGAAGGAGUAAAAAGUGUCUUCUUUGGACCAGAUUUCAUCACUGUCACAAAGGAAAAUGAAGAAUUAGACUGGAAUUUACUGAAACCAGAUAUUUAUGCAACAAUCAUGGACUUCUUUGCAUCUGGCUUACCCCUGGUUACUGAGGAAACACCUUCAGGAGAAGCAGGAUCUGAAGAAGAUGAUGAAGUUGUGGCAAUGAUUAAGGAAUUGUUAGAUACUAGAAUACGGCCAACUGUGCAGGAAGAUGGAGGGGAUGUAAUCUACAAAGGCUUUGAAGAUGGCAUUGUACAGCUGAAACUCCAGGGUUCUUGUACCAGCUGCCCUAGUUCAAUCAUUACUCUGAAAAAUGGAAUUCAGAACAUGCUGCAGUUUUAUAUUCCGGAGGUAGAAGGCGUAGAACAGGUUAUGGAUGAUGAAUCAGAUGAAAAAGAAGCAAACUCACCUUAAAAUAAUCUGGAUUUUCUUUGGGCAUAACAGUCAGACUUGUUGAUAAUAUAUAUCAAGUUUUUAUUAUUAAUAUGCUGAGGAACUUGAAGAUUAAUAAAAUAUGCUCUUCAGAGAAUGAUAUAUAAAA'
# Codon table as two parallel lists: d['mRNA'][k] codes for d['AA'][k];
# '_' marks a stop codon.
d = {'mRNA': ['UUU','UUC','UUA','UUG','UCU','UCC','UCA','UCG','UAU','UAC','UAA','UAG','UGU','UGC','UGA','UGG','CUU','CUC','CUA','CUG','CCU','CCC','CCA','CCG','CAU','CAC','CAA','CAG','CGU','CGC','CGA','CGG','AUU','AUC','AUA','AUG','ACU','ACC','ACA','ACG','AAU','AAC','AAA','AAG','AGU','AGC','AGA','AGG','GUU','GUC','GUA','GUG','GCU','GCC','GCA','GCG','GAU','GAC','GAA','GAG','GGU','GGC','GGA','GGG'],
     'AA': ['F', 'F', 'L', 'L', 'S', 'S', 'S', 'S', 'Y', 'Y', '_', '_', 'C', 'C', '_','W','L','L','L','L','P','P','P','P','H','H','Q','Q','R','R','R','R','I','I', 'I' ,'M','T','T','T','T','N','N','K','K','S','S','R','R','V','V','V','V','A','A','A','A','D','D','E','E','G','G','G','G']}

# Codon -> amino acid lookup built from the two parallel lists.
r2a = dict(zip(d['mRNA'], d['AA']))

s = 0
found = False
# Try each of the three forward reading frames until the protein is found.
for s in (0, 1, 2):
    frame = xmRNA[s:]
    # t3 = codon sequence starting from s; a trailing partial codon is dropped
    t3 = [frame[i:i + 3] for i in range(0, len(frame) - 2, 3)]
    # translate the codons to amino acids
    aa = ''.join(r2a[codon] for codon in t3)
    print(aa)
    try:
        idx = aa.index(xAA)
    except ValueError:
        # protein is not encoded in this frame; try the next one
        continue
    # found it: convert the amino-acid index back to a nucleotide offset and
    # keep only the codons that encode the protein
    pos = idx * 3 + s
    t3 = t3[idx:idx + len(xAA)]
    found = True
    break

if found:
    print("found in frame {} at {} (pos={})".format(s + 1, idx, pos))
    print("codons: " + repr(t3))
else:
    print("Not found")
下面是一个相当紧凑的解决方案。构建反向字典时不会出现任何键冲突,所以可以这样做:
# Amino acid (one-letter code, '_' = stop) -> every codon that encodes it.
codon_dict = {'F': ('UUU', 'UUC'),
              'L': ('UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG'),
              'S': ('UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC'),
              'Y': ('UAU', 'UAC'),
              '_': ('UAA', 'UAG', 'UGA'),
              'C': ('UGU', 'UGC'),
              'W': ('UGG',),
              'P': ('CCU', 'CCC', 'CCA', 'CCG'),
              'H': ('CAU', 'CAC'),
              'Q': ('CAA', 'CAG'),
              'R': ('CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'),
              'I': ('AUU', 'AUC', 'AUA'),
              'M': ('AUG',),
              'T': ('ACU', 'ACC', 'ACA', 'ACG'),
              'N': ('AAU', 'AAC'),
              'K': ('AAA', 'AAG'),
              'V': ('GUU', 'GUC', 'GUA', 'GUG'),
              'A': ('GCU', 'GCC', 'GCA', 'GCG'),
              'D': ('GAU', 'GAC'),
              'E': ('GAA', 'GAG'),
              'G': ('GGU', 'GGC', 'GGA', 'GGG')
              }

# Reverse lookup, codon -> amino acid. Each codon is listed under exactly one
# amino acid above, so no key is overwritten.
rna_dict = {codon: aa for aa, codons in codon_dict.items() for codon in codons}


def translate(mrna, table, frame=0):
    """Translate *mrna* into a string of one-letter amino-acid codes.

    Reading starts at nucleotide offset *frame*. Only complete triplets are
    translated, so a trailing partial codon is ignored instead of raising.

    Raises:
        KeyError: if a complete triplet is not a key of *table*.
    """
    return "".join(table[mrna[i:i + 3]] for i in range(frame, len(mrna) - 2, 3))


def find_coding_start(mrna, protein, table):
    """Return the nucleotide offset in *mrna* at which *protein* is encoded.

    The three forward reading frames are tried in the order 0, 1, 2 and the
    first frame containing *protein* wins. Returns -1 when none of them does.
    """
    for frame in range(3):
        hit = translate(mrna, table, frame).find(protein)
        if hit != -1:
            # amino-acid index -> nucleotide offset within the original string
            return frame + 3 * hit
    return -1


xAA = 'MDFFASGLPLVTEETPSGEAGSEEDDEVVAMIKELLDTRIRPTVQEDGGDVIYKGFEDGIVQLKLQGSCTSCPSSIITLKNGIQNMLQFYIPEVEGVEQVMDDESDEKEANSP'
xmRNA = 'GUUCCCCGGCCUCUCUUGGUCAGGGUGACGCAGUAGCCUGCAAACCUCGGCGCGUAGGCCACCGCACUUAUCCGCAGCAGGACCGCCCGCAGCCGGUAGGGUGGGCUCUUCCCAGUGCCCGCCCAGCUACCGGCCAGCCUGCGGCUGCGCAGAUCUUUCGUGGUUCUGUCAGGGAGACCCUUAGGCACUCCGGACUAAGAUGGCGGCGACGGCCAGGCGGGGCUGGGGAGCUGCGGCUGUUGCCGCCGGGCUGCGCAGGCGGUUCUGUCAUAUGUUGAAGAAUCCAUACACCAUUAAGAAACAGCCUCUGCAUCAGUUUGUACAAAGACCACUUUUCCCACUACCUGCAGCCUUUUAUCACCCAGGCAGUUAUUUAGGAUUGAAGGAGUAAAAAGUGUCUUCUUUGGACCAGAUUUCAUCACUGUCACAAAGGAAAAUGAAGAAUUAGACUGGAAUUUACUGAAACCAGAUAUUUAUGCAACAAUCAUGGACUUCUUUGCAUCUGGCUUACCCCUGGUUACUGAGGAAACACCUUCAGGAGAAGCAGGAUCUGAAGAAGAUGAUGAAGUUGUGGCAAUGAUUAAGGAAUUGUUAGAUACUAGAAUACGGCCAACUGUGCAGGAAGAUGGAGGGGAUGUAAUCUACAAAGGCUUUGAAGAUGGCAUUGUACAGCUGAAACUCCAGGGUUCUUGUACCAGCUGCCCUAGUUCAAUCAUUACUCUGAAAAAUGGAAUUCAGAACAUGCUGCAGUUUUAUAUUCCGGAGGUAGAAGGCGUAGAACAGGUUAUGGAUGAUGAAUCAGAUGAAAAAGAAGCAAACUCACCUUAAAAUAAUCUGGAUUUUCUUUGGGCAUAACAGUCAGACUUGUUGAUAAUAUAUAUCAAGUUUUUAUUAUUAAUAUGCUGAGGAACUUGAAGAUUAAUAAAAUAUGCUCUUCAGAGAAUGAUAUAUAAAA'
# Frame-0 translation of the whole mRNA, one amino acid per list element.
mapped = list(translate(xmRNA, rna_dict))
# Nucleotide offset of the first codon of xAA inside xmRNA (-1 if absent).
xmRNA_index = find_coding_start(xmRNA, xAA, rna_dict)
print(xmRNA_index)
这会返回 486,也就是 xAA 对应的密码子序列在 xmRNA 中的起始位置。还可以通过对字符串做边界检查来进一步完善;如果你只想要最接近的匹配,可以对子串做组合匹配(我仍然会先做反向映射,然后在映射后的空间中搜索,这样更快)。另外,我不清楚无法映射的缺失字符/三联体有多常见,因此可能需要针对这些情况进行修改。
目标是根据完整的 mRNA 序列和氨基酸序列,找出 mRNA 中的编码序列,然后把结果全部整理成密码子格式。 我觉得我已经找到了每个氨基酸可能对应的密码子列表,只是不确定如何系统地将它们与给定的 mRNA 序列进行匹配。以下是我目前的进展。
import pandas

# Protein (one-letter amino-acid codes) and the full mRNA that encodes it.
xAA = 'MDFFASGLPLVTEETPSGEAGSEEDDEVVAMIKELLDTRIRPTVQEDGGDVIYKGFEDGIVQLKLQGSCTSCPSSIITLKNGIQNMLQFYIPEVEGVEQVMDDESDEKEANSP'
xmRNA = 'GUUCCCCGGCCUCUCUUGGUCAGGGUGACGCAGUAGCCUGCAAACCUCGGCGCGUAGGCCACCGCACUUAUCCGCAGCAGGACCGCCCGCAGCCGGUAGGGUGGGCUCUUCCCAGUGCCCGCCCAGCUACCGGCCAGCCUGCGGCUGCGCAGAUCUUUCGUGGUUCUGUCAGGGAGACCCUUAGGCACUCCGGACUAAGAUGGCGGCGACGGCCAGGCGGGGCUGGGGAGCUGCGGCUGUUGCCGCCGGGCUGCGCAGGCGGUUCUGUCAUAUGUUGAAGAAUCCAUACACCAUUAAGAAACAGCCUCUGCAUCAGUUUGUACAAAGACCACUUUUCCCACUACCUGCAGCCUUUUAUCACCCAGGCAGUUAUUUAGGAUUGAAGGAGUAAAAAGUGUCUUCUUUGGACCAGAUUUCAUCACUGUCACAAAGGAAAAUGAAGAAUUAGACUGGAAUUUACUGAAACCAGAUAUUUAUGCAACAAUCAUGGACUUCUUUGCAUCUGGCUUACCCCUGGUUACUGAGGAAACACCUUCAGGAGAAGCAGGAUCUGAAGAAGAUGAUGAAGUUGUGGCAAUGAUUAAGGAAUUGUUAGAUACUAGAAUACGGCCAACUGUGCAGGAAGAUGGAGGGGAUGUAAUCUACAAAGGCUUUGAAGAUGGCAUUGUACAGCUGAAACUCCAGGGUUCUUGUACCAGCUGCCCUAGUUCAAUCAUUACUCUGAAAAAUGGAAUUCAGAACAUGCUGCAGUUUUAUAUUCCGGAGGUAGAAGGCGUAGAACAGGUUAUGGAUGAUGAAUCAGAUGAAAAAGAAGCAAACUCACCUUAAAAUAAUCUGGAUUUUCUUUGGGCAUAACAGUCAGACUUGUUGAUAAUAUAUAUCAAGUUUUUAUUAUUAAUAUGCUGAGGAACUUGAAGAUUAAUAAAAUAUGCUCUUCAGAGAAUGAUAUAUAAAA'
# Codon table as two parallel lists: d['mRNA'][k] codes for d['AA'][k];
# '_' marks a stop codon.
# NOTE(review): entry 34 ('AUA') is listed as 'M' here, whereas the standard
# genetic code assigns AUA to isoleucine ('I'). It is left untouched so that the
# output shown with this snippet stays reproducible.
d = {'mRNA': ['UUU','UUC','UUA','UUG','UCU','UCC','UCA','UCG','UAU','UAC','UAA','UAG','UGU','UGC','UGA','UGG','CUU','CUC','CUA','CUG','CCU','CCC','CCA','CCG','CAU','CAC','CAA','CAG','CGU','CGC','CGA','CGG','AUU','AUC','AUA','AUG','ACU','ACC','ACA','ACG','AAU','AAC','AAA','AAG','AGU','AGC','AGA','AGG','GUU','GUC','GUA','GUG','GCU','GCC','GCA','GCG','GAU','GAC','GAA','GAG','GGU','GGC','GGA','GGG'], 'AA': ['F','F','L','L','S','S','S','S','Y','Y','_','_','C','C','_','W','L','L','L','L','P','P','P','P','H','H','Q','Q','R','R','R','R','I','I','M','M','T','T','T','T','N','N','K','K','S','S','R','R','V','V','V','V','A','A','A','A','D','D','E','E','G','G','G','G']}
AA = pandas.DataFrame(data=d)

# For each residue of the protein, list every codon that could encode it.
for residue in xAA:
    codons = list(AA.mRNA.loc[AA['AA'] == residue])
    print(codons)
这是输出:
['AUA', 'AUG']
['GAU', 'GAC']
['UUU', 'UUC']
['UUU', 'UUC']
['GCU', 'GCC', 'GCA', 'GCG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['GGU', 'GGC', 'GGA', 'GGG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['CCU', 'CCC', 'CCA', 'CCG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['GUU', 'GUC', 'GUA', 'GUG']
['ACU', 'ACC', 'ACA', 'ACG']
['GAA', 'GAG']
['GAA', 'GAG']
['ACU', 'ACC', 'ACA', 'ACG']
['CCU', 'CCC', 'CCA', 'CCG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['GGU', 'GGC', 'GGA', 'GGG']
['GAA', 'GAG']
['GCU', 'GCC', 'GCA', 'GCG']
['GGU', 'GGC', 'GGA', 'GGG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['GAA', 'GAG']
['GAA', 'GAG']
['GAU', 'GAC']
['GAU', 'GAC']
['GAA', 'GAG']
['GUU', 'GUC', 'GUA', 'GUG']
['GUU', 'GUC', 'GUA', 'GUG']
['GCU', 'GCC', 'GCA', 'GCG']
['AUA', 'AUG']
['AUU', 'AUC']
['AAA', 'AAG']
['GAA', 'GAG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['GAU', 'GAC']
['ACU', 'ACC', 'ACA', 'ACG']
['CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG']
['AUU', 'AUC']
['CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG']
['CCU', 'CCC', 'CCA', 'CCG']
['ACU', 'ACC', 'ACA', 'ACG']
['GUU', 'GUC', 'GUA', 'GUG']
['CAA', 'CAG']
['GAA', 'GAG']
['GAU', 'GAC']
['GGU', 'GGC', 'GGA', 'GGG']
['GGU', 'GGC', 'GGA', 'GGG']
['GAU', 'GAC']
['GUU', 'GUC', 'GUA', 'GUG']
['AUU', 'AUC']
['UAU', 'UAC']
['AAA', 'AAG']
['GGU', 'GGC', 'GGA', 'GGG']
['UUU', 'UUC']
['GAA', 'GAG']
['GAU', 'GAC']
['GGU', 'GGC', 'GGA', 'GGG']
['AUU', 'AUC']
['GUU', 'GUC', 'GUA', 'GUG']
['CAA', 'CAG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['AAA', 'AAG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['CAA', 'CAG']
['GGU', 'GGC', 'GGA', 'GGG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['UGU', 'UGC']
['ACU', 'ACC', 'ACA', 'ACG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['UGU', 'UGC']
['CCU', 'CCC', 'CCA', 'CCG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['AUU', 'AUC']
['AUU', 'AUC']
['ACU', 'ACC', 'ACA', 'ACG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['AAA', 'AAG']
['AAU', 'AAC']
['GGU', 'GGC', 'GGA', 'GGG']
['AUU', 'AUC']
['CAA', 'CAG']
['AAU', 'AAC']
['AUA', 'AUG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['CAA', 'CAG']
['UUU', 'UUC']
['UAU', 'UAC']
['AUU', 'AUC']
['CCU', 'CCC', 'CCA', 'CCG']
['GAA', 'GAG']
['GUU', 'GUC', 'GUA', 'GUG']
['GAA', 'GAG']
['GGU', 'GGC', 'GGA', 'GGG']
['GUU', 'GUC', 'GUA', 'GUG']
['GAA', 'GAG']
['CAA', 'CAG']
['GUU', 'GUC', 'GUA', 'GUG']
['AUA', 'AUG']
['GAU', 'GAC']
['GAU', 'GAC']
['GAA', 'GAG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['GAU', 'GAC']
['GAA', 'GAG']
['AAA', 'AAG']
['GAA', 'GAG']
['GCU', 'GCC', 'GCA', 'GCG']
['AAU', 'AAC']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['CCU', 'CCC', 'CCA', 'CCG']
如果我添加此处显示的 for c 循环,我会得到
# Append every candidate codon of every residue to one growing string.
# Relies on xAA, xmRNA and the AA codon DataFrame already being defined.
codingseq = ""
for i in xAA:
    codons = list(AA.mRNA.loc[AA['AA'] == i])
    for c in codons:
        # NOTE(review): the position returned by find() is discarded, so this
        # lookup has no influence on what gets appended.
        xmRNA.find(c)
        codingseq += c
这给出了每个组合,有没有一种方法可以进行比较分析,找出其中哪一个最像完整的 mRNA 序列?
AUAAUG
AUAAUGGAUGAC
AUAAUGGAUGACUUUUUC
AUAAUGGAUGACUUUUUCUUUUUC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGGGAUGAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGGGAUGACGUUGUCGUAGUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGGGAUGACGUUGUCGUAGUGAUUAUC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGGGAUGACGUUGUCGUAGUGAUUAUCUAUUAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGGGAUGACGUUGUCGUAGUGAUUAUCUAUUACAAAAAG
请注意,由于超出了字符数限制,因此并未显示所有结果。 任何对此的帮助都会很棒!
这段代码会在 xmRNA 中找到与 xAA 匹配的密码子序列。注意:d["AA"] 中索引 34 处的数据已做更正(AUA 对应的 "M" 改为 "I"),以便与 http://web.expasy.org/ 站点使用的翻译表一致。我没有用过 pandas,所以这里只用了普通的 Python。
我只是把它当作一个简单的测试,用来找出 xAA 在 xmRNA 中的位置(以及所使用的密码子)。即使序列很长,它也应该足够快(即使是长度为 100000 的 RNA 序列,也应该几乎瞬间完成)。
# Locate the stretch of xmRNA that encodes the protein xAA and report the
# codons actually used, trying each of the three forward reading frames.
xAA = 'MDFFASGLPLVTEETPSGEAGSEEDDEVVAMIKELLDTRIRPTVQEDGGDVIYKGFEDGIVQLKLQGSCTSCPSSIITLKNGIQNMLQFYIPEVEGVEQVMDDESDEKEANSP'
xmRNA = 'GUUCCCCGGCCUCUCUUGGUCAGGGUGACGCAGUAGCCUGCAAACCUCGGCGCGUAGGCCACCGCACUUAUCCGCAGCAGGACCGCCCGCAGCCGGUAGGGUGGGCUCUUCCCAGUGCCCGCCCAGCUACCGGCCAGCCUGCGGCUGCGCAGAUCUUUCGUGGUUCUGUCAGGGAGACCCUUAGGCACUCCGGACUAAGAUGGCGGCGACGGCCAGGCGGGGCUGGGGAGCUGCGGCUGUUGCCGCCGGGCUGCGCAGGCGGUUCUGUCAUAUGUUGAAGAAUCCAUACACCAUUAAGAAACAGCCUCUGCAUCAGUUUGUACAAAGACCACUUUUCCCACUACCUGCAGCCUUUUAUCACCCAGGCAGUUAUUUAGGAUUGAAGGAGUAAAAAGUGUCUUCUUUGGACCAGAUUUCAUCACUGUCACAAAGGAAAAUGAAGAAUUAGACUGGAAUUUACUGAAACCAGAUAUUUAUGCAACAAUCAUGGACUUCUUUGCAUCUGGCUUACCCCUGGUUACUGAGGAAACACCUUCAGGAGAAGCAGGAUCUGAAGAAGAUGAUGAAGUUGUGGCAAUGAUUAAGGAAUUGUUAGAUACUAGAAUACGGCCAACUGUGCAGGAAGAUGGAGGGGAUGUAAUCUACAAAGGCUUUGAAGAUGGCAUUGUACAGCUGAAACUCCAGGGUUCUUGUACCAGCUGCCCUAGUUCAAUCAUUACUCUGAAAAAUGGAAUUCAGAACAUGCUGCAGUUUUAUAUUCCGGAGGUAGAAGGCGUAGAACAGGUUAUGGAUGAUGAAUCAGAUGAAAAAGAAGCAAACUCACCUUAAAAUAAUCUGGAUUUUCUUUGGGCAUAACAGUCAGACUUGUUGAUAAUAUAUAUCAAGUUUUUAUUAUUAAUAUGCUGAGGAACUUGAAGAUUAAUAAAAUAUGCUCUUCAGAGAAUGAUAUAUAAAA'
# Codon table as two parallel lists; entry 34 ('AUA') maps to 'I'.
d = {'mRNA': ['UUU','UUC','UUA','UUG','UCU','UCC','UCA','UCG','UAU','UAC','UAA','UAG','UGU','UGC','UGA','UGG','CUU','CUC','CUA','CUG','CCU','CCC','CCA','CCG','CAU','CAC','CAA','CAG','CGU','CGC','CGA','CGG','AUU','AUC','AUA','AUG','ACU','ACC','ACA','ACG','AAU','AAC','AAA','AAG','AGU','AGC','AGA','AGG','GUU','GUC','GUA','GUG','GCU','GCC','GCA','GCG','GAU','GAC','GAA','GAG','GGU','GGC','GGA','GGG'],
     'AA': ['F', 'F', 'L', 'L', 'S', 'S', 'S', 'S', 'Y', 'Y', '_', '_', 'C', 'C', '_','W','L','L','L','L','P','P','P','P','H','H','Q','Q','R','R','R','R','I','I', 'I' ,'M','T','T','T','T','N','N','K','K','S','S','R','R','V','V','V','V','A','A','A','A','D','D','E','E','G','G','G','G']}

# Codon -> one-letter amino acid ('_' marks a stop codon).
r2a = dict(zip(d['mRNA'], d['AA']))

found = False
for s in (0, 1, 2):
    # t3 = the complete codons of the reading frame starting at offset s;
    # a trailing partial codon (1 or 2 nucleotides) is dropped.
    t3 = [xmRNA[i:i + 3] for i in range(s, len(xmRNA) - 2, 3)]
    # Translate to amino acids. A triplet missing from the table becomes
    # '?', which cannot match a residue, instead of raising KeyError.
    aa = ''.join(r2a.get(codon, '?') for codon in t3)
    print(aa)
    # One letter per codon, so a match index in aa is a codon index in t3.
    idx = aa.find(xAA)
    if idx < 0:
        continue
    # Found it: convert the codon index back to a nucleotide position.
    pos = idx * 3 + s
    t3 = t3[idx:idx + len(xAA)]
    found = True
    break

if found:
    print("found in frame {} at {} (pos={})".format(s + 1, idx, pos))
    print("codons: " + repr(t3))
else:
    print("Not found")
这是一个相当紧凑的解决方案。由于每个密码子只对应一种氨基酸,创建反向字典(密码子 → 氨基酸)时不会产生任何冲突,所以你可以这样做:
# Amino acid (one-letter code, '_' = stop) -> every codon that encodes it.
codon_dict = {'F': ('UUU', 'UUC'),
              'L': ('UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG'),
              'S': ('UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC'),
              'Y': ('UAU', 'UAC'),
              '_': ('UAA', 'UAG', 'UGA'),
              'C': ('UGU', 'UGC'),
              'W': ('UGG',),
              'P': ('CCU', 'CCC', 'CCA', 'CCG'),
              'H': ('CAU', 'CAC'),
              'Q': ('CAA', 'CAG'),
              'R': ('CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'),
              'I': ('AUU', 'AUC', 'AUA'),
              'M': ('AUG',),
              'T': ('ACU', 'ACC', 'ACA', 'ACG'),
              'N': ('AAU', 'AAC'),
              'K': ('AAA', 'AAG'),
              'V': ('GUU', 'GUC', 'GUA', 'GUG'),
              'A': ('GCU', 'GCC', 'GCA', 'GCG'),
              'D': ('GAU', 'GAC'),
              'E': ('GAA', 'GAG'),
              'G': ('GGU', 'GGC', 'GGA', 'GGG')
              }

# Reverse lookup, codon -> amino acid. No codon appears under two amino
# acids, so inverting the table loses nothing.
rna_dict = {codon: aa for aa, codons in codon_dict.items() for codon in codons}


def translate_frame(mrna, frame=0):
    """Translate *mrna* codon by codon, starting at nucleotide offset *frame*.

    Only complete triplets are translated; one or two leftover nucleotides
    at the end are ignored rather than looked up. Triplets that are not in
    ``rna_dict`` are reported as ``'?'`` so they can never match a residue.

    Returns a list with one single-letter code per codon.
    """
    return [rna_dict.get(mrna[i:i + 3], '?')
            for i in range(frame, len(mrna) - 2, 3)]


def find_coding_start(mrna, protein):
    """Return the 0-based nucleotide index at which *protein* is encoded.

    Reading frames 0, 1 and 2 are tried in that order and the first match
    in the first matching frame wins. Returns -1 when no frame encodes
    *protein*, mirroring ``str.find``.
    """
    for frame in range(3):
        hit = "".join(translate_frame(mrna, frame)).find(protein)
        if hit != -1:
            # One letter per codon: scale back to nucleotides, add the offset.
            return frame + 3 * hit
    return -1


xAA = 'MDFFASGLPLVTEETPSGEAGSEEDDEVVAMIKELLDTRIRPTVQEDGGDVIYKGFEDGIVQLKLQGSCTSCPSSIITLKNGIQNMLQFYIPEVEGVEQVMDDESDEKEANSP'
xmRNA = 'GUUCCCCGGCCUCUCUUGGUCAGGGUGACGCAGUAGCCUGCAAACCUCGGCGCGUAGGCCACCGCACUUAUCCGCAGCAGGACCGCCCGCAGCCGGUAGGGUGGGCUCUUCCCAGUGCCCGCCCAGCUACCGGCCAGCCUGCGGCUGCGCAGAUCUUUCGUGGUUCUGUCAGGGAGACCCUUAGGCACUCCGGACUAAGAUGGCGGCGACGGCCAGGCGGGGCUGGGGAGCUGCGGCUGUUGCCGCCGGGCUGCGCAGGCGGUUCUGUCAUAUGUUGAAGAAUCCAUACACCAUUAAGAAACAGCCUCUGCAUCAGUUUGUACAAAGACCACUUUUCCCACUACCUGCAGCCUUUUAUCACCCAGGCAGUUAUUUAGGAUUGAAGGAGUAAAAAGUGUCUUCUUUGGACCAGAUUUCAUCACUGUCACAAAGGAAAAUGAAGAAUUAGACUGGAAUUUACUGAAACCAGAUAUUUAUGCAACAAUCAUGGACUUCUUUGCAUCUGGCUUACCCCUGGUUACUGAGGAAACACCUUCAGGAGAAGCAGGAUCUGAAGAAGAUGAUGAAGUUGUGGCAAUGAUUAAGGAAUUGUUAGAUACUAGAAUACGGCCAACUGUGCAGGAAGAUGGAGGGGAUGUAAUCUACAAAGGCUUUGAAGAUGGCAUUGUACAGCUGAAACUCCAGGGUUCUUGUACCAGCUGCCCUAGUUCAAUCAUUACUCUGAAAAAUGGAAUUCAGAACAUGCUGCAGUUUUAUAUUCCGGAGGUAGAAGGCGUAGAACAGGUUAUGGAUGAUGAAUCAGAUGAAAAAGAAGCAAACUCACCUUAAAAUAAUCUGGAUUUUCUUUGGGCAUAACAGUCAGACUUGUUGAUAAUAUAUAUCAAGUUUUUAUUAUUAAUAUGCUGAGGAACUUGAAGAUUAAUAAAAUAUGCUCUUCAGAGAAUGAUAUAUAAAA'

# Frame-0 translation of the whole transcript, one letter per codon.
mapped = translate_frame(xmRNA)
# Nucleotide index where the codons for xAA begin, or -1 if absent.
xmRNA_index = find_coding_start(xmRNA, xAA)
print(xmRNA_index)
这会返回 486,也就是编码 xAA 的密码子序列在 xmRNA 中的起始位置。可以通过对字符串做边界检查来进一步完善这段代码;如果你只需要最接近的匹配,也可以对子字符串做组合匹配(我仍然会先做反向映射,然后在映射后的氨基酸序列中搜索,这样更快)。另外,我不清楚无法映射的缺失字符或三联体有多常见,因此可能需要针对这些情况做修改。