如何编辑 Python 脚本?
How to edit a Python script?
我在修改一个能够正常运行的 Python 脚本时遇到了困难。
我有 2 个文件:
- 包含 ID 的 .txt 文件
- 一个包含 Fasta 序列及其 ID 的 .fasta 文件。
此脚本的目的是比较这两个文件:一旦第一个文件中的 ID 与第二个文件中某条序列的 ID 匹配,就应输出该 ID、完整序列及其 ID。
我目前的脚本输出的是第一个文件的 ID,以及不带 ID 的序列文本。
enter image description here
这是脚本:
# Asker's original script, reproduced unchanged.
# NOTE(review): the indentation of this snippet was lost, so how the second
# `with` block nests relative to the first loop cannot be determined from the
# text alone -- confirm against the original post before running it.
# Read the FASTA file line by line.
with open('uniprot_reviewed_taxonomy_9606.fasta', 'r') as f:
for line in f.readlines():
# Drop the line break.
line = line.replace("\n", "")
if line.startswith('>'):
# Header line: the accession is taken as the second '|'-separated field.
full_name = line.split('|')
accession_x = full_name[1]
print(accession_x)
else:
# Any line that does not start with '>' is printed as-is.
print (line)
# Second input file (presumably the ID list described in the question).
with open('homosapiens_output1.txt', 'r') as f1:
for line1 in f1.readlines()[1:]: # ignores the first line
line1 = line1.replace("\n", "")
# Split on single spaces; only the first two fields are used below.
full_name1 = line1.split(' ')
accession_y = full_name1[0].replace(" ", "")
accession_z = full_name1[1].replace(" ", "")
# Candidate output: FASTA accession, the two fields from this file, and the
# current value of `line` from the FASTA loop.
main_accession = accession_x + " " + accession_z + " " + accession_y + " " + line
# Printed only when the FASTA accession equals the second field.
if accession_x == accession_z:
print(main_accession)
所以,你能帮我修改脚本,使输出为 ID、FASTA 序列及其 ID 吗?
这是一个小测试示例,说明如何做到这一点。
代码:
# Demo: write a small FASTA file and an ID list to the current directory,
# then copy only the sequences whose accession is in the ID list to
# 'filtered_ids.txt'.

# create test fasta:
test_content = """>tr|Q53XC5|Q53XC5_HUMAN Bone morphogenetic protein 4 OS=Homo sapiens OX=9606 GN=BMP4 PE=2 SV=1
MIPGNRMLMVVLLCQVLLGGASHASLIPETGKKKVAEIQGHAGGRRSGQSHELLRDFEAT
LLQMFGLRRRPQPSKSAVIPDYMRDLYRLQSGEEEEEQIHSTGLEYPERPASRANTVRSF
HHEEHLENIPGTSENSAFRFLFNLSSIPENEVISSAELRLFREQVDQGPDWERGFHRINI
YEVMKPPAEVVPGHLITRLLDTRLVHHNVTRWETFDVSPAVLRWTREKQPNYGLAIEVTH
LHQTRTHQGQHVRISRSLPQGSGNWAQLRPLLVTFGHDGRGHALTRRRRAKRSPKHHSQR
ARKKNKNCRRHSLYVDFSDVGWNDWIVAPPGYQAFYCHGDCPFPLADHLNSTNHAIVQTL
VNSVNSSIPKACCVPTELSAISMLYLDEYDKVVLKNYQEMVVEGCGCR
>tr|A8K571|A8K571_HUMAN Bone morphogenetic protein 7 (Osteogenic protein 1), isoform CRA_b OS=Homo sapiens OX=9606 GN=BMP7 PE=2 SV=1
MHVRSLRAAAPHSFVALWAPLFLLRSALADFSLDNEVHSSFIHRRLRSQERREMQREILS
ILGLPHRPRPHLQGKHNSAPMFMLDLYNAMAVEEGGGPGGQGFSYPYKAVFSTQGPPLAS
LQDSHFLTDADMVMSFVNLVEHDKEFFHPRYHHREFRFDLSKIPEGEAVTAAEFRIYKDY
IRERFDNETFRISVYQVLQEHLGRESDLFLLDSRTLWASEEGWLVFDITATSNHWVVNPR
HNLGLQLSVETLDGQSINPKLAGLIGRHGPQNKQPFMVAFFKATEVHFRSIRSTGSKQRS
QNRSKTPKNQEALRMANVAENSSSDQRQACKKHELYVSFRDLGWQDWIIAPEGYAAYYCE
GECAFPLNSYMNATNHAIVQTLVHFINPETVPKPCCAPTQLNAISVLYFDDSSNVILKKY
RNMVVRACGCH
>tr|A8K660|A8K660_HUMAN Adiponectin OS=Homo sapiens OX=9606 GN=ADIPOQ PE=2 SV=1
MLLLGAVLLLLALPGHDQETTTQGPGVLLPLPKGACTGWMAGIPGHPGHNGAPGRDGRDG
TPGEKGEKGDPGLIGPKGDIGETGVPGAEGPRGFPGIQGRKGEPGEGAYVYRSAFSVGLE
TYVTIPNMPIRFTKIFYNQQNHYDGSTGKFHCNIPGLYYFAYHITVYMKDVKVSLFKKDK
AMLFTYDQYQENNVDQASGSVLLHLEVGDQVWLQVYGEGERNGLYADNDNDSTFTGFLLY
HDTN
"""
with open('test_sequences.fasta', 'w') as f:
    f.write(test_content)

# create test ids:
test_ids = 'A8K660\nQ53XC5\n'
with open('test_ids.txt', 'w') as f:
    f.write(test_ids)

# Load all sequences and store them in dict:
# accession (second '|'-separated header field) -> sequence text, with the
# original line breaks kept.
sequences = {}
with open('test_sequences.fasta', 'r') as f:
    current_id = None
    for line in f:
        if line.startswith('>'):
            # A record starts only on a line that begins with '>', so a '>'
            # inside a description can never split a record in two.
            current_id = line[1:].rstrip('\n').split('|')[1]
            sequences[current_id] = ''
        elif current_id is not None:
            sequences[current_id] += line

# import ids:
with open('test_ids.txt', 'r') as f:
    ids = [line.strip() for line in f]  # remove \n from id end

# checking ids
# and filter out those that are not in sequences dict
filtered_ids = [id_ for id_ in ids if id_ in sequences]

# writing new file with filtered sequences:
with open('filtered_ids.txt', 'w') as f:
    for id_ in filtered_ids:
        # The output header is reduced to '>|<accession>'.
        f.write('>|' + id_ + '\n')
        f.write(sequences[id_])
# the final function:
def ids_filter(ids_file, seq_file, out_file):
    """Copy the FASTA records whose accession is listed in ``ids_file``.

    Args:
        ids_file: Path to a text file with one accession per line.
            Surrounding whitespace is stripped; blank lines are ignored.
        seq_file: Path to a FASTA file. The accession of a record is the
            second ``|``-separated field of its header line (for example
            ``>tr|A8K660|A8K660_HUMAN ...`` gives ``A8K660``). If the header
            has no ``|``, its first whitespace-delimited word is used.
        out_file: Path of the file to write. Each matching record is written
            as a ``>|<accession>`` header line followed by its sequence
            lines, in the order the accessions appear in ``ids_file``.

    Returns:
        None. The result is written to ``out_file``.
    """
    # accession -> list of raw sequence lines (line breaks kept)
    records = {}
    current = None
    with open(seq_file, 'r') as f:
        for line in f:
            if line.startswith('>'):
                # Only a line that begins with '>' starts a record, so a '>'
                # inside a description cannot split a record in two.
                header = line[1:].strip()
                fields = header.split('|')
                if len(fields) > 1:
                    accession = fields[1].strip()
                else:
                    words = header.split()
                    accession = words[0] if words else ''
                if accession:
                    # A repeated accession replaces the earlier record.
                    current = records[accession] = []
                else:
                    # No usable accession: skip this record's lines.
                    current = None
            elif current is not None:
                current.append(line)

    with open(ids_file, 'r') as f:
        wanted = [line.strip() for line in f]

    with open(out_file, 'w') as f:
        for id_ in wanted:
            if id_ not in records:
                continue
            sequence = ''.join(records[id_])
            f.write('>|' + id_ + '\n')
            f.write(sequence)
            # If the input's last line had no trailing newline, add one so
            # the next header does not end up on the sequence line.
            if sequence and not sequence.endswith('\n'):
                f.write('\n')
我在修改一个能够正常运行的 Python 脚本时遇到了困难。
我有 2 个文件:
- 包含 ID 的 .txt 文件
- 一个包含 Fasta 序列及其 ID 的 .fasta 文件。
此脚本的目的是比较这两个文件:一旦第一个文件中的 ID 与第二个文件中某条序列的 ID 匹配,就应输出该 ID、完整序列及其 ID。
我目前的脚本输出的是第一个文件的 ID,以及不带 ID 的序列文本。
enter image description here
这是脚本:
# Asker's original script, reproduced unchanged.
# NOTE(review): the indentation of this snippet was lost, so how the second
# `with` block nests relative to the first loop cannot be determined from the
# text alone -- confirm against the original post before running it.
# Read the FASTA file line by line.
with open('uniprot_reviewed_taxonomy_9606.fasta', 'r') as f:
for line in f.readlines():
# Drop the line break.
line = line.replace("\n", "")
if line.startswith('>'):
# Header line: the accession is taken as the second '|'-separated field.
full_name = line.split('|')
accession_x = full_name[1]
print(accession_x)
else:
# Any line that does not start with '>' is printed as-is.
print (line)
# Second input file (presumably the ID list described in the question).
with open('homosapiens_output1.txt', 'r') as f1:
for line1 in f1.readlines()[1:]: # ignores the first line
line1 = line1.replace("\n", "")
# Split on single spaces; only the first two fields are used below.
full_name1 = line1.split(' ')
accession_y = full_name1[0].replace(" ", "")
accession_z = full_name1[1].replace(" ", "")
# Candidate output: FASTA accession, the two fields from this file, and the
# current value of `line` from the FASTA loop.
main_accession = accession_x + " " + accession_z + " " + accession_y + " " + line
# Printed only when the FASTA accession equals the second field.
if accession_x == accession_z:
print(main_accession)
所以,你能帮我修改脚本,使输出为 ID、FASTA 序列及其 ID 吗?
这是一个小测试示例,说明如何做到这一点。
代码:
# Demo: write a small FASTA file and an ID list to the current directory,
# then copy only the sequences whose accession is in the ID list to
# 'filtered_ids.txt'.

# create test fasta:
test_content = """>tr|Q53XC5|Q53XC5_HUMAN Bone morphogenetic protein 4 OS=Homo sapiens OX=9606 GN=BMP4 PE=2 SV=1
MIPGNRMLMVVLLCQVLLGGASHASLIPETGKKKVAEIQGHAGGRRSGQSHELLRDFEAT
LLQMFGLRRRPQPSKSAVIPDYMRDLYRLQSGEEEEEQIHSTGLEYPERPASRANTVRSF
HHEEHLENIPGTSENSAFRFLFNLSSIPENEVISSAELRLFREQVDQGPDWERGFHRINI
YEVMKPPAEVVPGHLITRLLDTRLVHHNVTRWETFDVSPAVLRWTREKQPNYGLAIEVTH
LHQTRTHQGQHVRISRSLPQGSGNWAQLRPLLVTFGHDGRGHALTRRRRAKRSPKHHSQR
ARKKNKNCRRHSLYVDFSDVGWNDWIVAPPGYQAFYCHGDCPFPLADHLNSTNHAIVQTL
VNSVNSSIPKACCVPTELSAISMLYLDEYDKVVLKNYQEMVVEGCGCR
>tr|A8K571|A8K571_HUMAN Bone morphogenetic protein 7 (Osteogenic protein 1), isoform CRA_b OS=Homo sapiens OX=9606 GN=BMP7 PE=2 SV=1
MHVRSLRAAAPHSFVALWAPLFLLRSALADFSLDNEVHSSFIHRRLRSQERREMQREILS
ILGLPHRPRPHLQGKHNSAPMFMLDLYNAMAVEEGGGPGGQGFSYPYKAVFSTQGPPLAS
LQDSHFLTDADMVMSFVNLVEHDKEFFHPRYHHREFRFDLSKIPEGEAVTAAEFRIYKDY
IRERFDNETFRISVYQVLQEHLGRESDLFLLDSRTLWASEEGWLVFDITATSNHWVVNPR
HNLGLQLSVETLDGQSINPKLAGLIGRHGPQNKQPFMVAFFKATEVHFRSIRSTGSKQRS
QNRSKTPKNQEALRMANVAENSSSDQRQACKKHELYVSFRDLGWQDWIIAPEGYAAYYCE
GECAFPLNSYMNATNHAIVQTLVHFINPETVPKPCCAPTQLNAISVLYFDDSSNVILKKY
RNMVVRACGCH
>tr|A8K660|A8K660_HUMAN Adiponectin OS=Homo sapiens OX=9606 GN=ADIPOQ PE=2 SV=1
MLLLGAVLLLLALPGHDQETTTQGPGVLLPLPKGACTGWMAGIPGHPGHNGAPGRDGRDG
TPGEKGEKGDPGLIGPKGDIGETGVPGAEGPRGFPGIQGRKGEPGEGAYVYRSAFSVGLE
TYVTIPNMPIRFTKIFYNQQNHYDGSTGKFHCNIPGLYYFAYHITVYMKDVKVSLFKKDK
AMLFTYDQYQENNVDQASGSVLLHLEVGDQVWLQVYGEGERNGLYADNDNDSTFTGFLLY
HDTN
"""
with open('test_sequences.fasta', 'w') as f:
    f.write(test_content)

# create test ids:
test_ids = 'A8K660\nQ53XC5\n'
with open('test_ids.txt', 'w') as f:
    f.write(test_ids)

# Load all sequences and store them in dict:
# accession (second '|'-separated header field) -> sequence text, with the
# original line breaks kept.
sequences = {}
with open('test_sequences.fasta', 'r') as f:
    current_id = None
    for line in f:
        if line.startswith('>'):
            # A record starts only on a line that begins with '>', so a '>'
            # inside a description can never split a record in two.
            current_id = line[1:].rstrip('\n').split('|')[1]
            sequences[current_id] = ''
        elif current_id is not None:
            sequences[current_id] += line

# import ids:
with open('test_ids.txt', 'r') as f:
    ids = [line.strip() for line in f]  # remove \n from id end

# checking ids
# and filter out those that are not in sequences dict
filtered_ids = [id_ for id_ in ids if id_ in sequences]

# writing new file with filtered sequences:
with open('filtered_ids.txt', 'w') as f:
    for id_ in filtered_ids:
        # The output header is reduced to '>|<accession>'.
        f.write('>|' + id_ + '\n')
        f.write(sequences[id_])
# the final function:
def ids_filter(ids_file, seq_file, out_file):
    """Copy the FASTA records whose accession is listed in ``ids_file``.

    Args:
        ids_file: Path to a text file with one accession per line.
            Surrounding whitespace is stripped; blank lines are ignored.
        seq_file: Path to a FASTA file. The accession of a record is the
            second ``|``-separated field of its header line (for example
            ``>tr|A8K660|A8K660_HUMAN ...`` gives ``A8K660``). If the header
            has no ``|``, its first whitespace-delimited word is used.
        out_file: Path of the file to write. Each matching record is written
            as a ``>|<accession>`` header line followed by its sequence
            lines, in the order the accessions appear in ``ids_file``.

    Returns:
        None. The result is written to ``out_file``.
    """
    # accession -> list of raw sequence lines (line breaks kept)
    records = {}
    current = None
    with open(seq_file, 'r') as f:
        for line in f:
            if line.startswith('>'):
                # Only a line that begins with '>' starts a record, so a '>'
                # inside a description cannot split a record in two.
                header = line[1:].strip()
                fields = header.split('|')
                if len(fields) > 1:
                    accession = fields[1].strip()
                else:
                    words = header.split()
                    accession = words[0] if words else ''
                if accession:
                    # A repeated accession replaces the earlier record.
                    current = records[accession] = []
                else:
                    # No usable accession: skip this record's lines.
                    current = None
            elif current is not None:
                current.append(line)

    with open(ids_file, 'r') as f:
        wanted = [line.strip() for line in f]

    with open(out_file, 'w') as f:
        for id_ in wanted:
            if id_ not in records:
                continue
            sequence = ''.join(records[id_])
            f.write('>|' + id_ + '\n')
            f.write(sequence)
            # If the input's last line had no trailing newline, add one so
            # the next header does not end up on the sequence line.
            if sequence and not sequence.endswith('\n'):
                f.write('\n')