如何编辑 Python 脚本?

How to edit a Python script?

我在尝试修改一个可以正常运行的 Python 脚本时遇到了困难。

我有 2 个文件:

  1. 包含 ID 的 .txt 文件
  2. 一个包含 Fasta 序列及其 ID 的 .fasta 文件。

此脚本的目的是比较这两个文件:一旦第一个文件中的某个 ID 与第二个文件中某条序列的 ID 匹配,就应输出该 ID,以及对应的完整序列(连同序列自己的 ID)。

我目前的脚本输出的是第一个文件的 ID,以及不带 ID 的序列文本。

(原帖此处为一张截图,图片未能保留。)

这是脚本:

# Walk the FASTA file line by line. Header lines yield the current accession;
# every other line is printed and then looked up in the ID table.
with open('uniprot_reviewed_taxonomy_9606.fasta', 'r') as f:
    for line in f.readlines():
        line = line.replace("\n", "")
        if line.startswith('>'):
            # Header line: the accession is the second '|'-separated field.
            # Only the accession is printed, not the header itself.
            full_name = line.split('|')
            accession_x = full_name[1]
            print(accession_x)
        else:
           # NOTE(review): this print is indented 11 columns while the `with`
           # below is indented 12 -- the indentation of this branch looks
           # inconsistent and should be confirmed.
           print (line)

            # The ID table is reopened and re-read for every non-header line
            # of the FASTA file.
            with open('homosapiens_output1.txt', 'r') as f1:
                for line1 in f1.readlines()[1:]:  # ignores the first line
                    line1 = line1.replace("\n", "")

                    # Columns are split on single spaces, so the replace(" ", "")
                    # calls below have nothing left to remove.
                    full_name1 = line1.split(' ')
                    accession_y = full_name1[0].replace(" ", "")
                    accession_z = full_name1[1].replace(" ", "")
                    # `line` here is the current sequence line only; the FASTA
                    # header text is not part of the printed record.
                    main_accession = accession_x + " " + accession_z + " " + accession_y + " " + line

                    # Print only rows whose second column equals the accession
                    # taken from the most recent header line.
                    if accession_x == accession_z:
                        print(main_accession)

所以,你能帮我修改这个脚本,使输出包含 ID、Fasta 序列及其 ID 吗?

这是一个小测试示例,说明如何做到这一点。

代码:

# Fixture 1: a small FASTA file holding three UniProt-style records.
test_content = """>tr|Q53XC5|Q53XC5_HUMAN Bone morphogenetic protein 4 OS=Homo sapiens OX=9606 GN=BMP4 PE=2 SV=1
MIPGNRMLMVVLLCQVLLGGASHASLIPETGKKKVAEIQGHAGGRRSGQSHELLRDFEAT
LLQMFGLRRRPQPSKSAVIPDYMRDLYRLQSGEEEEEQIHSTGLEYPERPASRANTVRSF
HHEEHLENIPGTSENSAFRFLFNLSSIPENEVISSAELRLFREQVDQGPDWERGFHRINI
YEVMKPPAEVVPGHLITRLLDTRLVHHNVTRWETFDVSPAVLRWTREKQPNYGLAIEVTH
LHQTRTHQGQHVRISRSLPQGSGNWAQLRPLLVTFGHDGRGHALTRRRRAKRSPKHHSQR
ARKKNKNCRRHSLYVDFSDVGWNDWIVAPPGYQAFYCHGDCPFPLADHLNSTNHAIVQTL
VNSVNSSIPKACCVPTELSAISMLYLDEYDKVVLKNYQEMVVEGCGCR
>tr|A8K571|A8K571_HUMAN Bone morphogenetic protein 7 (Osteogenic protein 1), isoform CRA_b OS=Homo sapiens OX=9606 GN=BMP7 PE=2 SV=1
MHVRSLRAAAPHSFVALWAPLFLLRSALADFSLDNEVHSSFIHRRLRSQERREMQREILS
ILGLPHRPRPHLQGKHNSAPMFMLDLYNAMAVEEGGGPGGQGFSYPYKAVFSTQGPPLAS
LQDSHFLTDADMVMSFVNLVEHDKEFFHPRYHHREFRFDLSKIPEGEAVTAAEFRIYKDY
IRERFDNETFRISVYQVLQEHLGRESDLFLLDSRTLWASEEGWLVFDITATSNHWVVNPR
HNLGLQLSVETLDGQSINPKLAGLIGRHGPQNKQPFMVAFFKATEVHFRSIRSTGSKQRS
QNRSKTPKNQEALRMANVAENSSSDQRQACKKHELYVSFRDLGWQDWIIAPEGYAAYYCE
GECAFPLNSYMNATNHAIVQTLVHFINPETVPKPCCAPTQLNAISVLYFDDSSNVILKKY
RNMVVRACGCH
>tr|A8K660|A8K660_HUMAN Adiponectin OS=Homo sapiens OX=9606 GN=ADIPOQ PE=2 SV=1
MLLLGAVLLLLALPGHDQETTTQGPGVLLPLPKGACTGWMAGIPGHPGHNGAPGRDGRDG
TPGEKGEKGDPGLIGPKGDIGETGVPGAEGPRGFPGIQGRKGEPGEGAYVYRSAFSVGLE
TYVTIPNMPIRFTKIFYNQQNHYDGSTGKFHCNIPGLYYFAYHITVYMKDVKVSLFKKDK
AMLFTYDQYQENNVDQASGSVLLHLEVGDQVWLQVYGEGERNGLYADNDNDSTFTGFLLY
HDTN
"""

# Fixture 2: the accessions to look up, one per line.
test_ids = 'A8K660\nQ53XC5\n'

with open('test_sequences.fasta', 'w') as fasta_out:
    fasta_out.write(test_content)

with open('test_ids.txt', 'w') as ids_out:
    ids_out.write(test_ids)

# Step 1: read the FASTA file in one go and cut it into records at every '>'.
with open('test_sequences.fasta', 'r') as fasta_in:
    lines = fasta_in.read().split('>')

# Step 2: map accession -> sequence text. The accession is the second
# '|'-separated field of the header line; the sequence is everything after it.
sequences = {}
for record in lines:
    if not record:
        # The piece in front of the first '>' is empty.
        continue
    header_end = record.find('\n')
    accession = record[:header_end].split('|')[1]
    sequences[accession] = record[header_end + 1:]

# Step 3: load the requested ids, dropping the trailing newline of each line.
with open('test_ids.txt', 'r') as ids_in:
    ids = [raw.strip() for raw in ids_in.readlines()]

# Step 4: keep only the ids that actually have a sequence.
filtered_ids = [wanted for wanted in ids if wanted in sequences]

# Step 5: write one '>|<id>' header plus the stored sequence per kept id.
with open('filtered_ids.txt', 'w') as result_out:
    for wanted in filtered_ids:
        result_out.write('>|' + wanted + '\n' + sequences[wanted])

# the final function:

def ids_filter(ids_file, seq_file, out_file):
    """Write the FASTA records whose accession is listed in *ids_file*.

    The accession of a record is the second ``|``-separated field of its
    header line (``>tr|Q53XC5|Q53XC5_HUMAN ...`` gives ``Q53XC5``). Records
    are written in the order the ids appear in *ids_file*; ids without a
    matching record are dropped. Each output record is a ``>|<id>`` header
    line followed by the record's sequence lines.

    Args:
        ids_file: Path of a text file with one accession per line.
        seq_file: Path of the FASTA file to search.
        out_file: Path of the output file; it is opened in ``'w'`` mode.
    """
    sequences = {}
    # Sequence lines of the record currently being read, or None while no
    # usable record is open (before the first header, or after a header that
    # carries no '|'-delimited accession).
    current = None
    with open(seq_file, 'r') as f:
        # Parse line by line: only a '>' at the start of a line opens a new
        # record, so a '>' inside a description does not split the record.
        for line in f:
            if line.startswith('>'):
                fields = line[1:].rstrip('\n').split('|')
                if len(fields) > 1:
                    # A repeated accession replaces the earlier record.
                    current = sequences[fields[1]] = []
                else:
                    current = None
            elif current is not None:
                # The last line of the file may lack its newline; add it so
                # the next header written to out_file starts on its own line.
                current.append(line if line.endswith('\n') else line + '\n')

    with open(ids_file, 'r') as f:
        ids = [line.strip() for line in f]

    with open(out_file, 'w') as f:
        for id_ in ids:
            if id_ in sequences:
                f.write('>|' + id_ + '\n')
                f.writelines(sequences[id_])