输出是一个空文件

Output is an empty file

我的代码没有抛出任何错误,它只是创建了文件,但这些文件是空的。我在命令行中试过,使用通配符路径 training_set_pssm/*.pssm 时它可以运行,但我必须在 IDE 中执行它,而且无论用哪种方式,它都没有输出正确的结果。
输入文件是一组检查点文件,如下所示:

这个文件本质上是一个保存为 .pssm 的文本文件,我只需要提取其中位于右侧的 PROFILE 部分,并同时对其进行归一化......我的代码似乎没有正确地做到这一点,而在 IDE 中运行时它根本什么都没做,所以我不确定现在需要修改脚本中的哪些地方才能实现这一点。

代码如下:

#!/usr/bin/env python3
import sys
import os.path
from pathlib import Path


def pssm_list(infile):
    """Load the body lines of a pssm file.

    Opens ``infile`` in text mode and returns its lines as a list (each
    entry keeps its line ending), leaving out the first 3 lines and the
    last 6 lines of the file.
    """
    with open(infile) as handle:
        every_line = list(handle)
    # Slice away the 3 leading and the 6 trailing lines.
    return every_line[3:-6]

def lines_to_list(infile1):
    """Return every line of ``infile1`` as a list of strings.

    The entries keep their trailing newline character, so callers have to
    rstrip them before using them, e.g. as file names.
    """
    with open(infile1, 'r') as handle:
        return handle.readlines()

def relevant_lines(infile2):
    """Takes list (extracted from a .pssm file) and extracts the Sequence Profile Portion only.

    ``infile2`` is handed to pssm_list to obtain the lines. Every line is
    split on whitespace and the fields at positions 22 up to (but not
    including) 42 are kept.

    Returns a list of lists: one inner list of strings per line.
    """
    return [row.split()[22:42] for row in pssm_list(infile2)]


# Divide all values by 100
def write_normalized_profile(profile_final_list, ofile):
    """Append the profile, with every value divided by 100, to ``ofile``.

    ``profile_final_list`` is a list of sublists whose elements are
    converted with int(). Each number is divided by 100, turned into a
    string and written followed by a tab; a newline is written after
    every sublist. The file is opened in append mode, so whatever it
    already contains is kept.
    """
    with open(ofile, "a") as sink:
        for row in profile_final_list:
            # One tab-terminated cell per value, written as it is produced.
            sink.writelines(str(int(value) / 100) + '\t' for value in row)
            sink.write("\n")





if __name__ == '__main__':
    # Script entry point: for every id in the id list, extract the profile
    # and write it, normalized, to '<id>.profile'.
    # infile = sys.argv[1]
    # NOTE(review): infile is assigned once here and never changes inside the
    # loop below, so every iteration tests and reads this same path.
    infile = ('/Users/name/Desktop/PDB/training_set_pssm/idlist/')  # the idlist to loop on
    #print(infile)
    # Call the function by looping through an id list + the '.pssm' extension;
    # name the outfile the same --> id + '.profile'
    idlist = lines_to_list("/Users/name/Desktop/PDB/training_set_idlist")  # contains the id of each file but NOT the extension ".pssm"
    #print(idlist)
    for ids in idlist:
        #print(ids)
        # NOTE(review): part2 is built here but not referenced again in this block.
        part2 = ids.rstrip() + '.pssm'  # removing the newline character, adding the necessary extension
        #print(part2)
        if os.path.isfile(infile) == True:  # does this file exist (checks infile, not part2)
            ofile = ids.rstrip() + '.profile'  # outfile for each id with the correct extension
            #print(ofile)
            profile_list = relevant_lines(infile)  # list of lists taken from infile
            #print(profile_list)
            write_normalized_profile(profile_list, ofile)  # appends to ofile (relative path)
            #print(write_normalized_profile)
            #print(profile_list)

        else:
            # Reported once per id, always naming the same infile path.
            print("Error file: " + infile + " not found.")

首先让我们修正你的路径,你导入了 from pathlib import Path 但从未使用过它。

让我们声明 infile = Path('/Users/name/Desktop/PDB/training_set_pssm/idlist/'),这样我们就可以使用一些有用的方法来定位问题。

试试下面的几个方法,以确保您查找的位置是正确的。

# Returns the absolute form of the path, which is useful for checking that it is correct
infile.absolute()

# Tells you whether this path exists
infile.exists()

# Tells you whether this path is a file
infile.is_file()

让我们从头开始,我将尝试逐行解释您的代码中发生的事情。

if __name__ == '__main__':
    # Annotated walkthrough of the script's entry point. The logic is left
    # as it is under discussion; only the path check uses pathlib.

    # It is not clear what this infile is. Is it a file containing
    # d1s7za_.fasta.pssm
    # d1s98a_.fasta.pssm
    # d1s99a_.fasta.pssm
    #
    # or a directory containing files named
    # d1s7za_.fasta.pssm
    # d1s98a_.fasta.pssm
    # d1s99a_.fasta.pssm
    # ...
    infile = Path('/Users/name/Desktop/PDB/training_set_pssm/idlist')

    # This returns a list of strings, presumably in the form of
    # d1ciya2.fasta\n
    # d1ciya3.fasta\n
    # d1cq3a_.fasta\n
    idlist = lines_to_list("/Users/name/Desktop/PDB/training_set_idlist")

    # Loop over that list.
    for ids in idlist:
        # Strips the '\n' from the id and adds '.pssm';
        # you now have something like 'd1d0qa_.fasta.pssm'.
        # You never use this.
        part2 = ids.rstrip() + '.pssm'

        # Was 'if os.path.isfile(infile) == True:' but should be:
        if infile.is_file():

            # Strips the '\n' from the id and adds '.profile';
            # you now have something like 'd1d0qa_.fasta.profile'.
            ofile = ids.rstrip() + '.profile'

            # Here is where it becomes a bit weird.
            # In relevant_lines you say:
            # Takes list (extracted from a .pssm file) and extracts the Sequence Profile Portion only.
            # Is infile a .pssm file?
            # Is this correct?
            profile_list = relevant_lines(infile)

            # This seems fine: it writes the normalized data to ofile.
            # ofile will be something like 'd1d0qa_.fasta.profile'.
            write_normalized_profile(profile_list, ofile)

解决方案:

if __name__ == '__main__':
    # Entry point: for every id in the id list, read '<id>.pssm' from the
    # pssm directory and write the normalized profile to '<id>.profile'.

    # The directory that is searched for the '<id>.pssm' files.
    # NOTE(review): confirm this is the folder that actually holds them.
    pssm_directory = Path('/Users/name/Desktop/PDB/training_set_pssm/idlist/')

    # One id per line, each still carrying its trailing newline.
    idlist = lines_to_list("/Users/name/Desktop/PDB/training_set_idlist")

    for ids in idlist:
        file_id = ids.rstrip()  # strip once, reuse for both file names
        if not file_id:
            continue  # a blank line would otherwise look up a bare '.pssm'

        infile = pssm_directory.joinpath(file_id + '.pssm')  # generate filename from id
        if not infile.is_file():  # check if filename exists
            # Report the miss instead of skipping it silently, so a wrong
            # directory or id shows up as a message rather than as no output.
            print("Error file: " + str(infile) + " not found.")
            continue

        ofile = file_id + '.profile'
        profile_list = relevant_lines(infile)
        write_normalized_profile(profile_list, ofile)
if __name__ == '__main__':
    # The walkthrough again, with the asker's replies added as
    # '# Reply:' comments. The code itself is left as discussed.

    # infile is a directory containing files named
    # d1s7za_.fasta.pssm
    # d1s98a_.fasta.pssm
    # d1s99a_.fasta.pssm
    # ...
    infile = Path('/Users/name/Desktop/PDB/training_set_pssm/')

    # this returns a list of strings presumably in the form of
    # d1ciya2.fasta\n
    # d1ciya3.fasta\n
    # d1cq3a_.fasta\n
    idlist = lines_to_list("/Users/name/Desktop/PDB/training_set_idlist")

    # loop over that list
    for ids in idlist:
        # strips the '\n' from the id and adds '.pssm'
        # you now have something like 'd1d0qa_.fasta.pssm'
        # Reply: exactly, now it matches the file to be extracted from the directory
        # you never use this
        # Reply: I think this should have been concatenated to the infile.
        part2 = ids.rstrip() + '.pssm'

        # was 'if os.path.isfile(infile) == True:' but should be :
        if infile.is_file():

            # strips the '\n' from the id and adds '.profile'
            # you now have something like 'd1d0qa_.fasta.profile'
            # Reply: yes, these will be the output file names
            ofile = ids.rstrip() + '.profile'

            # here is where it becomes a bit weird
            # in relevant_lines you say:
            # Takes list or lines of data (extracted from a .pssm file) and extracts the Sequence Profile Portion only.
            # is infile a .pssm file?
            # Reply: yes it is, it is the actual data file from the directory; well, it should be
            # is this correct?
            profile_list = relevant_lines(infile)

            # this seems fine, it writes the normalized data to ofile.
            # ofile will be something like 'd1d0qa_.fasta.profile'
            # Reply: yes
            write_normalized_profile(profile_list, ofile)