输出是一个空文件
Output is an empty file
我的代码没有抛出任何错误,它只是创建了输出文件,但这些文件都是空的。我在命令行中用通配符路径 training_set_pssm/*.pssm 试过,脚本可以运行;但我必须从 IDE 中执行它,况且无论用哪种方式,它都没有输出正确的结果。
输入文件是一组检查点文件,如下所示:
这个文件是一个保存为 .pssm 的文本文件。本质上,我只需要提取位于右侧的 PROFILE(序列谱)部分,并同时对其进行归一化……我的代码似乎没有正确做到这一点,而从 IDE 运行时则完全不起作用,所以我不确定此时需要修改脚本中的哪些地方才能实现这一点。
代码如下:
#!/usr/bin/env python3
import sys
import os.path
from pathlib import Path
def pssm_list(infile):
    """Return the body lines of a .pssm file.

    The file is read completely and the first 3 and the last 6 lines are
    dropped; each remaining line is returned unchanged (it keeps its
    trailing newline). Presumably the dropped lines are the header and the
    footer of the checkpoint file -- confirm against a real input file.

    Args:
        infile: Path of the .pssm text file to read.

    Returns:
        A list of strings, one per remaining line. The list is empty when
        the file has 9 lines or fewer.
    """
    with open(infile) as pssm_file:
        return pssm_file.readlines()[3:-6]
def lines_to_list(infile1):
    """Return every line of a text file as a list of strings.

    Each element keeps its trailing newline character, so callers that use
    the lines as file names must strip them first.

    Args:
        infile1: Path of the text file to read.

    Returns:
        A list with one string per line; empty for an empty file.
    """
    with open(infile1, 'r') as rfile:
        return rfile.readlines()
def relevant_lines(infile2):
    """Extract the sequence-profile columns from a .pssm file.

    The body lines of the file are obtained with ``pssm_list``; every line
    is split on whitespace and the fields with index 22 through 41 (20
    columns) are kept.

    Args:
        infile2: Path of the .pssm file.

    Returns:
        A list of lists of strings, one inner list per body line. An inner
        list is shorter (possibly empty) when its line has fewer than 42
        whitespace-separated fields.
    """
    # A separate name is used for the per-line fields so the list being
    # iterated is never rebound inside the loop.
    return [line.split()[22:42] for line in pssm_list(infile2)]
# # divide all values by 100
def write_normalized_profile(profile_final_list, ofile):
    """Write a profile matrix to a file with every value divided by 100.

    Each element is converted with ``int`` and divided by 100; the result
    is written followed by a tab. A newline is written after each row, so
    every output line ends with a trailing tab.

    Args:
        profile_final_list: Iterable of rows, each an iterable of values
            accepted by ``int`` (e.g. numeric strings).
        ofile: Path of the output file.

    Raises:
        ValueError: If an element cannot be converted by ``int``.
    """
    # NOTE: the file is opened in append mode, so running the script again
    # adds the rows to an existing file instead of replacing it.
    with open(ofile, "a") as wfile:
        for sublist in profile_final_list:
            for el in sublist:
                wfile.write(str(int(el) / 100) + '\t')
            wfile.write("\n")
if __name__ == '__main__':
    # NOTE(review): this is the script exactly as asked about; only the lost
    # indentation has been restored. `infile` is never combined with the
    # per-id file name (`part2`), so the same path is tested and read for
    # every id. The answer below addresses this.
    # infile = sys.argv[1]
    infile = ('/Users/name/Desktop/PDB/training_set_pssm/idlist/')  # the idlist to loop on
    # Call the function by looping through an id list + '.pssm' extension;
    # name the outfile the same --> id + '.profile'.
    # The id list contains the id of each file but NOT the extension ".pssm".
    idlist = lines_to_list("/Users/name/Desktop/PDB/training_set_idlist")
    for ids in idlist:
        # Remove the newline character and add the necessary extension.
        part2 = ids.rstrip() + '.pssm'
        if os.path.isfile(infile) == True:  # does this file exist
            # Outfile for each id with the correct extension.
            ofile = ids.rstrip() + '.profile'
            profile_list = relevant_lines(infile)
            write_normalized_profile(profile_list, ofile)
        else:
            print("Error file: " + infile + " not found.")
首先让我们修正你的路径:你导入了 `from pathlib import Path`,但从未使用过它。
让我们声明 `infile = Path('/Users/name/Desktop/PDB/training_set_pssm/idlist/')`,这样就可以使用 Path 对象提供的一些有用方法来定位问题。
尝试其中一些以确保您在正确的位置进行搜索。
# Returns the absolute path; print or inspect it to check that it is the location you expect.
infile.absolute()
# Returns True if this path exists.
infile.exists()
# Returns True if this path is a file.
infile.is_file()
让我们从头开始
我将尝试逐行解释您的代码中发生的事情。
if __name__ == '__main__':
    # It is unclear what this infile is. Is it a file containing
    #   d1s7za_.fasta.pssm
    #   d1s98a_.fasta.pssm
    #   d1s99a_.fasta.pssm
    # or a directory containing files named
    #   d1s7za_.fasta.pssm
    #   d1s98a_.fasta.pssm
    #   d1s99a_.fasta.pssm
    #   ...
    infile = Path('/Users/name/Desktop/PDB/training_set_pssm/idlist')
    # This returns a list of strings, presumably of the form
    #   d1ciya2.fasta\n
    #   d1ciya3.fasta\n
    #   d1cq3a_.fasta\n
    idlist = lines_to_list("/Users/name/Desktop/PDB/training_set_idlist")
    # Loop over that list.
    for ids in idlist:
        # Strips the '\n' from the id and adds '.pssm'; you now have
        # something like 'd1d0qa_.fasta.pssm' -- but it is never used.
        part2 = ids.rstrip() + '.pssm'
        # Was 'if os.path.isfile(infile) == True:' but should be:
        if infile.is_file():
            # Strips the '\n' from the id and adds '.profile'; you now have
            # something like 'd1d0qa_.fasta.profile'.
            ofile = ids.rstrip() + '.profile'
            # Here is where it becomes a bit odd. The docstring of
            # relevant_lines says it takes a list extracted from a .pssm
            # file and extracts the sequence profile portion only.
            # Is infile a .pssm file? Is this correct?
            profile_list = relevant_lines(infile)
            # This seems fine: it writes the normalized data to ofile,
            # which will be something like 'd1d0qa_.fasta.profile'.
            write_normalized_profile(profile_list, ofile)
解决方案:
if __name__ == '__main__':
    # Directory that holds the <id>.pssm input files. According to the
    # asker's clarification the files sit directly in training_set_pssm,
    # so the path must not end in 'idlist/'.
    pssm_directory = Path('/Users/name/Desktop/PDB/training_set_pssm/')
    idlist = lines_to_list("/Users/name/Desktop/PDB/training_set_idlist")
    for ids in idlist:
        file_id = ids.rstrip()  # drop the newline kept by readlines()
        if not file_id:
            continue  # ignore blank lines in the id list
        infile = pssm_directory.joinpath(file_id + '.pssm')  # file name built from the id
        if infile.is_file():  # only process ids whose .pssm file exists
            ofile = file_id + '.profile'
            profile_list = relevant_lines(infile)
            write_normalized_profile(profile_list, ofile)
        else:
            # Report missing inputs instead of skipping them silently.
            print("Error file: " + str(infile) + " not found.")
if __name__ == '__main__':
    # NOTE(review): annotated copy of the walkthrough. Remarks marked
    # "(asker)" are the asker's replies; they were inline bold text and are
    # kept as comments so the block parses. The logic is unchanged: infile
    # is still the directory, so is_file() is False for it.
    # infile is a directory containing files named
    #   d1s7za_.fasta.pssm
    #   d1s98a_.fasta.pssm
    #   d1s99a_.fasta.pssm
    #   ...
    infile = Path('/Users/name/Desktop/PDB/training_set_pssm/')
    # This returns a list of strings, presumably of the form
    #   d1ciya2.fasta\n
    #   d1ciya3.fasta\n
    #   d1cq3a_.fasta\n
    idlist = lines_to_list("/Users/name/Desktop/PDB/training_set_idlist")
    # Loop over that list.
    for ids in idlist:
        # Strips the '\n' from the id and adds '.pssm'; you now have
        # something like 'd1d0qa_.fasta.pssm'.
        # (asker) Exactly, now it matches the file to be extracted from the
        # directory.
        # You never use this.
        # (asker) I think this should have been concatenated to the infile.
        part2 = ids.rstrip() + '.pssm'
        # Was 'if os.path.isfile(infile) == True:' but should be:
        if infile.is_file():
            # Strips the '\n' from the id and adds '.profile'; you now have
            # something like 'd1d0qa_.fasta.profile'.
            # (asker) Yes, these will be the output file names.
            ofile = ids.rstrip() + '.profile'
            # Here is where it becomes a bit odd. The docstring of
            # relevant_lines says it takes a list or lines of data
            # (extracted from a .pssm file) and extracts the sequence
            # profile portion only. Is infile a .pssm file?
            # (asker) Yes it is, it is the actual data file from the
            # directory -- well, it should be.
            # Is this correct?
            profile_list = relevant_lines(infile)
            # This seems fine: it writes the normalized data to ofile,
            # which will be something like 'd1d0qa_.fasta.profile'.
            # (asker) Yes.
            write_normalized_profile(profile_list, ofile)
我的代码没有抛出任何错误,它只是创建了输出文件,但这些文件都是空的。我在命令行中用通配符路径 training_set_pssm/*.pssm 试过,脚本可以运行;但我必须从 IDE 中执行它,况且无论用哪种方式,它都没有输出正确的结果。
输入文件是一组检查点文件,如下所示:
这个文件是一个保存为 .pssm 的文本文件。本质上,我只需要提取位于右侧的 PROFILE(序列谱)部分,并同时对其进行归一化……我的代码似乎没有正确做到这一点,而从 IDE 运行时则完全不起作用,所以我不确定此时需要修改脚本中的哪些地方才能实现这一点。
代码如下:
#!/usr/bin/env python3
import sys
import os.path
from pathlib import Path
def pssm_list(infile):
    """Return the body lines of a .pssm file.

    The file is read completely and the first 3 and the last 6 lines are
    dropped; each remaining line is returned unchanged (it keeps its
    trailing newline). Presumably the dropped lines are the header and the
    footer of the checkpoint file -- confirm against a real input file.

    Args:
        infile: Path of the .pssm text file to read.

    Returns:
        A list of strings, one per remaining line. The list is empty when
        the file has 9 lines or fewer.
    """
    with open(infile) as pssm_file:
        return pssm_file.readlines()[3:-6]
def lines_to_list(infile1):
    """Return every line of a text file as a list of strings.

    Each element keeps its trailing newline character, so callers that use
    the lines as file names must strip them first.

    Args:
        infile1: Path of the text file to read.

    Returns:
        A list with one string per line; empty for an empty file.
    """
    with open(infile1, 'r') as rfile:
        return rfile.readlines()
def relevant_lines(infile2):
    """Extract the sequence-profile columns from a .pssm file.

    The body lines of the file are obtained with ``pssm_list``; every line
    is split on whitespace and the fields with index 22 through 41 (20
    columns) are kept.

    Args:
        infile2: Path of the .pssm file.

    Returns:
        A list of lists of strings, one inner list per body line. An inner
        list is shorter (possibly empty) when its line has fewer than 42
        whitespace-separated fields.
    """
    # A separate name is used for the per-line fields so the list being
    # iterated is never rebound inside the loop.
    return [line.split()[22:42] for line in pssm_list(infile2)]
# # divide all values by 100
def write_normalized_profile(profile_final_list, ofile):
    """Write a profile matrix to a file with every value divided by 100.

    Each element is converted with ``int`` and divided by 100; the result
    is written followed by a tab. A newline is written after each row, so
    every output line ends with a trailing tab.

    Args:
        profile_final_list: Iterable of rows, each an iterable of values
            accepted by ``int`` (e.g. numeric strings).
        ofile: Path of the output file.

    Raises:
        ValueError: If an element cannot be converted by ``int``.
    """
    # NOTE: the file is opened in append mode, so running the script again
    # adds the rows to an existing file instead of replacing it.
    with open(ofile, "a") as wfile:
        for sublist in profile_final_list:
            for el in sublist:
                wfile.write(str(int(el) / 100) + '\t')
            wfile.write("\n")
if __name__ == '__main__':
    # NOTE(review): this is the script exactly as asked about; only the lost
    # indentation has been restored. `infile` is never combined with the
    # per-id file name (`part2`), so the same path is tested and read for
    # every id. The answer below addresses this.
    # infile = sys.argv[1]
    infile = ('/Users/name/Desktop/PDB/training_set_pssm/idlist/')  # the idlist to loop on
    # Call the function by looping through an id list + '.pssm' extension;
    # name the outfile the same --> id + '.profile'.
    # The id list contains the id of each file but NOT the extension ".pssm".
    idlist = lines_to_list("/Users/name/Desktop/PDB/training_set_idlist")
    for ids in idlist:
        # Remove the newline character and add the necessary extension.
        part2 = ids.rstrip() + '.pssm'
        if os.path.isfile(infile) == True:  # does this file exist
            # Outfile for each id with the correct extension.
            ofile = ids.rstrip() + '.profile'
            profile_list = relevant_lines(infile)
            write_normalized_profile(profile_list, ofile)
        else:
            print("Error file: " + infile + " not found.")
首先让我们修正你的路径:你导入了 `from pathlib import Path`,但从未使用过它。
让我们声明 `infile = Path('/Users/name/Desktop/PDB/training_set_pssm/idlist/')`,这样就可以使用 Path 对象提供的一些有用方法来定位问题。
尝试其中一些以确保您在正确的位置进行搜索。
# Returns the absolute path; print or inspect it to check that it is the location you expect.
infile.absolute()
# Returns True if this path exists.
infile.exists()
# Returns True if this path is a file.
infile.is_file()
让我们从头开始 我将尝试逐行解释您的代码中发生的事情。
if __name__ == '__main__':
    # It is unclear what this infile is. Is it a file containing
    #   d1s7za_.fasta.pssm
    #   d1s98a_.fasta.pssm
    #   d1s99a_.fasta.pssm
    # or a directory containing files named
    #   d1s7za_.fasta.pssm
    #   d1s98a_.fasta.pssm
    #   d1s99a_.fasta.pssm
    #   ...
    infile = Path('/Users/name/Desktop/PDB/training_set_pssm/idlist')
    # This returns a list of strings, presumably of the form
    #   d1ciya2.fasta\n
    #   d1ciya3.fasta\n
    #   d1cq3a_.fasta\n
    idlist = lines_to_list("/Users/name/Desktop/PDB/training_set_idlist")
    # Loop over that list.
    for ids in idlist:
        # Strips the '\n' from the id and adds '.pssm'; you now have
        # something like 'd1d0qa_.fasta.pssm' -- but it is never used.
        part2 = ids.rstrip() + '.pssm'
        # Was 'if os.path.isfile(infile) == True:' but should be:
        if infile.is_file():
            # Strips the '\n' from the id and adds '.profile'; you now have
            # something like 'd1d0qa_.fasta.profile'.
            ofile = ids.rstrip() + '.profile'
            # Here is where it becomes a bit odd. The docstring of
            # relevant_lines says it takes a list extracted from a .pssm
            # file and extracts the sequence profile portion only.
            # Is infile a .pssm file? Is this correct?
            profile_list = relevant_lines(infile)
            # This seems fine: it writes the normalized data to ofile,
            # which will be something like 'd1d0qa_.fasta.profile'.
            write_normalized_profile(profile_list, ofile)
解决方案:
if __name__ == '__main__':
    # Directory that holds the <id>.pssm input files. According to the
    # asker's clarification the files sit directly in training_set_pssm,
    # so the path must not end in 'idlist/'.
    pssm_directory = Path('/Users/name/Desktop/PDB/training_set_pssm/')
    idlist = lines_to_list("/Users/name/Desktop/PDB/training_set_idlist")
    for ids in idlist:
        file_id = ids.rstrip()  # drop the newline kept by readlines()
        if not file_id:
            continue  # ignore blank lines in the id list
        infile = pssm_directory.joinpath(file_id + '.pssm')  # file name built from the id
        if infile.is_file():  # only process ids whose .pssm file exists
            ofile = file_id + '.profile'
            profile_list = relevant_lines(infile)
            write_normalized_profile(profile_list, ofile)
        else:
            # Report missing inputs instead of skipping them silently.
            print("Error file: " + str(infile) + " not found.")
if __name__ == '__main__':
    # NOTE(review): annotated copy of the walkthrough. Remarks marked
    # "(asker)" are the asker's replies; they were inline bold text and are
    # kept as comments so the block parses. The logic is unchanged: infile
    # is still the directory, so is_file() is False for it.
    # infile is a directory containing files named
    #   d1s7za_.fasta.pssm
    #   d1s98a_.fasta.pssm
    #   d1s99a_.fasta.pssm
    #   ...
    infile = Path('/Users/name/Desktop/PDB/training_set_pssm/')
    # This returns a list of strings, presumably of the form
    #   d1ciya2.fasta\n
    #   d1ciya3.fasta\n
    #   d1cq3a_.fasta\n
    idlist = lines_to_list("/Users/name/Desktop/PDB/training_set_idlist")
    # Loop over that list.
    for ids in idlist:
        # Strips the '\n' from the id and adds '.pssm'; you now have
        # something like 'd1d0qa_.fasta.pssm'.
        # (asker) Exactly, now it matches the file to be extracted from the
        # directory.
        # You never use this.
        # (asker) I think this should have been concatenated to the infile.
        part2 = ids.rstrip() + '.pssm'
        # Was 'if os.path.isfile(infile) == True:' but should be:
        if infile.is_file():
            # Strips the '\n' from the id and adds '.profile'; you now have
            # something like 'd1d0qa_.fasta.profile'.
            # (asker) Yes, these will be the output file names.
            ofile = ids.rstrip() + '.profile'
            # Here is where it becomes a bit odd. The docstring of
            # relevant_lines says it takes a list or lines of data
            # (extracted from a .pssm file) and extracts the sequence
            # profile portion only. Is infile a .pssm file?
            # (asker) Yes it is, it is the actual data file from the
            # directory -- well, it should be.
            # Is this correct?
            profile_list = relevant_lines(infile)
            # This seems fine: it writes the normalized data to ofile,
            # which will be something like 'd1d0qa_.fasta.profile'.
            # (asker) Yes.
            write_normalized_profile(profile_list, ofile)