从另一个位置具有基因 ID 的 fasta 文件中提取 DNA 序列
to extract dna sequence from a fasta file with gene ids in another location
我创建了一个小程序来从 fasta 文件中提取选定的 ids + 序列。感兴趣的 id 是包含该基因的多个序列的文件名。这是节目:
import glob, sys, os
from Bio import SeqIO, SearchIO
from Bio.SeqRecord import SeqRecord
import argparse
def help_function():
print """
usage: to_extract_seq_and_id.py [-h] [-i input_file:path to data] [-r reference file: path_to_file ]
[-o output_directory: path_to_store_new_file] """
parser = argparse.ArgumentParser()
parser.add_argument('-input_files', '-i',type=str,help='path_to_data')
parser.add_argument('-reference', '-r',type=str,help='path_to_the_fasta_reference_file')
parser.add_argument('-output_directory','-o', type=str, help='path_to_store_new_file')
opts = parser.parse_args()
#first function to check if the files exits.
def check_file_exists(filepath, file_description):
if not os.path.exists(filepath):
print("The " + file_description + " (" + filepath + ") does not exist")
sys.exit(1)
else:
print file_description + " detected"
def record_extraction(geneID,reference,output):
records=list(SeqIO.parse(opts.reference,'fasta'))
new_reference=output + '/new_reference_common_genes.fa'
output_handle=open(new_reference, 'a')
with open (opts.reference, 'rU') as input_handle:
for record in records:
recordID=record.id
if recordID == geneID:
SeqIO.write(record, output_handle, 'fasta')
else:
continue
return new_reference
def main():
if len(sys.argv) <=2:
parser.print_help()
sys.exit()
else:
check_file_exists(opts.input_files, 'input_files')
check_file_exists(opts.reference, 'reference_file')
check_file_exists(opts.output_directory, 'output_directory')
files=(glob.glob(opts.input_files + '/*.fa'))
for f in files:
database_files=glob.glob(f)[0]
file_name=os.path.basename(f)
gene_id=file_name.split('.')
gene_name=gene_id[0].split('_')
geneID=gene_name[1] + '_' + gene_name[2]
print 'The new reference fasta file has been create'
new_reference=record_extraction(geneID,opts.reference,opts.output_directory)
main()
我有一个包含 900 个基因的 fasta 文件,我想提取并写入另一个文件中的 700 个。程序运行正常,但只将一个基因及其序列写入新文件。我不明白为什么那个循环 运行 只有一次。最初有人可以帮我解决这个问题吗?
提前致谢。
问题有点不清楚,但也许这就是您要找的:
results = []
for record in records:
recordID=record.id #.split('_')
if recordID == geneID:
results.append(record)
else:
continue
SeqIO.write(" ".join(text for text in results), output_handle, 'fasta')
return new_reference
如果这不是您要找的。请详细说明您的问题和解决方案。
我遇到的问题是缩进问题。如果你看一下上面的代码,你会发现 def record_extraction()
没有在主函数的 for 循环 中调用 (def main() )。我已经更改了缩进,现在它确实可以正常工作。
见上面的新脚本:
import glob, sys, os
from Bio import SeqIO, SearchIO
from Bio.SeqRecord import SeqRecord
import argparse
def help_function():
print """
usage: to_extract_seq_and_id.py [-h] [-i input_file:path to data] [-r reference file: path_to_file ]
[-o output_directory: path_to_store_new_file] """
parser = argparse.ArgumentParser()
parser.add_argument('-input_files', '-i',type=str,help='path_to_data')
parser.add_argument('-reference', '-r',type=str,help='path_to_the_fasta_reference_file')
parser.add_argument('-output_directory','-o', type=str, help='path_to_store_new_file')
opts = parser.parse_args()
#first function to check if the files exits.
def check_file_exists(filepath, file_description):
if not os.path.exists(filepath):
print("The " + file_description + " (" + filepath + ") does not exist")
sys.exit(1)
else:
print file_description + " detected"
def record_extraction(geneID,reference,output,genelist):
records=list(SeqIO.parse(opts.reference,'fasta'))
new_reference=output + '/new_reference_common_genes.fa'
output_handle=open(new_reference, 'a')
with open (opts.reference, 'rU') as input_handle:
for record in records:
recordid=record.id.split('_')
recordID=recordid[0]+'_'+recordid[1]
if recordID in genelist:
SeqIO.write(record, output_handle, 'fasta')
else:
continue
return new_reference
def main():
if len(sys.argv) <=2:
parser.print_help()
sys.exit()
else:
check_file_exists(opts.input_files, 'input_files')
check_file_exists(opts.reference, 'reference_file')
check_file_exists(opts.output_directory, 'output_directory')
#run the programme
files=(glob.glob(opts.input_files + '/*.fa'))
for f in files:
database_files=glob.glob(f)[0]
file_name=os.path.basename(f)
gene_id=file_name.split('.')
gene_name=gene_id[0].split('_')
geneID=gene_name[1]+'_'+gene_name[2]
genelist=[]
if geneID not in genelist:
genelist.append(geneID)
new_reference=record_extraction(geneID,opts.reference,opts.output_directory,genelist)
print 'The new reference fasta file has been create'
main()
我创建了一个小程序来从 fasta 文件中提取选定的 ids + 序列。感兴趣的 id 是包含该基因的多个序列的文件名。这是节目:
import glob, sys, os
from Bio import SeqIO, SearchIO
from Bio.SeqRecord import SeqRecord
import argparse
def help_function():
print """
usage: to_extract_seq_and_id.py [-h] [-i input_file:path to data] [-r reference file: path_to_file ]
[-o output_directory: path_to_store_new_file] """
parser = argparse.ArgumentParser()
parser.add_argument('-input_files', '-i',type=str,help='path_to_data')
parser.add_argument('-reference', '-r',type=str,help='path_to_the_fasta_reference_file')
parser.add_argument('-output_directory','-o', type=str, help='path_to_store_new_file')
opts = parser.parse_args()
#first function to check if the files exits.
def check_file_exists(filepath, file_description):
if not os.path.exists(filepath):
print("The " + file_description + " (" + filepath + ") does not exist")
sys.exit(1)
else:
print file_description + " detected"
def record_extraction(geneID,reference,output):
records=list(SeqIO.parse(opts.reference,'fasta'))
new_reference=output + '/new_reference_common_genes.fa'
output_handle=open(new_reference, 'a')
with open (opts.reference, 'rU') as input_handle:
for record in records:
recordID=record.id
if recordID == geneID:
SeqIO.write(record, output_handle, 'fasta')
else:
continue
return new_reference
def main():
if len(sys.argv) <=2:
parser.print_help()
sys.exit()
else:
check_file_exists(opts.input_files, 'input_files')
check_file_exists(opts.reference, 'reference_file')
check_file_exists(opts.output_directory, 'output_directory')
files=(glob.glob(opts.input_files + '/*.fa'))
for f in files:
database_files=glob.glob(f)[0]
file_name=os.path.basename(f)
gene_id=file_name.split('.')
gene_name=gene_id[0].split('_')
geneID=gene_name[1] + '_' + gene_name[2]
print 'The new reference fasta file has been create'
new_reference=record_extraction(geneID,opts.reference,opts.output_directory)
main()
我有一个包含 900 个基因的 fasta 文件,我想提取并写入另一个文件中的 700 个。程序运行正常,但只将一个基因及其序列写入新文件。我不明白为什么那个循环 运行 只有一次。最初有人可以帮我解决这个问题吗? 提前致谢。
问题有点不清楚,但也许这就是您要找的:
results = []
for record in records:
recordID=record.id #.split('_')
if recordID == geneID:
results.append(record)
else:
continue
SeqIO.write(" ".join(text for text in results), output_handle, 'fasta')
return new_reference
如果这不是您要找的。请详细说明您的问题和解决方案。
我遇到的问题是缩进问题。如果你看一下上面的代码,你会发现 def record_extraction()
没有在主函数的 for 循环 中调用 (def main() )。我已经更改了缩进,现在它确实可以正常工作。
见上面的新脚本:
import glob, sys, os
from Bio import SeqIO, SearchIO
from Bio.SeqRecord import SeqRecord
import argparse
def help_function():
print """
usage: to_extract_seq_and_id.py [-h] [-i input_file:path to data] [-r reference file: path_to_file ]
[-o output_directory: path_to_store_new_file] """
parser = argparse.ArgumentParser()
parser.add_argument('-input_files', '-i',type=str,help='path_to_data')
parser.add_argument('-reference', '-r',type=str,help='path_to_the_fasta_reference_file')
parser.add_argument('-output_directory','-o', type=str, help='path_to_store_new_file')
opts = parser.parse_args()
#first function to check if the files exits.
def check_file_exists(filepath, file_description):
if not os.path.exists(filepath):
print("The " + file_description + " (" + filepath + ") does not exist")
sys.exit(1)
else:
print file_description + " detected"
def record_extraction(geneID,reference,output,genelist):
records=list(SeqIO.parse(opts.reference,'fasta'))
new_reference=output + '/new_reference_common_genes.fa'
output_handle=open(new_reference, 'a')
with open (opts.reference, 'rU') as input_handle:
for record in records:
recordid=record.id.split('_')
recordID=recordid[0]+'_'+recordid[1]
if recordID in genelist:
SeqIO.write(record, output_handle, 'fasta')
else:
continue
return new_reference
def main():
if len(sys.argv) <=2:
parser.print_help()
sys.exit()
else:
check_file_exists(opts.input_files, 'input_files')
check_file_exists(opts.reference, 'reference_file')
check_file_exists(opts.output_directory, 'output_directory')
#run the programme
files=(glob.glob(opts.input_files + '/*.fa'))
for f in files:
database_files=glob.glob(f)[0]
file_name=os.path.basename(f)
gene_id=file_name.split('.')
gene_name=gene_id[0].split('_')
geneID=gene_name[1]+'_'+gene_name[2]
genelist=[]
if geneID not in genelist:
genelist.append(geneID)
new_reference=record_extraction(geneID,opts.reference,opts.output_directory,genelist)
print 'The new reference fasta file has been create'
main()