解析 txt 文件并根据列表提取特定部分
Parse a txt file and extract specific part depending on list
我需要帮助才能提取 txt.file 的一部分,例如
#some comments
#some comments
#some comments
# Predicted genes for sequence number 1 on both strands
# start gene g1
scaffold_0 AUGUSTUS gene 1268 6647 0.19 - . g1
scaffold_0 AUGUSTUS transcript 1268 6647 0.19 - . g1.t1
scaffold_0 AUGUSTUS stop_codon 1268 1270 . - 0 transcript_id "g1.t1"; gene_id "g1";
scaffold_0 AUGUSTUS intron 1457 6004 0.62 - . transcript_id "g1.t1"; gene_id "g1";
scaffold_0 AUGUSTUS intron 6194 6509 0.42 - . transcript_id "g1.t1"; gene_id "g1";
scaffold_0 AUGUSTUS CDS 1268 1456 0.62 - 0 transcript_id "g1.t1"; gene_id "g1";
scaffold_0 AUGUSTUS CDS 6005 6193 0.52 - 0 transcript_id "g1.t1"; gene_id "g1";
scaffold_0 AUGUSTUS CDS 6510 6647 0.3 - 0 transcript_id "g1.t1"; gene_id "g1";
scaffold_0 AUGUSTUS start_codon 6645 6647 . - 0 transcript_id "g1.t1"; gene_id "g1";
# protein sequence = [MDASLTLHRFRKRGAKRCGISDLKDIRNALRVGEIGPELLQSLATRGLLKKVDCNLVRIWSHMTRQKKKFFEIRVSYH
# HVRTTSKSKLLLVPDTNPDAALQKKMSTRLCTRYGYAITGQRGEIRKIRREMDGQSERQIDGRTHSRAGHLPTDKPSKGAKRCNFSISTFKTV]
# end gene g1
###
# start gene g2
scaffold_0 AUGUSTUS gene 10257 16732 0.03 - . g2
scaffold_0 AUGUSTUS transcript 10257 16732 0.03 - . g2.t1
scaffold_0 AUGUSTUS stop_codon 10257 10259 . - 0 transcript_id "g2.t1"; gene_id "g2";
scaffold_0 AUGUSTUS intron 10476 12506 0.18 - . transcript_id "g2.t1"; gene_id "g2";
scaffold_0 AUGUSTUS intron 12667 15174 0.18 - . transcript_id "g2.t1"; gene_id "g2";
scaffold_0 AUGUSTUS intron 15183 16579 0.2 - . transcript_id "g2.t1"; gene_id "g2";
scaffold_0 AUGUSTUS CDS 10257 10475 0.59 - 0 transcript_id "g2.t1"; gene_id "g2";
scaffold_0 AUGUSTUS CDS 12507 12666 0.48 - 1 transcript_id "g2.t1"; gene_id "g2";
scaffold_0 AUGUSTUS CDS 15175 15182 0.21 - 0 transcript_id "g2.t1"; gene_id "g2";
scaffold_0 AUGUSTUS CDS 16580 16732 0.34 - 0 transcript_id "g2.t1"; gene_id "g2";
scaffold_0 AUGUSTUS start_codon 16730 16732 . - 0 transcript_id "g2.t1"; gene_id "g2";
# protein sequence = [MLAASRYSRRSRKISRVQHWDVNDSRDISACNSLSSEKAPSVTRREWHEMRLLSRSRDRFADCLRTVAARVRGPIKPS
# PCSGRQCETVRWRIENGPLLTDQLAVFAQVRLALASTTSGQHPSGDNVGNDRYATRLYLLTSRGTSLVHRGNELKFDRRRWTNLSTILVYANEILRCE
# EGW]
# end gene g2
###
# start gene g3
scaffold_0 AUGUSTUS gene 21625 24883 0.55 - . g3
scaffold_0 AUGUSTUS transcript 21625 24883 0.55 - . g3.t1
scaffold_0 AUGUSTUS stop_codon 21625 21627 . - 0 transcript_id "g3.t1"; gene_id "g3";
scaffold_0 AUGUSTUS intron 21830 22738 0.68 - . transcript_id "g3.t1"; gene_id "g3";
scaffold_0 AUGUSTUS intron 22760 22851 0.61 - . transcript_id "g3.t1"; gene_id "g3";
scaffold_0 AUGUSTUS intron 22954 24665 0.89 - . transcript_id "g3.t1"; gene_id "g3";
scaffold_0 AUGUSTUS CDS 21625 21829 0.91 - 1 transcript_id "g3.t1"; gene_id "g3";
scaffold_0 AUGUSTUS CDS 22739 22759 0.68 - 1 transcript_id "g3.t1"; gene_id "g3";
scaffold_0 AUGUSTUS CDS 22852 22953 0.84 - 1 transcript_id "g3.t1"; gene_id "g3";
scaffold_0 AUGUSTUS CDS 24666 24883 0.84 - 0 transcript_id "g3.t1"; gene_id "g3";
scaffold_0 AUGUSTUS start_codon 24881 24883 . - 0 transcript_id "g3.t1"; gene_id "g3";
# protein sequence = [MEYSQIFSPKLQVLSTKERLLTGNRTNISLENTRLGASIELGNPIHKWKPVAEQGSLGIEPSDGSSSLHDLPLYFRTR
# CETSNASTSDRIRKLTIREGKGFDDSQMTCMLIVDKPPKKTLTTLDKEWKEISIRMTTKVISRCDMAANEEDEDSSESIYHAPFEDAPCTVRTSLNFD
# TSALE]
# end gene g3
###
所以首先我有一个列表,例如 list_gene_names=['g1','g3']
我想遍历该列表,提取蛋白质序列(方括号之间的部分),并将它们写入名为 sequence_file.fa 的文件中。
到目前为止我试过了:
# Sketch of the intended loop: write one FASTA record per requested gene.
# The input is opened for reading; mode 'w' would truncate
# file_to_parse.txt before anything could be read from it.
with open('file_to_parse.txt', 'r') as file_to_parse, \
        open("sequence_file.fa", 'a') as sequence_file_output:
    for names in list_gene_names:
        # TODO: locate the part between '# start gene <names>' and
        # '# end gene <names>', then take the text between the brackets
        # that follows '# protein sequence =' as part_between_brackets.
        print('>', names, file=sequence_file_output, sep='')
        print(part_between_brackets, file=sequence_file_output)
最后文件 sequence_file.fa
应该如下所示:
>g1
MDASLTLHRFRKRGAKRCGISDLKDIRNALRVGEIGPELLQSLATRGLLKKVDCNLVRIWSHMTRQKKKFFEIRVSYHHVRTTSKSKLLLVPDTNPDAALQKKMSTRLCTRYGYAITGQRGEIRKIRREMDGQSERQIDGRTHSRAGHLPTDKPSKGAKRCNFSISTFKTV
>g3
MEYSQIFSPKLQVLSTKERLLTGNRTNISLENTRLGASIELGNPIHKWKPVAEQGSLGIEPSDGSSSLHDLPLYFRTRCETSNASTSDRIRKLTIREGKGFDDSQMTCMLIVDKPPKKTLTTLDKEWKEISIRMTTKVISRCDMAANEEDEDSSESIYHAPFEDAPCTVRTSLNFDTSALE
感谢您的帮助和时间
您可以使用正则表达式(模块 re)。您必须捕获 # protein sequence = [ 之后的所有文本。
试试这个代码(这才刚刚开始):
import re
# The literal "[" has to be escaped: left bare it opens a character class and
# the pattern fails to compile.
# NOTE: because of the trailing "$", this captures only the first line of a
# sequence that wraps onto following lines; it is a starting point only.
regex = r"^# protein sequence = \[([A-Z]+)$"
l = re.findall(regex, content_file, re.M)
正则表达式是一种方式:
import re
# Group 1: the gene name after "# start gene"; group 2: everything between
# the first "[" that follows and the next "]".
regex = r"# start gene(.+?)$[^\[]*\[([^\]]*)\]"


def extract_proteins(text, wanted=("g1", "g2")):
    """Return ``(gene_name, protein_sequence)`` pairs found in *text*.

    Args:
        text: Full content of the annotation file.
        wanted: Gene names to keep; names are compared for exact equality.

    Returns:
        A list of ``(name, sequence)`` tuples in file order. Each sequence
        has the "#" markers, spaces and line breaks removed.
    """
    wanted = set(wanted)
    proteins = []
    for match in re.finditer(regex, text, re.MULTILINE | re.DOTALL):
        name = match.group(1).strip()
        # Exact membership test: a substring test against "g1,g2" would also
        # accept names such as "g" or "1".
        if name not in wanted:
            continue
        # The bracketed text spans several "# "-prefixed lines; strip the
        # comment markers, spaces and newlines to get one contiguous string.
        sequence = (
            match.group(2).replace("#", "").replace(" ", "").replace("\n", "")
        )
        proteins.append((name, sequence))
    return proteins


if __name__ == "__main__":
    # Read the annotation file itself, not the FASTA output file.
    with open("file_to_parse.txt") as f:
        test_str = f.read()

    with open("result.txt", "w") as f:
        for name, sequence in extract_proteins(test_str):
            f.write(name + "\n")
            f.write(sequence + "\n")

    with open("result.txt") as f:
        print(f.read())
输出(未应用 g1,g2 过滤时):
g1
MDASLTLHRFRKRGAKRCGISDLKDIRNALRVGEIGPELLQSLATRGLLKKVDCNLVRIWSHMTRQKKKFFEIRVSYHHVRTTSKSKLLLVPDTNPDAALQKKMSTRLCTRYGYAITGQRGEIRKIRREMDGQSERQIDGRTHSRAGHLPTDKPSKGAKRCNFSISTFKTV
g2
MLAASRYSRRSRKISRVQHWDVNDSRDISACNSLSSEKAPSVTRREWHEMRLLSRSRDRFADCLRTVAARVRGPIKPSPCSGRQCETVRWRIENGPLLTDQLAVFAQVRLALASTTSGQHPSGDNVGNDRYATRLYLLTSRGTSLVHRGNELKFDRRRWTNLSTILVYANEILRCEEGW
g3
MEYSQIFSPKLQVLSTKERLLTGNRTNISLENTRLGASIELGNPIHKWKPVAEQGSLGIEPSDGSSSLHDLPLYFRTRCETSNASTSDRIRKLTIREGKGFDDSQMTCMLIVDKPPKKTLTTLDKEWKEISIRMTTKVISRCDMAANEEDEDSSESIYHAPFEDAPCTVRTSLNFDTSALE
正则表达式解释:
该模式查找 '# start gene' 并捕获其后的基因名称。然后它会跳过第一个 '[' 之前的所有内容,并捕获从该 '[' 到下一个 ']' 之间的所有文本。
要写入文件,由于行拆分和其中的 # 符号,您需要对捕获的数据进行一些后处理。
我需要帮助才能提取 txt.file 的一部分,例如
#some comments
#some comments
#some comments
# Predicted genes for sequence number 1 on both strands
# start gene g1
scaffold_0 AUGUSTUS gene 1268 6647 0.19 - . g1
scaffold_0 AUGUSTUS transcript 1268 6647 0.19 - . g1.t1
scaffold_0 AUGUSTUS stop_codon 1268 1270 . - 0 transcript_id "g1.t1"; gene_id "g1";
scaffold_0 AUGUSTUS intron 1457 6004 0.62 - . transcript_id "g1.t1"; gene_id "g1";
scaffold_0 AUGUSTUS intron 6194 6509 0.42 - . transcript_id "g1.t1"; gene_id "g1";
scaffold_0 AUGUSTUS CDS 1268 1456 0.62 - 0 transcript_id "g1.t1"; gene_id "g1";
scaffold_0 AUGUSTUS CDS 6005 6193 0.52 - 0 transcript_id "g1.t1"; gene_id "g1";
scaffold_0 AUGUSTUS CDS 6510 6647 0.3 - 0 transcript_id "g1.t1"; gene_id "g1";
scaffold_0 AUGUSTUS start_codon 6645 6647 . - 0 transcript_id "g1.t1"; gene_id "g1";
# protein sequence = [MDASLTLHRFRKRGAKRCGISDLKDIRNALRVGEIGPELLQSLATRGLLKKVDCNLVRIWSHMTRQKKKFFEIRVSYH
# HVRTTSKSKLLLVPDTNPDAALQKKMSTRLCTRYGYAITGQRGEIRKIRREMDGQSERQIDGRTHSRAGHLPTDKPSKGAKRCNFSISTFKTV]
# end gene g1
###
# start gene g2
scaffold_0 AUGUSTUS gene 10257 16732 0.03 - . g2
scaffold_0 AUGUSTUS transcript 10257 16732 0.03 - . g2.t1
scaffold_0 AUGUSTUS stop_codon 10257 10259 . - 0 transcript_id "g2.t1"; gene_id "g2";
scaffold_0 AUGUSTUS intron 10476 12506 0.18 - . transcript_id "g2.t1"; gene_id "g2";
scaffold_0 AUGUSTUS intron 12667 15174 0.18 - . transcript_id "g2.t1"; gene_id "g2";
scaffold_0 AUGUSTUS intron 15183 16579 0.2 - . transcript_id "g2.t1"; gene_id "g2";
scaffold_0 AUGUSTUS CDS 10257 10475 0.59 - 0 transcript_id "g2.t1"; gene_id "g2";
scaffold_0 AUGUSTUS CDS 12507 12666 0.48 - 1 transcript_id "g2.t1"; gene_id "g2";
scaffold_0 AUGUSTUS CDS 15175 15182 0.21 - 0 transcript_id "g2.t1"; gene_id "g2";
scaffold_0 AUGUSTUS CDS 16580 16732 0.34 - 0 transcript_id "g2.t1"; gene_id "g2";
scaffold_0 AUGUSTUS start_codon 16730 16732 . - 0 transcript_id "g2.t1"; gene_id "g2";
# protein sequence = [MLAASRYSRRSRKISRVQHWDVNDSRDISACNSLSSEKAPSVTRREWHEMRLLSRSRDRFADCLRTVAARVRGPIKPS
# PCSGRQCETVRWRIENGPLLTDQLAVFAQVRLALASTTSGQHPSGDNVGNDRYATRLYLLTSRGTSLVHRGNELKFDRRRWTNLSTILVYANEILRCE
# EGW]
# end gene g2
###
# start gene g3
scaffold_0 AUGUSTUS gene 21625 24883 0.55 - . g3
scaffold_0 AUGUSTUS transcript 21625 24883 0.55 - . g3.t1
scaffold_0 AUGUSTUS stop_codon 21625 21627 . - 0 transcript_id "g3.t1"; gene_id "g3";
scaffold_0 AUGUSTUS intron 21830 22738 0.68 - . transcript_id "g3.t1"; gene_id "g3";
scaffold_0 AUGUSTUS intron 22760 22851 0.61 - . transcript_id "g3.t1"; gene_id "g3";
scaffold_0 AUGUSTUS intron 22954 24665 0.89 - . transcript_id "g3.t1"; gene_id "g3";
scaffold_0 AUGUSTUS CDS 21625 21829 0.91 - 1 transcript_id "g3.t1"; gene_id "g3";
scaffold_0 AUGUSTUS CDS 22739 22759 0.68 - 1 transcript_id "g3.t1"; gene_id "g3";
scaffold_0 AUGUSTUS CDS 22852 22953 0.84 - 1 transcript_id "g3.t1"; gene_id "g3";
scaffold_0 AUGUSTUS CDS 24666 24883 0.84 - 0 transcript_id "g3.t1"; gene_id "g3";
scaffold_0 AUGUSTUS start_codon 24881 24883 . - 0 transcript_id "g3.t1"; gene_id "g3";
# protein sequence = [MEYSQIFSPKLQVLSTKERLLTGNRTNISLENTRLGASIELGNPIHKWKPVAEQGSLGIEPSDGSSSLHDLPLYFRTR
# CETSNASTSDRIRKLTIREGKGFDDSQMTCMLIVDKPPKKTLTTLDKEWKEISIRMTTKVISRCDMAANEEDEDSSESIYHAPFEDAPCTVRTSLNFD
# TSALE]
# end gene g3
###
所以首先我有一个列表,例如 list_gene_names=['g1','g3']
我想遍历该列表,提取蛋白质序列(方括号之间的部分),并将它们写入名为 sequence_file.fa 的文件中。
到目前为止我试过了:
# Sketch of the intended loop: write one FASTA record per requested gene.
# The input is opened for reading; mode 'w' would truncate
# file_to_parse.txt before anything could be read from it.
with open('file_to_parse.txt', 'r') as file_to_parse, \
        open("sequence_file.fa", 'a') as sequence_file_output:
    for names in list_gene_names:
        # TODO: locate the part between '# start gene <names>' and
        # '# end gene <names>', then take the text between the brackets
        # that follows '# protein sequence =' as part_between_brackets.
        print('>', names, file=sequence_file_output, sep='')
        print(part_between_brackets, file=sequence_file_output)
最后文件 sequence_file.fa
应该如下所示:
>g1
MDASLTLHRFRKRGAKRCGISDLKDIRNALRVGEIGPELLQSLATRGLLKKVDCNLVRIWSHMTRQKKKFFEIRVSYHHVRTTSKSKLLLVPDTNPDAALQKKMSTRLCTRYGYAITGQRGEIRKIRREMDGQSERQIDGRTHSRAGHLPTDKPSKGAKRCNFSISTFKTV
>g3
MEYSQIFSPKLQVLSTKERLLTGNRTNISLENTRLGASIELGNPIHKWKPVAEQGSLGIEPSDGSSSLHDLPLYFRTRCETSNASTSDRIRKLTIREGKGFDDSQMTCMLIVDKPPKKTLTTLDKEWKEISIRMTTKVISRCDMAANEEDEDSSESIYHAPFEDAPCTVRTSLNFDTSALE
感谢您的帮助和时间
您可以使用正则表达式(模块 re)。您必须捕获 # protein sequence = [ 之后的所有文本。
试试这个代码(这才刚刚开始):
import re
# The literal "[" has to be escaped: left bare it opens a character class and
# the pattern fails to compile.
# NOTE: because of the trailing "$", this captures only the first line of a
# sequence that wraps onto following lines; it is a starting point only.
regex = r"^# protein sequence = \[([A-Z]+)$"
l = re.findall(regex, content_file, re.M)
正则表达式是一种方式:
import re
# Group 1: the gene name after "# start gene"; group 2: everything between
# the first "[" that follows and the next "]".
regex = r"# start gene(.+?)$[^\[]*\[([^\]]*)\]"


def extract_proteins(text, wanted=("g1", "g2")):
    """Return ``(gene_name, protein_sequence)`` pairs found in *text*.

    Args:
        text: Full content of the annotation file.
        wanted: Gene names to keep; names are compared for exact equality.

    Returns:
        A list of ``(name, sequence)`` tuples in file order. Each sequence
        has the "#" markers, spaces and line breaks removed.
    """
    wanted = set(wanted)
    proteins = []
    for match in re.finditer(regex, text, re.MULTILINE | re.DOTALL):
        name = match.group(1).strip()
        # Exact membership test: a substring test against "g1,g2" would also
        # accept names such as "g" or "1".
        if name not in wanted:
            continue
        # The bracketed text spans several "# "-prefixed lines; strip the
        # comment markers, spaces and newlines to get one contiguous string.
        sequence = (
            match.group(2).replace("#", "").replace(" ", "").replace("\n", "")
        )
        proteins.append((name, sequence))
    return proteins


if __name__ == "__main__":
    # Read the annotation file itself, not the FASTA output file.
    with open("file_to_parse.txt") as f:
        test_str = f.read()

    with open("result.txt", "w") as f:
        for name, sequence in extract_proteins(test_str):
            f.write(name + "\n")
            f.write(sequence + "\n")

    with open("result.txt") as f:
        print(f.read())
输出(未应用 g1,g2 过滤时):
g1
MDASLTLHRFRKRGAKRCGISDLKDIRNALRVGEIGPELLQSLATRGLLKKVDCNLVRIWSHMTRQKKKFFEIRVSYHHVRTTSKSKLLLVPDTNPDAALQKKMSTRLCTRYGYAITGQRGEIRKIRREMDGQSERQIDGRTHSRAGHLPTDKPSKGAKRCNFSISTFKTV
g2
MLAASRYSRRSRKISRVQHWDVNDSRDISACNSLSSEKAPSVTRREWHEMRLLSRSRDRFADCLRTVAARVRGPIKPSPCSGRQCETVRWRIENGPLLTDQLAVFAQVRLALASTTSGQHPSGDNVGNDRYATRLYLLTSRGTSLVHRGNELKFDRRRWTNLSTILVYANEILRCEEGW
g3
MEYSQIFSPKLQVLSTKERLLTGNRTNISLENTRLGASIELGNPIHKWKPVAEQGSLGIEPSDGSSSLHDLPLYFRTRCETSNASTSDRIRKLTIREGKGFDDSQMTCMLIVDKPPKKTLTTLDKEWKEISIRMTTKVISRCDMAANEEDEDSSESIYHAPFEDAPCTVRTSLNFDTSALE
正则表达式解释:
该模式查找 '# start gene' 并捕获其后的基因名称。然后它会跳过第一个 '[' 之前的所有内容,并捕获从该 '[' 到下一个 ']' 之间的所有文本。
要写入文件,由于行拆分和其中的 # 符号,您需要对捕获的数据进行一些后处理。