在 Python 中用正则表达式匹配多行文本块
Matching regular expression to multiple line blocks in python
我在用 Python 正则表达式匹配多行文本时遇到了一些困难。
文件(doc.txt):
<sec name="M_20_K40745170" sound_freq="mhr17:7907527-7907589" tension="SGCGSCGSCGSCGSC" s_c="0">
<feature number="5748">
<tfgt v="0.1466469683747654" y="0.0" units="sec"/>
</feature>
<mwan sound_freq="mhr17:7907527-7907589" first_name="g7tty" description="xyz">
<xyz abc="trt" id="abc"/>
<per fre="accessions" value="abc"/>
<per fre="xyz" value="abc"/>
<per fre="yy" value="abc"/>
<per fre="psc" value="abc"/>
<per fre="ttt" value="1"/>
<per fre="xyz" value="abc"/>
<per fre="Volum_5Kb" value="89.00"/>
<per fre="Volum_40Kb" value="00.00"/>
<per fre="Volum_70Kb" value="77.00"/>
</mwan>
</com>
我要提取或匹配:
<sec name="M_20_K40745170" sound_freq="mhr17:7907527-7907589" tension="SGCGSCGSCGSCGSC" s_c="0">
pattern = re.compile(r'<sec name="(\D_\d\d_\w+)"\s+sound_freq="(\D\D\D\d+:\d+-\d+)"')
<per fre="Volum_5Kb" value="89.00"/>
<per fre="Volum_40Kb" value="00.00"/>
<per fre="Volum_70Kb" value="77.00"/>
pattern = re.compile(r'<sec name="(\D_\d\d_\w+)"\s+sound_freq="(\D\D\D\d+:\d+-\d+)"')
分别使用上面这两个不同的模式时,都能正常匹配。
下面是我尝试用一个模式同时匹配这两部分内容的代码:
import re
# Asker's attempt, kept exactly as posted: one combined pattern is applied
# to each line of the file separately.
infile = open("/home/doc.txt")  # opened without `with`; no close() appears in this snippet
np_array_values = []  # not used anywhere in the visible snippet
# The pattern spans from the <sec ...> tag to a later <per .../> tag, so it can
# only match text that contains both tags. It also ends with an extra `"` after
# `/>`, which the sample <per .../> lines do not have.
pattern = re.compile(r'<sec name="(\D_\d\d_\w+)"\s+sound_freq="(\D\D\D\d+:\d+-\d+).*<per fre="(Volum_.*)"\s+value="(\d+.\d+)"/>"',re.DOTALL|re.MULTILINE)
fn_list = infile.readlines()
for line in fn_list:
# Each `line` is a single line of the file; in the sample file no single line
# holds both a <sec ...> and a <per .../> tag, so search() returns None here.
search_obj = re.search(pattern, line)
print(search_obj)
if search_obj:
matching_group = search_obj.groups()
print(matching_group)
我怎样才能让它工作?
解决这个问题的方法有很多,这里给出其中一种。我把提取到的信息存入字典,最终输出是一个由字典组成的列表。
def parse_doc(filename):
    """Extract the ``<sec>`` header and the ``Volum_*Kb`` values from a file.

    The file is scanned line by line. A line containing a ``<sec name=...
    sound_freq=...>`` tag contributes a dict with the keys ``'sec'`` and
    ``'sound_freq'``; a line containing a ``<per fre="Volum_<n>Kb"
    value="<number>"`` tag contributes a one-item dict mapping the ``fre``
    name to the value string. Lines matching neither tag are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of dicts, one per matching line, in file order. All values
        are returned as strings exactly as they appear in the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    sec_pattern = re.compile(
        r'<sec name="(\D_\d\d_\w+)"\s+sound_freq="(\D\D\D\d+:\d+-\d+)"'
    )
    # `\.` matches a literal dot and the value stops at its own closing
    # quote, so later attributes on the same line are not swallowed.
    volume_pattern = re.compile(
        r'<per fre="(Volum_\d+Kb)"\s+value="(\d+(?:\.\d+)?)"'
    )
    doc = []
    with open(filename) as f:
        for text in f:
            # search() rather than match(): the tags may be indented.
            sec = sec_pattern.search(text)
            volume = volume_pattern.search(text)
            entry = {}
            if sec:
                entry.update({'sec': sec.group(1), 'sound_freq': sec.group(2)})
            if volume:
                entry[volume.group(1)] = volume.group(2)
            if entry:
                doc.append(entry)
    return doc
print(parse_doc('doc.txt'))
输出
[{'sec': 'M_20_K40745170', 'sound_freq': 'mhr17:7907527-7907589'}, {'Volum_5Kb': '89.00'}, {'Volum_40Kb': '00.00'}, {'Volum_70Kb': '77.00'}]
如果想获取所有属性的值,可以使用下面的方法:
def parse_doc_all(filename):
    """Collect every ``name="value"`` attribute found in a file into one dict.

    The file is scanned line by line and each attribute is stored under its
    name. When the same attribute name occurs more than once, the last
    occurrence wins, so repeated tags (for example several ``<per fre=...
    value=...>`` lines) leave only their final values in the result.

    Args:
        filename: Path of the text file to read.

    Returns:
        A dict mapping attribute names to their value strings, in order of
        first appearance.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The value runs up to the next double quote, so values containing
    # spaces and empty values are captured, and adjacent attributes are not
    # merged. Names may contain ':', '.' and '-' as XML attribute names do.
    attribute = re.compile(r'([\w:.-]+)="([^"]*)"')
    doc = {}
    with open(filename) as f:
        for text in f:
            # findall() yields (name, value) pairs, which update() accepts.
            doc.update(attribute.findall(text))
    return doc
print(parse_doc_all('doc.txt'))
输出如下(注意:重复出现的属性名,例如 fre 和 value,只会保留最后一次出现的值):
{'name': 'M_20_K40745170', 'sound_freq': 'mhr17:7907527-7907589', 'tension': 'SGCGSCGSCGSCGSC', 's_c': '0', 'number': '5748', 'v': '0.1466469683747654', 'y': '0.0', 'units': 'sec', 'first_name': 'g7tty', 'description': 'xyz', 'abc': 'trt', 'id': 'abc', 'fre': 'Volum_70Kb', 'value': '77.00'}
我在用 Python 正则表达式匹配多行文本时遇到了一些困难。
文件(doc.txt):
<sec name="M_20_K40745170" sound_freq="mhr17:7907527-7907589" tension="SGCGSCGSCGSCGSC" s_c="0">
<feature number="5748">
<tfgt v="0.1466469683747654" y="0.0" units="sec"/>
</feature>
<mwan sound_freq="mhr17:7907527-7907589" first_name="g7tty" description="xyz">
<xyz abc="trt" id="abc"/>
<per fre="accessions" value="abc"/>
<per fre="xyz" value="abc"/>
<per fre="yy" value="abc"/>
<per fre="psc" value="abc"/>
<per fre="ttt" value="1"/>
<per fre="xyz" value="abc"/>
<per fre="Volum_5Kb" value="89.00"/>
<per fre="Volum_40Kb" value="00.00"/>
<per fre="Volum_70Kb" value="77.00"/>
</mwan>
</com>
我要提取或匹配:
<sec name="M_20_K40745170" sound_freq="mhr17:7907527-7907589" tension="SGCGSCGSCGSCGSC" s_c="0">
pattern = re.compile(r'<sec name="(\D_\d\d_\w+)"\s+sound_freq="(\D\D\D\d+:\d+-\d+)"')
<per fre="Volum_5Kb" value="89.00"/>
<per fre="Volum_40Kb" value="00.00"/>
<per fre="Volum_70Kb" value="77.00"/>
pattern = re.compile(r'<sec name="(\D_\d\d_\w+)"\s+sound_freq="(\D\D\D\d+:\d+-\d+)"')
分别使用上面这两个不同的模式时,都能正常匹配。下面是我尝试用一个模式同时匹配这两部分内容的代码:
import re
# Asker's attempt, kept exactly as posted: one combined pattern is applied
# to each line of the file separately.
infile = open("/home/doc.txt")  # opened without `with`; no close() appears in this snippet
np_array_values = []  # not used anywhere in the visible snippet
# The pattern spans from the <sec ...> tag to a later <per .../> tag, so it can
# only match text that contains both tags. It also ends with an extra `"` after
# `/>`, which the sample <per .../> lines do not have.
pattern = re.compile(r'<sec name="(\D_\d\d_\w+)"\s+sound_freq="(\D\D\D\d+:\d+-\d+).*<per fre="(Volum_.*)"\s+value="(\d+.\d+)"/>"',re.DOTALL|re.MULTILINE)
fn_list = infile.readlines()
for line in fn_list:
# Each `line` is a single line of the file; in the sample file no single line
# holds both a <sec ...> and a <per .../> tag, so search() returns None here.
search_obj = re.search(pattern, line)
print(search_obj)
if search_obj:
matching_group = search_obj.groups()
print(matching_group)
我怎样才能让它工作?
解决这个问题的方法有很多,这里给出其中一种。我把提取到的信息存入字典,最终输出是一个由字典组成的列表。
def parse_doc(filename):
    """Extract the ``<sec>`` header and the ``Volum_*Kb`` values from a file.

    The file is scanned line by line. A line containing a ``<sec name=...
    sound_freq=...>`` tag contributes a dict with the keys ``'sec'`` and
    ``'sound_freq'``; a line containing a ``<per fre="Volum_<n>Kb"
    value="<number>"`` tag contributes a one-item dict mapping the ``fre``
    name to the value string. Lines matching neither tag are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of dicts, one per matching line, in file order. All values
        are returned as strings exactly as they appear in the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    sec_pattern = re.compile(
        r'<sec name="(\D_\d\d_\w+)"\s+sound_freq="(\D\D\D\d+:\d+-\d+)"'
    )
    # `\.` matches a literal dot and the value stops at its own closing
    # quote, so later attributes on the same line are not swallowed.
    volume_pattern = re.compile(
        r'<per fre="(Volum_\d+Kb)"\s+value="(\d+(?:\.\d+)?)"'
    )
    doc = []
    with open(filename) as f:
        for text in f:
            # search() rather than match(): the tags may be indented.
            sec = sec_pattern.search(text)
            volume = volume_pattern.search(text)
            entry = {}
            if sec:
                entry.update({'sec': sec.group(1), 'sound_freq': sec.group(2)})
            if volume:
                entry[volume.group(1)] = volume.group(2)
            if entry:
                doc.append(entry)
    return doc
print(parse_doc('doc.txt'))
输出
[{'sec': 'M_20_K40745170', 'sound_freq': 'mhr17:7907527-7907589'}, {'Volum_5Kb': '89.00'}, {'Volum_40Kb': '00.00'}, {'Volum_70Kb': '77.00'}]
如果想获取所有属性的值,可以使用下面的方法:
def parse_doc_all(filename):
    """Collect every ``name="value"`` attribute found in a file into one dict.

    The file is scanned line by line and each attribute is stored under its
    name. When the same attribute name occurs more than once, the last
    occurrence wins, so repeated tags (for example several ``<per fre=...
    value=...>`` lines) leave only their final values in the result.

    Args:
        filename: Path of the text file to read.

    Returns:
        A dict mapping attribute names to their value strings, in order of
        first appearance.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The value runs up to the next double quote, so values containing
    # spaces and empty values are captured, and adjacent attributes are not
    # merged. Names may contain ':', '.' and '-' as XML attribute names do.
    attribute = re.compile(r'([\w:.-]+)="([^"]*)"')
    doc = {}
    with open(filename) as f:
        for text in f:
            # findall() yields (name, value) pairs, which update() accepts.
            doc.update(attribute.findall(text))
    return doc
print(parse_doc_all('doc.txt'))
输出如下(注意:重复出现的属性名,例如 fre 和 value,只会保留最后一次出现的值):
{'name': 'M_20_K40745170', 'sound_freq': 'mhr17:7907527-7907589', 'tension': 'SGCGSCGSCGSCGSC', 's_c': '0', 'number': '5748', 'v': '0.1466469683747654', 'y': '0.0', 'units': 'sec', 'first_name': 'g7tty', 'description': 'xyz', 'abc': 'trt', 'id': 'abc', 'fre': 'Volum_70Kb', 'value': '77.00'}