如何在 Python 中使用正则表达式读取可选的整数元组?
How to read an optional tuple of integers using a regex with python?
我尝试实现一个正则表达式来读取如下行:
* DCH : 0.80000000 *
* PYR : 100.00000000 *
* Bond ( 1, 0) : 0.80000000 *
* Angle ( 1, 0, 2) : 100.00000000 *
为此,我编写了以下正则表达式。它能正常工作,但我想就获取括号中整数的方式征求一些反馈。在上面的第 3 行和第 4 行中,括号之间的整数部分(一种整数元组)是可选的。
我必须定义几个组才能将整数元组定义为可选,并处理该元组可能包含 2、3 或 4 个整数的情况。
In [64]: coord_patt = re.compile(r"\s+(\w+)\s+(\(((\s*\d+),?){2,4}\))?\s+:\s+(\d+.\d+)")
In [65]: line2 = "* Angle ( 1, 0, 2) : 100.00000000 *"
In [66]: m = coord_patt.search(line2)
In [67]: m.groups()
Out[67]: ('Angle', '( 1, 0, 2)', ' 2', ' 2', '100.00000000')
另一个例子:
In [68]: line = " * Bond ( 1, 0) : 0.80000000 *"
In [69]: m = coord_patt.search(line)
In [71]: m.groups()
Out[71]: ('Bond', '( 1, 0)', ' 0', ' 0', '0.80000000')
如您所见,它能正常工作,但我不明白为什么在这些组中,我只得到最后一个整数,而不是分别得到每个整数?有没有一种方法可以单独获取这些整数,或者避免定义所有这些组,只捕获第 2 组?第 2 组是元组的字符串形式,可以很容易地用其他方式解析。
正如《Capturing repeating subpatterns in Python regex》一文所指出的,`re` 模块不支持重复捕获,但 `regex` 模块支持。
这里有两种解决方案:一种基于 `regex`;另一种基于 `re`,并在遇到元组时对其进行安全求值(`ast.literal_eval`)。
设置
# Sample input: one record per line; only some records carry the optional
# parenthesised integer tuple before the colon.
txt = r"""* DCH : 0.80000000 *
* PYR : 100.00000000 *
* Bond ( 1, 0) : 0.80000000 *
* Angle ( 1, 0, 2) : 100.00000000 *
"""
使用regex
import regex
# Record layout: NAME, optional parenthesised tuple of 2-4 integers, ':' and a
# decimal value. Group 2 sits inside the repeated sub-pattern; the `regex`
# module remembers every repetition, so m.captures(2) returns all integers.
# `\s*` (not `\s+`) around the optional tuple lets single-space lines without a
# tuple match, and the escaped `\.` accepts only a literal decimal point.
p = regex.compile(r'\s+(\w+)\s*(?:\((?:\s*(\d+),?){2,4}\))?\s*:\s*(\d+\.\d+)')
for s in txt.splitlines():
    if m := p.search(s):
        name = m.group(1)
        # captures(2) is an empty list when the tuple is absent -> empty tuple.
        tup = tuple(int(k) for k in m.captures(2))
        val = float(m.group(3))
        print(f'{name!r}\t{tup!r}\t{val!r}')
打印:
'DCH' () 0.8
'PYR' () 100.0
'Bond' (1, 0) 0.8
'Angle' (1, 0, 2) 100.0
使用re
import re
import ast
# Record layout: NAME, optional parenthesised tuple of 2-4 integers, ':' and a
# decimal value. The tuple is captured whole (group 2) because `re` keeps only
# the last repetition of a repeated group.
# `\s*` (not `\s+`) around the optional tuple lets single-space lines without a
# tuple match, and the escaped `\.` accepts only a literal decimal point.
p = re.compile(r'\s+(\w+)\s*(\((?:\s*\d+,?){2,4}\))?\s*:\s*(\d+\.\d+)')
for s in txt.splitlines():
    if m := p.search(s):
        name, tup, val = m.groups()
        # literal_eval only evaluates Python literals, so it is safe to apply
        # to the captured "( 1, 0)" text; a missing tuple becomes ().
        tup = ast.literal_eval(tup) if tup is not None else ()
        val = float(val)
        print(f'{name!r}\t{tup!r}\t{val!r}')
打印:
'DCH' () 0.8
'PYR' () 100.0
'Bond' (1, 0) 0.8
'Angle' (1, 0, 2) 100.0
要将括号内的整数(当前为 `string`)转换为 `int`,您必须先将该字符串转换为 `tuple`。
下面的代码将所有元组存储在一个列表中,您稍后可以检索该列表以对其进行操作。
import re
from ast import literal_eval as make_tuple
# Sample records; only some of them carry the optional "( i, j, ...)" tuple.
lines = [
    "* DCH : 0.80000000 *",
    "* PYR : 100.00000000 *",
    " Bond ( 1, 0) : 0.80000000 *",
    "* Angle ( 1, 0, 2) : 100.00000000 *",
]

# Group 1: name, group 2: the whole parenthesised tuple (optional),
# group 3: the decimal value. The inner repetition is non-capturing because
# `re` would only keep its last iteration anyway.
# `\s*` (not `\s+`) around the optional tuple lets single-space lines without a
# tuple match, and the escaped `\.` accepts only a literal decimal point.
coord_patt = re.compile(r"\s+(\w+)\s*(\((?:\s*\d+,?){2,4}\))?\s*:\s*(\d+\.\d+)")

tuples = []
for line in lines:
    temp = coord_patt.search(line)
    # Skip lines that do not match at all as well as records without a tuple;
    # calling .group() on a failed search would raise AttributeError.
    if temp is None or temp.group(2) is None:
        continue
    tuples.append(make_tuple(temp.group(2)))
print(tuples)

# Print each tuple on its own line, elements separated by a space.
for tup in tuples:
    for element in tup:
        print(element, end=' ')
    print()
这是输出
Output:
[(1, 0), (1, 0, 2)]
1 0
1 0 2
这可能比您正在寻找的要复杂得多,但我还是想把它展示出来,作为可以添加到您“工具箱”中、用于处理更复杂情况的东西,因为它实际上是一个自顶向下(top-down)解析器,因此能够处理无法用正则表达式定义的语言。
from typing import NamedTuple
import re
# Sample input text: one record per line, some with a parenthesised integer
# list between the name and the colon.
lines = """
DCH : 0.80000000
PYR : 100.00000000
Bond ( 1, 0) : 0.80000000
Angle ( 1, 0, 2) : 100.00000000
"""
class Token(NamedTuple):
    """A lexical token: the name of the rule that matched plus its value."""

    # Rule name such as 'WORD' or 'INT'; 'EOF' marks the end of the input.
    type: str
    # Matched text. FLOAT and INT tokens are created with the already converted
    # number, so the annotation is a union (written as a string so that it
    # needs no extra import and works on older Python versions).
    value: 'str | int | float'
# Lexer rules as (token name, regex) pairs. They are joined into a single
# alternation below, so when several rules could match at the same position
# the one listed first wins.
token_specification = [
    # A word must start with a letter; a plain `\w+` would also match numbers.
    ('WORD', r'[a-zA-Z]\w*'),
    ('COLON', r':'),
    ('LPAREN', r'\('),
    ('RPAREN', r'\)'),
    # FLOAT is listed before INT so that "0.8" is not tokenized as an INT.
    ('FLOAT', r'\d+\.\d+'),
    ('INT', r'\d+'),
    ('COMMA', r','),
    ('SKIP', r'\s+'),
    # Catch-all for any other single character.
    ('ERROR', r'.'),
]
# One named group per rule; Match.lastgroup then reports which rule matched.
tok_regex = re.compile(
    '|'.join(f'(?P<{name}>{pattern})' for name, pattern in token_specification)
)
def generate_tokens(code):
    """Yield the tokens found in *code*, followed by a final EOF token.

    Whitespace (SKIP) is dropped. FLOAT and INT tokens carry their value
    converted to ``float`` / ``int``; every other token carries the matched
    text unchanged.
    """
    # finditer() is the documented way to walk successive matches. With the
    # token rules of this file it yields exactly the matches that stepping a
    # scanner with match() would: every character is covered by some rule
    # (SKIP takes whitespace, ERROR any other character), so the search never
    # has to skip over input.
    converters = {'FLOAT': float, 'INT': int}
    for m in tok_regex.finditer(code):
        kind = m.lastgroup  # name of the rule that matched
        if kind == 'SKIP':
            continue
        text = m.group()
        convert = converters.get(kind)
        yield Token(kind, convert(text) if convert else text)
    yield Token('EOF', 'EOF')  # end of string
class ParseError(Exception):
    """Raised when the token stream does not follow the expected grammar."""


class Evaluator:
    """Top-down parser for records of the form ``WORD [( INT {, INT} )] : FLOAT``."""

    def parse(self, s):
        """Yield ``(word, int_list, float_value)`` for each record in *s*.

        Parsing is best-effort: at the first syntax error a warning is logged
        and the generator stops; records yielded before that remain valid.
        Only ParseError is handled here, so genuine programming errors are no
        longer hidden.
        """
        import logging  # local import keeps this block self-contained

        self.token_iterator = generate_tokens(s)
        self.next_token()
        try:
            while self.token.type != 'EOF':  # not end of string
                yield self.evaluate()
        except ParseError as err:
            logging.getLogger(__name__).warning('parsing stopped early: %s', err)

    def evaluate(self):
        """Parse one record and return ``(word, int_list, float_value)``.

        Raises ParseError if the tokens do not form a complete record.
        """
        word_value = self._take('WORD')
        i_list = self.optional_int_list()
        self.accept('COLON')
        float_value = self._take('FLOAT')
        return word_value, i_list, float_value

    def optional_int_list(self):
        """Parse an optional ``( INT {, INT} )`` list; return [] when absent."""
        i_list = []
        if self.token.type != 'LPAREN':
            return i_list
        self.next_token()
        # At least one integer is required after the opening parenthesis.
        i_list.append(self._take('INT'))
        while self.token.type == 'COMMA':
            self.next_token()
            i_list.append(self._take('INT'))
        self.accept('RPAREN')
        return i_list

    def next_token(self):
        """Advance to the next token (None once the iterator is exhausted)."""
        self.token = next(self.token_iterator, None)

    def accept(self, type):
        """Consume the current token, which must be of the given type.

        Raises ParseError (a subclass of Exception) on a mismatch.
        """
        if self.token.type != type:
            raise ParseError(f'Error: was expecting a {type}, got {self.token.type}')
        self.next_token()

    def _take(self, kind):
        """Return the current token's value after checking its type and advancing."""
        value = self.token.value
        self.accept(kind)
        return value
# Run the parser over the sample text and print the three fields of every
# record separated by spaces.
evaluator = Evaluator()
for record in evaluator.parse(lines):
    word_value, integer_values, float_value = record
    print(word_value, integer_values, float_value)
打印:
DCH [] 0.8
PYR [] 100.0
Bond [1, 0] 0.8
Angle [1, 0, 2] 100.0
我尝试实现一个正则表达式来读取如下行:
* DCH : 0.80000000 *
* PYR : 100.00000000 *
* Bond ( 1, 0) : 0.80000000 *
* Angle ( 1, 0, 2) : 100.00000000 *
为此,我编写了以下正则表达式。它能正常工作,但我想就获取括号中整数的方式征求一些反馈。在上面的第 3 行和第 4 行中,括号之间的整数部分(一种整数元组)是可选的。
我必须定义几个组才能将整数元组定义为可选,并处理该元组可能包含 2、3 或 4 个整数的情况。
In [64]: coord_patt = re.compile(r"\s+(\w+)\s+(\(((\s*\d+),?){2,4}\))?\s+:\s+(\d+.\d+)")
In [65]: line2 = "* Angle ( 1, 0, 2) : 100.00000000 *"
In [66]: m = coord_patt.search(line2)
In [67]: m.groups()
Out[67]: ('Angle', '( 1, 0, 2)', ' 2', ' 2', '100.00000000')
另一个例子:
In [68]: line = " * Bond ( 1, 0) : 0.80000000 *"
In [69]: m = coord_patt.search(line)
In [71]: m.groups()
Out[71]: ('Bond', '( 1, 0)', ' 0', ' 0', '0.80000000')
如您所见,它能正常工作,但我不明白为什么在这些组中,我只得到最后一个整数,而不是分别得到每个整数?有没有一种方法可以单独获取这些整数,或者避免定义所有这些组,只捕获第 2 组?第 2 组是元组的字符串形式,可以很容易地用其他方式解析。
正如《Capturing repeating subpatterns in Python regex》一文所指出的,`re` 模块不支持重复捕获,但 `regex` 模块支持。
这里有两种解决方案:一种基于 `regex`;另一种基于 `re`,并在遇到元组时对其进行安全求值(`ast.literal_eval`)。
设置
# Sample input: one record per line; only some records carry the optional
# parenthesised integer tuple before the colon.
txt = r"""* DCH : 0.80000000 *
* PYR : 100.00000000 *
* Bond ( 1, 0) : 0.80000000 *
* Angle ( 1, 0, 2) : 100.00000000 *
"""
使用regex
import regex
# Record layout: NAME, optional parenthesised tuple of 2-4 integers, ':' and a
# decimal value. Group 2 sits inside the repeated sub-pattern; the `regex`
# module remembers every repetition, so m.captures(2) returns all integers.
# `\s*` (not `\s+`) around the optional tuple lets single-space lines without a
# tuple match, and the escaped `\.` accepts only a literal decimal point.
p = regex.compile(r'\s+(\w+)\s*(?:\((?:\s*(\d+),?){2,4}\))?\s*:\s*(\d+\.\d+)')
for s in txt.splitlines():
    if m := p.search(s):
        name = m.group(1)
        # captures(2) is an empty list when the tuple is absent -> empty tuple.
        tup = tuple(int(k) for k in m.captures(2))
        val = float(m.group(3))
        print(f'{name!r}\t{tup!r}\t{val!r}')
打印:
'DCH' () 0.8
'PYR' () 100.0
'Bond' (1, 0) 0.8
'Angle' (1, 0, 2) 100.0
使用re
import re
import ast
# Record layout: NAME, optional parenthesised tuple of 2-4 integers, ':' and a
# decimal value. The tuple is captured whole (group 2) because `re` keeps only
# the last repetition of a repeated group.
# `\s*` (not `\s+`) around the optional tuple lets single-space lines without a
# tuple match, and the escaped `\.` accepts only a literal decimal point.
p = re.compile(r'\s+(\w+)\s*(\((?:\s*\d+,?){2,4}\))?\s*:\s*(\d+\.\d+)')
for s in txt.splitlines():
    if m := p.search(s):
        name, tup, val = m.groups()
        # literal_eval only evaluates Python literals, so it is safe to apply
        # to the captured "( 1, 0)" text; a missing tuple becomes ().
        tup = ast.literal_eval(tup) if tup is not None else ()
        val = float(val)
        print(f'{name!r}\t{tup!r}\t{val!r}')
打印:
'DCH' () 0.8
'PYR' () 100.0
'Bond' (1, 0) 0.8
'Angle' (1, 0, 2) 100.0
要将括号内的整数(当前为 `string`)转换为 `int`,您必须先将该字符串转换为 `tuple`。
下面的代码将所有元组存储在一个列表中,您稍后可以检索该列表以对其进行操作。
import re
from ast import literal_eval as make_tuple
# Sample records; only some of them carry the optional "( i, j, ...)" tuple.
lines = [
    "* DCH : 0.80000000 *",
    "* PYR : 100.00000000 *",
    " Bond ( 1, 0) : 0.80000000 *",
    "* Angle ( 1, 0, 2) : 100.00000000 *",
]

# Group 1: name, group 2: the whole parenthesised tuple (optional),
# group 3: the decimal value. The inner repetition is non-capturing because
# `re` would only keep its last iteration anyway.
# `\s*` (not `\s+`) around the optional tuple lets single-space lines without a
# tuple match, and the escaped `\.` accepts only a literal decimal point.
coord_patt = re.compile(r"\s+(\w+)\s*(\((?:\s*\d+,?){2,4}\))?\s*:\s*(\d+\.\d+)")

tuples = []
for line in lines:
    temp = coord_patt.search(line)
    # Skip lines that do not match at all as well as records without a tuple;
    # calling .group() on a failed search would raise AttributeError.
    if temp is None or temp.group(2) is None:
        continue
    tuples.append(make_tuple(temp.group(2)))
print(tuples)

# Print each tuple on its own line, elements separated by a space.
for tup in tuples:
    for element in tup:
        print(element, end=' ')
    print()
这是输出
Output:
[(1, 0), (1, 0, 2)]
1 0
1 0 2
这可能比您正在寻找的要复杂得多,但我还是想把它展示出来,作为可以添加到您“工具箱”中、用于处理更复杂情况的东西,因为它实际上是一个自顶向下(top-down)解析器,因此能够处理无法用正则表达式定义的语言。
from typing import NamedTuple
import re
# Sample input text: one record per line, some with a parenthesised integer
# list between the name and the colon.
lines = """
DCH : 0.80000000
PYR : 100.00000000
Bond ( 1, 0) : 0.80000000
Angle ( 1, 0, 2) : 100.00000000
"""
class Token(NamedTuple):
    """A lexical token: the name of the rule that matched plus its value."""

    # Rule name such as 'WORD' or 'INT'; 'EOF' marks the end of the input.
    type: str
    # Matched text. FLOAT and INT tokens are created with the already converted
    # number, so the annotation is a union (written as a string so that it
    # needs no extra import and works on older Python versions).
    value: 'str | int | float'
# Lexer rules as (token name, regex) pairs. They are joined into a single
# alternation below, so when several rules could match at the same position
# the one listed first wins.
token_specification = [
    # A word must start with a letter; a plain `\w+` would also match numbers.
    ('WORD', r'[a-zA-Z]\w*'),
    ('COLON', r':'),
    ('LPAREN', r'\('),
    ('RPAREN', r'\)'),
    # FLOAT is listed before INT so that "0.8" is not tokenized as an INT.
    ('FLOAT', r'\d+\.\d+'),
    ('INT', r'\d+'),
    ('COMMA', r','),
    ('SKIP', r'\s+'),
    # Catch-all for any other single character.
    ('ERROR', r'.'),
]
# One named group per rule; Match.lastgroup then reports which rule matched.
tok_regex = re.compile(
    '|'.join(f'(?P<{name}>{pattern})' for name, pattern in token_specification)
)
def generate_tokens(code):
    """Yield the tokens found in *code*, followed by a final EOF token.

    Whitespace (SKIP) is dropped. FLOAT and INT tokens carry their value
    converted to ``float`` / ``int``; every other token carries the matched
    text unchanged.
    """
    # finditer() is the documented way to walk successive matches. With the
    # token rules of this file it yields exactly the matches that stepping a
    # scanner with match() would: every character is covered by some rule
    # (SKIP takes whitespace, ERROR any other character), so the search never
    # has to skip over input.
    converters = {'FLOAT': float, 'INT': int}
    for m in tok_regex.finditer(code):
        kind = m.lastgroup  # name of the rule that matched
        if kind == 'SKIP':
            continue
        text = m.group()
        convert = converters.get(kind)
        yield Token(kind, convert(text) if convert else text)
    yield Token('EOF', 'EOF')  # end of string
class ParseError(Exception):
    """Raised when the token stream does not follow the expected grammar."""


class Evaluator:
    """Top-down parser for records of the form ``WORD [( INT {, INT} )] : FLOAT``."""

    def parse(self, s):
        """Yield ``(word, int_list, float_value)`` for each record in *s*.

        Parsing is best-effort: at the first syntax error a warning is logged
        and the generator stops; records yielded before that remain valid.
        Only ParseError is handled here, so genuine programming errors are no
        longer hidden.
        """
        import logging  # local import keeps this block self-contained

        self.token_iterator = generate_tokens(s)
        self.next_token()
        try:
            while self.token.type != 'EOF':  # not end of string
                yield self.evaluate()
        except ParseError as err:
            logging.getLogger(__name__).warning('parsing stopped early: %s', err)

    def evaluate(self):
        """Parse one record and return ``(word, int_list, float_value)``.

        Raises ParseError if the tokens do not form a complete record.
        """
        word_value = self._take('WORD')
        i_list = self.optional_int_list()
        self.accept('COLON')
        float_value = self._take('FLOAT')
        return word_value, i_list, float_value

    def optional_int_list(self):
        """Parse an optional ``( INT {, INT} )`` list; return [] when absent."""
        i_list = []
        if self.token.type != 'LPAREN':
            return i_list
        self.next_token()
        # At least one integer is required after the opening parenthesis.
        i_list.append(self._take('INT'))
        while self.token.type == 'COMMA':
            self.next_token()
            i_list.append(self._take('INT'))
        self.accept('RPAREN')
        return i_list

    def next_token(self):
        """Advance to the next token (None once the iterator is exhausted)."""
        self.token = next(self.token_iterator, None)

    def accept(self, type):
        """Consume the current token, which must be of the given type.

        Raises ParseError (a subclass of Exception) on a mismatch.
        """
        if self.token.type != type:
            raise ParseError(f'Error: was expecting a {type}, got {self.token.type}')
        self.next_token()

    def _take(self, kind):
        """Return the current token's value after checking its type and advancing."""
        value = self.token.value
        self.accept(kind)
        return value
# Run the parser over the sample text and print the three fields of every
# record separated by spaces.
evaluator = Evaluator()
for record in evaluator.parse(lines):
    word_value, integer_values, float_value = record
    print(word_value, integer_values, float_value)
打印:
DCH [] 0.8
PYR [] 100.0
Bond [1, 0] 0.8
Angle [1, 0, 2] 100.0