解析嵌套的花括号/方括号(brace/bracket)组
Parsing nested brace/bracket groups
我正在尝试解析如下所示的文件:
MSH
[ PD1 ]
[{ ROL }]
[
{ ROL }
]
[
{
PR1
[{ ROL }]
}
]
[
{
IN1
[ IN2 ]
[{ IN3 }]
}
]
[ ACC ]
其中:
- 3 个字母数字代表一个 SEGMENT
- [ SEGMENT ] 表示可选段
- { SEGMENT } 表示一个重复段
- [{ SEGMENT }] 表示一个可选的重复段
- 上述任何形式的 SEGMENT 都可以一起嵌套在
可选 ([]) 和/或 重复 ({}) 组中。
- 上面示例文件中的第 4 - 19 行(从 MSH 所在行算作第 1 行)是嵌套重复组的示例。
理想的结果应该是这样的:
{
"MSH": {
"name": "placeholder",
"opt": false,
"rep": false,
"description": "Plain Segment"
},
"PD1": {
"name": "placeholder",
"opt": true,
"rep": false,
"description": "Optional Segment"
},
// some segments here
"group": {
"opt": true,
"rep": false,
"description": "Optionals group placeholder text",
"segment0": {
"ROL": {
"name": "placeholder",
"opt": false,
"rep": true,
"description": "Repeating Segment"
}
}
}
}
我已经阅读了 SO 和 Pyparsing wiki 上的大部分 pyparsing 帖子,包括 fourFn.py 示例和 regexinverter。我认为我需要使用 infixNotation,
但我不太清楚该如何使用它。
这是我目前拥有的:
# Asker's pyparsing attempt (assumes `import pyparsing as pp`, which the
# snippet does not show).

# Delimiters: [...] marks an optional construct, {...} a repeating one.
lbrack = pp.Literal("[")
rbrack = pp.Literal("]")
lbrace = pp.Literal("{")
rbrace = pp.Literal("}")

# A segment name is exactly three alphanumeric characters.
segment = pp.Word(pp.alphanums, exact=3)

# The three bracketed forms of a single segment.
optsegment = lbrack + segment + rbrack
repsegment = lbrace + segment + rbrace
optrepsegment = lbrack + lbrace + segment + rbrace + rbrack

# Any one segment form, each tagged with a results name.
segments = (segment.setResultsName("RawSegment") |
            optsegment.setResultsName("OptionalSegment") |
            repsegment.setResultsName("RepeatingSegment") |
            optrepsegment.setResultsName("OptionalRepeatingSegment"))

# NOTE: each group wraps exactly ONE `segments` match and does not refer back
# to itself, so groups holding several segments or nested groups do not match.
opt_group = pp.Group(lbrack + segments + rbrack)
rep_group = pp.Group(lbrace + segments + rbrace)
message = pp.Group(segments | opt_group | rep_group)

# Each tuple is (operator, operand count, associativity).
# NOTE: `expr` is built here but never used below.
expr = pp.infixNotation(message,
                        [
                            ('[', 2, pp.opAssoc.LEFT),
                            ('{', 2, pp.opAssoc.LEFT),
                            ('}', 1, pp.opAssoc.RIGHT),
                            (']', 1, pp.opAssoc.RIGHT),
                        ])

# `data` is not defined in this snippet; presumably it holds the text of the
# file to parse. The scan uses `message`, not `expr`.
msg = message.searchString(data)
for item in msg:
    print(item)
我还没有确定最终的输出格式,目前只是想先得到正确的解析结果。
下面是使用 lark 库的代码:
import json
import lark
# Build the lark parser for the segment-definition notation.
#
# Grammar summary:
#   SIMPLE_SEGMENT  three characters, each an underscore, letter or digit
#   o_segment       "[" ... "]"  (optional)
#   r_segment       "{" ... "}"  (repeating)
#   _segment        one or more of the above, so bracketed forms can nest;
#                   the leading underscore makes lark inline this rule into
#                   its parent instead of creating a tree node
# Whitespace (including newlines) is ignored.
l = lark.Lark("""
start: _segment
SIMPLE_SEGMENT: ("_"|LETTER|DIGIT)("_"|LETTER|DIGIT)("_"|LETTER|DIGIT)
o_segment: "["_segment"]"
r_segment: "{"_segment"}"
_segment: (SIMPLE_SEGMENT|o_segment|r_segment)+
%import common.LETTER
%import common.DIGIT
%import common.WS
%ignore WS
""", parser='lalr') # LALR(1) is requested explicitly instead of lark's default parser
class TreeTransformer(lark.Transformer):
    """Convert the parse tree into a JSON-serialisable list of dictionaries.

    ``o_segment`` and ``r_segment`` reduce every bracketed node to a
    ``(kind, payload)`` tuple, where ``kind`` is ``"opt"``, ``"rep"`` or
    ``"rep_opt"`` and ``payload`` is either a single segment name (a string)
    or a tuple of child items (a group). ``start`` turns those items into
    dictionaries.
    """

    @staticmethod
    def _payload(content):
        """Return a lone segment name as-is, anything else as a tuple of items.

        A single bracketed child is deliberately kept wrapped in a tuple so
        that ``start`` iterates over the child item instead of over the
        child's own ``(kind, payload)`` pair.
        """
        if len(content) == 1 and isinstance(content[0], str):
            return content[0]
        return tuple(content)

    @staticmethod
    def o_segment(content):
        """Reduce ``[ ... ]`` to ``("opt", payload)`` or ``("rep_opt", payload)``.

        ``content`` is the list of already-transformed children.
        """
        # "[{ X }]": an optional wrapper around exactly one repeating node is
        # folded into a single "rep_opt" item.
        if len(content) == 1 and isinstance(content[0], tuple) and content[0][0] == 'rep':
            return "rep_opt", content[0][1]
        return "opt", TreeTransformer._payload(content)

    @staticmethod
    def r_segment(content):
        """Reduce ``{ ... }`` to ``("rep", payload)``.

        ``content`` is the list of already-transformed children.
        """
        return "rep", TreeTransformer._payload(content)

    def start(self, content):
        """Build the output list for a sequence of items.

        Args:
            content: iterable of items; each is a segment name (string) or a
                ``(kind, payload)`` tuple from ``o_segment`` / ``r_segment``.

        Returns:
            A list with one dictionary per item. Segments carry
            ``"token_name"``; groups carry ``"segments"``, built by calling
            this method recursively on the group's payload.
        """
        out = []
        for token in content:
            if isinstance(token, str):
                out.append({"name": "placeholder",
                            "opt": False,
                            "rep": False,
                            "description": "Plain Segment",
                            "token_name": token})
                continue

            kind, payload = token
            opt = 'opt' in kind
            rep = 'rep' in kind
            # "Repeating" must follow the rep flag, not the opt flag.
            prefix = ("Optional " if opt else '') + ("Repeating " if rep else '')
            entry = {"name": "placeholder",
                     "opt": opt,
                     "rep": rep}
            if isinstance(payload, str):
                entry["description"] = prefix + "Segment"
                entry["token_name"] = payload
            else:
                entry["description"] = prefix + "Group"
                entry["segments"] = self.start(payload)
            out.append(entry)
        return out
# Demo run: parse the sample message definition from the question.
transformer = TreeTransformer()
tree = l.parse("""
MSH
[ PD1 ]
[{ ROL }]
[
{ ROL }
]
[
{
PR1
[{ ROL }]
}
]
[
{
IN1
[ IN2 ]
[{ IN3 }]
}
]
[ ACC ]
""")
# Transform the parse tree and pretty-print the result as JSON.
print(json.dumps(transformer.transform(tree), indent=4))
我正在尝试解析如下所示的文件:
MSH
[ PD1 ]
[{ ROL }]
[
{ ROL }
]
[
{
PR1
[{ ROL }]
}
]
[
{
IN1
[ IN2 ]
[{ IN3 }]
}
]
[ ACC ]
其中:
- 3 个字母数字代表一个 SEGMENT
- [ SEGMENT ] 表示可选段
- { SEGMENT } 表示一个重复段
- [{ SEGMENT }] 表示一个可选的重复段
- 上述任何形式的 SEGMENT 都可以一起嵌套在可选 ([]) 和/或 重复 ({}) 组中。
- 上面示例文件中的第 4 - 19 行(从 MSH 所在行算作第 1 行)是嵌套重复组的示例。
理想的结果应该是这样的:
{
"MSH": {
"name": "placeholder",
"opt": false,
"rep": false,
"description": "Plain Segment"
},
"PD1": {
"name": "placeholder",
"opt": true,
"rep": false,
"description": "Optional Segment"
},
// some segments here
"group": {
"opt": true,
"rep": false,
"description": "Optionals group placeholder text",
"segment0": {
"ROL": {
"name": "placeholder",
"opt": false,
"rep": true,
"description": "Repeating Segment"
}
}
}
}
我已经阅读了 SO 和 Pyparsing wiki 上的大部分 pyparsing 帖子,包括 fourFn.py 示例和 regexinverter。我认为我需要使用 infixNotation,
但我不太清楚该如何使用它。
这是我目前拥有的:
# Asker's pyparsing attempt (assumes `import pyparsing as pp`, which the
# snippet does not show).

# Delimiters: [...] marks an optional construct, {...} a repeating one.
lbrack = pp.Literal("[")
rbrack = pp.Literal("]")
lbrace = pp.Literal("{")
rbrace = pp.Literal("}")

# A segment name is exactly three alphanumeric characters.
segment = pp.Word(pp.alphanums, exact=3)

# The three bracketed forms of a single segment.
optsegment = lbrack + segment + rbrack
repsegment = lbrace + segment + rbrace
optrepsegment = lbrack + lbrace + segment + rbrace + rbrack

# Any one segment form, each tagged with a results name.
segments = (segment.setResultsName("RawSegment") |
            optsegment.setResultsName("OptionalSegment") |
            repsegment.setResultsName("RepeatingSegment") |
            optrepsegment.setResultsName("OptionalRepeatingSegment"))

# NOTE: each group wraps exactly ONE `segments` match and does not refer back
# to itself, so groups holding several segments or nested groups do not match.
opt_group = pp.Group(lbrack + segments + rbrack)
rep_group = pp.Group(lbrace + segments + rbrace)
message = pp.Group(segments | opt_group | rep_group)

# Each tuple is (operator, operand count, associativity).
# NOTE: `expr` is built here but never used below.
expr = pp.infixNotation(message,
                        [
                            ('[', 2, pp.opAssoc.LEFT),
                            ('{', 2, pp.opAssoc.LEFT),
                            ('}', 1, pp.opAssoc.RIGHT),
                            (']', 1, pp.opAssoc.RIGHT),
                        ])

# `data` is not defined in this snippet; presumably it holds the text of the
# file to parse. The scan uses `message`, not `expr`.
msg = message.searchString(data)
for item in msg:
    print(item)
我还没有确定最终的输出格式,目前只是想先得到正确的解析结果。
下面是使用 lark 库的代码:
import json
import lark
# Build the lark parser for the segment-definition notation.
#
# Grammar summary:
#   SIMPLE_SEGMENT  three characters, each an underscore, letter or digit
#   o_segment       "[" ... "]"  (optional)
#   r_segment       "{" ... "}"  (repeating)
#   _segment        one or more of the above, so bracketed forms can nest;
#                   the leading underscore makes lark inline this rule into
#                   its parent instead of creating a tree node
# Whitespace (including newlines) is ignored.
l = lark.Lark("""
start: _segment
SIMPLE_SEGMENT: ("_"|LETTER|DIGIT)("_"|LETTER|DIGIT)("_"|LETTER|DIGIT)
o_segment: "["_segment"]"
r_segment: "{"_segment"}"
_segment: (SIMPLE_SEGMENT|o_segment|r_segment)+
%import common.LETTER
%import common.DIGIT
%import common.WS
%ignore WS
""", parser='lalr') # LALR(1) is requested explicitly instead of lark's default parser
class TreeTransformer(lark.Transformer):
    """Convert the parse tree into a JSON-serialisable list of dictionaries.

    ``o_segment`` and ``r_segment`` reduce every bracketed node to a
    ``(kind, payload)`` tuple, where ``kind`` is ``"opt"``, ``"rep"`` or
    ``"rep_opt"`` and ``payload`` is either a single segment name (a string)
    or a tuple of child items (a group). ``start`` turns those items into
    dictionaries.
    """

    @staticmethod
    def _payload(content):
        """Return a lone segment name as-is, anything else as a tuple of items.

        A single bracketed child is deliberately kept wrapped in a tuple so
        that ``start`` iterates over the child item instead of over the
        child's own ``(kind, payload)`` pair.
        """
        if len(content) == 1 and isinstance(content[0], str):
            return content[0]
        return tuple(content)

    @staticmethod
    def o_segment(content):
        """Reduce ``[ ... ]`` to ``("opt", payload)`` or ``("rep_opt", payload)``.

        ``content`` is the list of already-transformed children.
        """
        # "[{ X }]": an optional wrapper around exactly one repeating node is
        # folded into a single "rep_opt" item.
        if len(content) == 1 and isinstance(content[0], tuple) and content[0][0] == 'rep':
            return "rep_opt", content[0][1]
        return "opt", TreeTransformer._payload(content)

    @staticmethod
    def r_segment(content):
        """Reduce ``{ ... }`` to ``("rep", payload)``.

        ``content`` is the list of already-transformed children.
        """
        return "rep", TreeTransformer._payload(content)

    def start(self, content):
        """Build the output list for a sequence of items.

        Args:
            content: iterable of items; each is a segment name (string) or a
                ``(kind, payload)`` tuple from ``o_segment`` / ``r_segment``.

        Returns:
            A list with one dictionary per item. Segments carry
            ``"token_name"``; groups carry ``"segments"``, built by calling
            this method recursively on the group's payload.
        """
        out = []
        for token in content:
            if isinstance(token, str):
                out.append({"name": "placeholder",
                            "opt": False,
                            "rep": False,
                            "description": "Plain Segment",
                            "token_name": token})
                continue

            kind, payload = token
            opt = 'opt' in kind
            rep = 'rep' in kind
            # "Repeating" must follow the rep flag, not the opt flag.
            prefix = ("Optional " if opt else '') + ("Repeating " if rep else '')
            entry = {"name": "placeholder",
                     "opt": opt,
                     "rep": rep}
            if isinstance(payload, str):
                entry["description"] = prefix + "Segment"
                entry["token_name"] = payload
            else:
                entry["description"] = prefix + "Group"
                entry["segments"] = self.start(payload)
            out.append(entry)
        return out
# Demo run: parse the sample message definition from the question.
transformer = TreeTransformer()
tree = l.parse("""
MSH
[ PD1 ]
[{ ROL }]
[
{ ROL }
]
[
{
PR1
[{ ROL }]
}
]
[
{
IN1
[ IN2 ]
[{ IN3 }]
}
]
[ ACC ]
""")
# Transform the parse tree and pretty-print the result as JSON.
print(json.dumps(transformer.transform(tree), indent=4))