如何生成歧义语法的所有可能解析

How to generate all possible parsings of an ambiguous grammar

我查看了 quite a few grammar parsers, but none of them seem to be able to generate all parsings of an ambiguous grammar. (I've also checked these ,它没有为我的问题提供任何有用的解决方案。)

我怎样才能生成所有歧义语法的解析?

For example,想象一下我的(模棱两可的)语法是

E -> E E
   | id

那么,id id id 有两个有效的解析。可以是

  1. E(E(id) E(id)) E(id)
  2. E(id) E(E(id) E(id)).

如何在语法不明确的情况下生成 Python 中字符串的所有有效解析?

明确地说,我不太关心输入和输出的格式,只要它可以接受语法和字符串并使用该语法输出该字符串的所有有效解析即可。

根据语法的复杂程度,您可能可以自己实现。大多数库都进行了优化以帮助解决歧义,但如果你想要所有选项,你可以只做一个简单的递归:

import abc


class Rule(metaclass=abc.ABCMeta):
  @abc.abstractmethod
  def parse(self, data):
    """Returns a generator of (result, remaining) tuples.
    
    Parses data and returns all possible tuples of result (of parsing so far)
    and remaining (data to be parsed). **Result must be a list**
    """
    raise NotImplementedError()

class Node:
  """A tree node we can nicely print"""
  def __init__(self, name, *values):
    self.name = name
    self.values = list(values)
  
  def __repr__(self) -> str:
      return str(self)

  def __str__(self):
    return f'{self.name}({",".join(str(v) for v in self.values)})'


class Literal(Rule):
  """A literal which we expect to find as is in the input."""
  def __init__(self, value, *, name='Literal'):
    self.value = value
    self.name = name
  
  def parse(self, data):
    if not data:
      return
    if data[0] == self.value:
      yield ([Node(self.name, self.value)], data[1:])


class Sequence(Rule):
  """A sequence of one-or-more rules to find in the input."""
  def __init__(self, *rules, name='Sequence'):
    self.rules = list(rules)
    self.name = name

  def recursive_parse_helper(self, next_rules, data):
    if len(next_rules) == 1:
      yield from next_rules[0].parse(data)
      return
    for result, more_data in next_rules[0].parse(data):
      for remaining_result, remaining_data in self.recursive_parse_helper(next_rules[1:], more_data):
        yield ([Node(self.name, *(result + remaining_result))], remaining_data)

  def parse(self, data):
    yield from self.recursive_parse_helper(self.rules, data)


class OneOf(Rule):
  """A list of one-or-more rules that can be used to match the input."""
  def __init__(self, *rules):
    self.rules = list(rules)

  def parse(self, data):
    for rule in self.rules:
      yield from rule.parse(data)


def parse(rule, data):
  """Parse the input until no more data is remaining. Return only full matches."""
  for result, remaining in rule.parse(data):
    if remaining:
      for more_results in parse(rule, remaining):
        yield result + more_results
    else:
      yield result

这很好用:

E0 = Literal('id', name='E')
E = OneOf(Sequence(E0, E0), E0)
> list(parse(E, ['id', 'id', 'id']))
[[Sequence(E(id),E(id)), E(id)],
 [E(id), Sequence(E(id),E(id))],
 [E(id), E(id), E(id)]]

一个选择是根据语法规则保留一个 运行ning 列表,其中包含正在构建到抽象语法树中的标记。该算法可以分为两部分:

  1. 读入一个令牌并将其推送到存储正在构建的 AST 的每个列表。
  2. 语法规则 运行 针对每个子列表,衍生出后续的潜在匹配项。

使用字典定义语法:

grammar = [
    ('E', {'op':'or',
           'left':{'op':'and', 'left':'E', 'right':'E'},
           'right':'id'
          }
      )
]

src = 'id id id'
def matches(name, pattern, tree):
   if not tree:
      return tree, None, 0
   if isinstance(pattern, str):
      if tree[-1]['name'] == pattern:
         return tree[:-1], {'name':name, 'value':tree[-1]} if 'value' not in tree[-1] else tree[-1], 1
      return tree, None, 0
   if pattern['op']=='or':
      if (m:=matches(name, pattern['right'], tree))[-1] or (m:=matches(name, pattern['left'], tree))[-1]:
        return m
      return tree, None, 0
   if (m1:=matches(name, pattern['right'], tree))[-1] and (m2:=matches(name, pattern['left'], m1[0]))[-1]:
      return m2[0], {'name':name, 'left':m1[1], 'right':m2[1]}, 1
   return tree, None, 0

def reduce(tree):
  yield (tree[:-1], tree[-1], 0)
  for a, b in grammar:
    if (m:=matches(a, b, tree))[-1]:
      yield m
    
def get_asts(stream, queue=[]):
  if queue:
     if not stream and len(queue)==1:
        yield queue
     for a, b, c in reduce(queue):
       if c:
          yield from get_asts(stream, a+[b])
  if stream:
     yield from get_asts(stream[1:], queue+[stream[0]])

t = [{'name':i} for i in src.split()]

import json
for i in get_asts(t):
   print(json.dumps(i, indent=4))
[
    {
        "name": "E",
        "left": {
            "name": "E",
            "value": {
                "name": "id"
            }
        },
        "right": {
            "name": "E",
            "value": {
                "name": "E",
                "left": {
                    "name": "E",
                    "value": {
                        "name": "id"
                    }
                },
                "right": {
                    "name": "E",
                    "value": {
                        "name": "id"
                    }
                }
            }
        }
    }
]
[
    {
        "name": "E",
        "left": {
            "name": "E",
            "value": {
                "name": "E",
                "left": {
                    "name": "E",
                    "value": {
                        "name": "id"
                    }
                },
                "right": {
                    "name": "E",
                    "value": {
                        "name": "id"
                    }
                }
            }
        },
        "right": {
            "name": "E",
            "value": {
                "name": "id"
            }
        }
    }
]

这不是推荐,我确信还有其他 Python 解析器生成器能够生成 Earley 解析器(或其变体)。

但是当然可以使用 Lark(来自 OP 的 link)来生成解析林。 Select 一个“earley”解析器并将 ambiguity 选项设置为“forest”(如果没有太多歧义,则设置为“explicit”)。