Python PLY 解析器的语法错误
Syntax error of Python PLY parser
我正在使用 Python PLY 编写简化的 MODULA-2 语法。
但是我遇到了语法错误:
$ python3 m2.py
Syntax error at 'MODULE'
而且我不知道规则有什么问题。
语法如下:
import ply.lex as lex
import ply.yacc as yacc
# =============================================================================
# Lexer rules
# =============================================================================
# `tokens` lists every token name used by the lexer rules and the grammar.
tokens = (
# Keywords
'RETURN', 'IF', 'THEN', 'VAR', 'MODULE', 'BEGIN', 'END',
# Constants
'NUMBER',
# Operators
'PLUS', 'MINUS', 'TIMES', 'DIV', 'MOD', 'ASSIGN_OP',
# Separators
'LPAR', 'RPAR', 'PERIOD', 'COLON', 'SEMICOLON',
# Identifier
'IDENT',
)
# Tokens: each t_<NAME> string below is the regular expression for <NAME>.
# NOTE(review): this section defines no t_ rule for the keyword tokens or for
# DIV / MOD, so an all-letter word such as MODULE can only match t_IDENT.
t_NUMBER = r'\d+'
t_PLUS = r'\+'
t_MINUS = r'-'
t_TIMES = r'\*'
t_LPAR = r'\('
t_RPAR = r'\)'
t_PERIOD = r'\.'
t_COLON = r':'
t_SEMICOLON = r';'
# ':=' begins with the same character as t_COLON.
t_ASSIGN_OP = r':='
t_IDENT = r'[a-zA-Z][a-zA-Z0-9]*'
# Ignored characters: spaces and tabs are skipped between tokens.
t_ignore = ' \t'
def t_error(t):
    # Lexer error hook: report the first offending character, then skip
    # exactly one character so scanning can resume after it.
    # (Comments are used instead of a docstring because docstrings on t_*
    # functions are treated as token regexes in this file.)
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)
# Build the lexer
lexer = lex.lex()
# =============================================================================
# Parser rules
# =============================================================================
# Operator associativity / precedence table; both levels are left-associative.
# NOTE(review): MOD is an alternative of mul_operator but is not listed next
# to TIMES / DIV here -- confirm whether that omission is intended.
precedence = (
('left', 'PLUS', 'MINUS'),
('left', 'TIMES', 'DIV'),
)
def p_add_operator(t):
    """ add_operator : PLUS
                     | MINUS
    """
    # The docstring is the grammar production; there is no semantic action,
    # so this rule only checks syntax.
    pass
def p_mul_operator(t):
    """ mul_operator : TIMES
                     | DIV
                     | MOD
    """
    # The docstring is the grammar production; there is no semantic action,
    # so this rule only checks syntax.
    pass
def p_simple_expression(t):
    """ expression : term
                   | expression add_operator term
    """
    # Left-recursive production: an expression is one term or an expression
    # followed by an add_operator and another term. No semantic action.
    pass
def p_term(t):
    """ term : factor
             | term mul_operator factor
    """
    # Left-recursive production: a term is one factor or a term followed by
    # a mul_operator and another factor. No semantic action.
    pass
def p_factor(t):
    """ factor : NUMBER
               | IDENT
               | LPAR expression RPAR
    """
    # A factor is a number, an identifier, or a parenthesised expression.
    # No semantic action is attached.
    pass
def p_statement(t):
    """ statement : IDENT
                  | IDENT ASSIGN_OP expression
                  | IF expression THEN statement_sequence END
                  | RETURN expression
    """
    # Four statement forms: a bare identifier, an assignment, an
    # IF ... THEN ... END construct, and RETURN with an expression.
    # No semantic action is attached.
    pass
def p_statement_sequence(t):
    """ statement_sequence : statement
                           | statement_sequence SEMICOLON statement
    """
    # One or more statements separated by SEMICOLON (a separator, not a
    # terminator: the production does not allow a trailing SEMICOLON).
    pass
def p_block(t):
    """ block : declaration_list BEGIN statement_sequence END
    """
    # A block is a declaration_list followed by a statement_sequence
    # enclosed in BEGIN ... END. No semantic action is attached.
    pass
def p_declaration_list(t):
    """ declaration_list : declaration
                         | declaration_list declaration
    """
    # One or more declarations; the production has no empty alternative,
    # so at least one declaration is required.
    pass
def p_declaration(t):
    """ declaration : VAR IDENT COLON IDENT SEMICOLON
    """
    # A single variable declaration of the shape `VAR name : type ;`, where
    # both the name and the type are IDENT tokens. No semantic action.
    pass
def p_program_module(t):
    """ program_module : MODULE IDENT SEMICOLON block IDENT PERIOD
    """
    # Top-level production: `MODULE name ; block name .`. The grammar does
    # not check that the two IDENT tokens carry the same name.
    pass
def p_error(t):
    # Parser error hook. PLY passes None instead of a token when the input
    # ends unexpectedly, so guard before touching t.value.
    if t is None:
        print("Syntax error at end of input")
        return
    # Wrap the value in a 1-tuple so a tuple-valued token cannot break the
    # %-formatting; for string values the output is unchanged.
    print("Syntax error at '%s'" % (t.value,))
# Build the parser with 'program_module' as the grammar's start symbol.
parser = yacc.yacc(start='program_module')

if __name__ == "__main__":
    # Sample input exercising the module header, one declaration and one
    # assignment statement.
    s = "MODULE test; VAR x: INTEGER; BEGIN x := 10 END test."
    parser.parse(s)
有趣的是,为 lex/yacc 编写的相同语法规则运行良好。有人可以帮我解决这个问题吗?
据我所知,ply.lex 并没有神奇到能自动知道你想把 MODULE 这个特殊单词识别为 MODULE 标记。
根据你的定义,简单测试:
# Feed the sample program to the lexer and print every token it produces.
lexer.input("MODULE test; VAR x: INTEGER; BEGIN x := 10 END test.")
for tok in lexer:
    print(tok)
输出:
LexToken(IDENT,'MODULE',1,0)
LexToken(IDENT,'test',1,7)
LexToken(SEMICOLON,';',1,11)
LexToken(IDENT,'VAR',1,13)
LexToken(IDENT,'x',1,17)
LexToken(COLON,':',1,18)
LexToken(IDENT,'INTEGER',1,20)
LexToken(SEMICOLON,';',1,27)
LexToken(IDENT,'BEGIN',1,29)
LexToken(IDENT,'x',1,35)
LexToken(ASSIGN_OP,':=',1,37)
LexToken(NUMBER,'10',1,40)
LexToken(IDENT,'END',1,43)
LexToken(IDENT,'test',1,47)
LexToken(PERIOD,'.',1,51)
处理关键字的正确方法是在 IDENT 标记的规则中识别它们:
# =============================================================================
# Lexer rules
# =============================================================================
# Keywords
keywords = ( 'RETURN', 'IF', 'THEN', 'VAR', 'MODULE', 'BEGIN', 'END' )
tokens = keywords + (
# Constants
'NUMBER',
...
和
def t_IDENT(t):
    r'[a-zA-Z][a-zA-Z0-9]*'
    # The raw string above is this token's regular expression (it must stay
    # the first statement of the function).
    # Reserved words match the identifier pattern too, so retag them here:
    # the keyword's text doubles as its token type name.
    if t.value in keywords:  # is this a keyword
        t.type = t.value
    # Always return the token; returning inside the `if` would silently
    # drop every ordinary identifier.
    return t
同样的词法分析器测试现在可以给出正确的结果:
LexToken(MODULE,'MODULE',1,0)
LexToken(IDENT,'test',1,7)
LexToken(SEMICOLON,';',1,11)
LexToken(VAR,'VAR',1,13)
LexToken(IDENT,'x',1,17)
LexToken(COLON,':',1,18)
LexToken(IDENT,'INTEGER',1,20)
LexToken(SEMICOLON,';',1,27)
LexToken(BEGIN,'BEGIN',1,29)
LexToken(IDENT,'x',1,35)
LexToken(ASSIGN_OP,':=',1,37)
LexToken(NUMBER,'10',1,40)
LexToken(END,'END',1,43)
LexToken(IDENT,'test',1,47)
LexToken(PERIOD,'.',1,51)
并且解析没有错误。
我正在使用 Python PLY 编写简化的 MODULA-2 语法。
但是我遇到了语法错误:
$ python3 m2.py
Syntax error at 'MODULE'
而且我不知道规则有什么问题。
语法如下:
import ply.lex as lex
import ply.yacc as yacc
# =============================================================================
# Lexer rules
# =============================================================================
# `tokens` lists every token name used by the lexer rules and the grammar.
tokens = (
# Keywords
'RETURN', 'IF', 'THEN', 'VAR', 'MODULE', 'BEGIN', 'END',
# Constants
'NUMBER',
# Operators
'PLUS', 'MINUS', 'TIMES', 'DIV', 'MOD', 'ASSIGN_OP',
# Separators
'LPAR', 'RPAR', 'PERIOD', 'COLON', 'SEMICOLON',
# Identifier
'IDENT',
)
# Tokens: each t_<NAME> string below is the regular expression for <NAME>.
# NOTE(review): this section defines no t_ rule for the keyword tokens or for
# DIV / MOD, so an all-letter word such as MODULE can only match t_IDENT.
t_NUMBER = r'\d+'
t_PLUS = r'\+'
t_MINUS = r'-'
t_TIMES = r'\*'
t_LPAR = r'\('
t_RPAR = r'\)'
t_PERIOD = r'\.'
t_COLON = r':'
t_SEMICOLON = r';'
# ':=' begins with the same character as t_COLON.
t_ASSIGN_OP = r':='
t_IDENT = r'[a-zA-Z][a-zA-Z0-9]*'
# Ignored characters: spaces and tabs are skipped between tokens.
t_ignore = ' \t'
def t_error(t):
    # Lexer error hook: report the first offending character, then skip
    # exactly one character so scanning can resume after it.
    # (Comments are used instead of a docstring because docstrings on t_*
    # functions are treated as token regexes in this file.)
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)
# Build the lexer
lexer = lex.lex()
# =============================================================================
# Parser rules
# =============================================================================
# Operator associativity / precedence table; both levels are left-associative.
# NOTE(review): MOD is an alternative of mul_operator but is not listed next
# to TIMES / DIV here -- confirm whether that omission is intended.
precedence = (
('left', 'PLUS', 'MINUS'),
('left', 'TIMES', 'DIV'),
)
def p_add_operator(t):
    """ add_operator : PLUS
                     | MINUS
    """
    # The docstring is the grammar production; there is no semantic action,
    # so this rule only checks syntax.
    pass
def p_mul_operator(t):
    """ mul_operator : TIMES
                     | DIV
                     | MOD
    """
    # The docstring is the grammar production; there is no semantic action,
    # so this rule only checks syntax.
    pass
def p_simple_expression(t):
    """ expression : term
                   | expression add_operator term
    """
    # Left-recursive production: an expression is one term or an expression
    # followed by an add_operator and another term. No semantic action.
    pass
def p_term(t):
    """ term : factor
             | term mul_operator factor
    """
    # Left-recursive production: a term is one factor or a term followed by
    # a mul_operator and another factor. No semantic action.
    pass
def p_factor(t):
    """ factor : NUMBER
               | IDENT
               | LPAR expression RPAR
    """
    # A factor is a number, an identifier, or a parenthesised expression.
    # No semantic action is attached.
    pass
def p_statement(t):
    """ statement : IDENT
                  | IDENT ASSIGN_OP expression
                  | IF expression THEN statement_sequence END
                  | RETURN expression
    """
    # Four statement forms: a bare identifier, an assignment, an
    # IF ... THEN ... END construct, and RETURN with an expression.
    # No semantic action is attached.
    pass
def p_statement_sequence(t):
    """ statement_sequence : statement
                           | statement_sequence SEMICOLON statement
    """
    # One or more statements separated by SEMICOLON (a separator, not a
    # terminator: the production does not allow a trailing SEMICOLON).
    pass
def p_block(t):
    """ block : declaration_list BEGIN statement_sequence END
    """
    # A block is a declaration_list followed by a statement_sequence
    # enclosed in BEGIN ... END. No semantic action is attached.
    pass
def p_declaration_list(t):
    """ declaration_list : declaration
                         | declaration_list declaration
    """
    # One or more declarations; the production has no empty alternative,
    # so at least one declaration is required.
    pass
def p_declaration(t):
    """ declaration : VAR IDENT COLON IDENT SEMICOLON
    """
    # A single variable declaration of the shape `VAR name : type ;`, where
    # both the name and the type are IDENT tokens. No semantic action.
    pass
def p_program_module(t):
    """ program_module : MODULE IDENT SEMICOLON block IDENT PERIOD
    """
    # Top-level production: `MODULE name ; block name .`. The grammar does
    # not check that the two IDENT tokens carry the same name.
    pass
def p_error(t):
    # Parser error hook. PLY passes None instead of a token when the input
    # ends unexpectedly, so guard before touching t.value.
    if t is None:
        print("Syntax error at end of input")
        return
    # Wrap the value in a 1-tuple so a tuple-valued token cannot break the
    # %-formatting; for string values the output is unchanged.
    print("Syntax error at '%s'" % (t.value,))
# Build the parser with 'program_module' as the grammar's start symbol.
parser = yacc.yacc(start='program_module')

if __name__ == "__main__":
    # Sample input exercising the module header, one declaration and one
    # assignment statement.
    s = "MODULE test; VAR x: INTEGER; BEGIN x := 10 END test."
    parser.parse(s)
有趣的是,为 lex/yacc 编写的相同语法规则运行良好。有人可以帮我解决这个问题吗?
据我所知,ply.lex 并没有神奇到能自动知道你想把 MODULE 这个特殊单词识别为 MODULE 标记。
根据你的定义,简单测试:
# Feed the sample program to the lexer and print every token it produces.
lexer.input("MODULE test; VAR x: INTEGER; BEGIN x := 10 END test.")
for tok in lexer:
    print(tok)
输出:
LexToken(IDENT,'MODULE',1,0)
LexToken(IDENT,'test',1,7)
LexToken(SEMICOLON,';',1,11)
LexToken(IDENT,'VAR',1,13)
LexToken(IDENT,'x',1,17)
LexToken(COLON,':',1,18)
LexToken(IDENT,'INTEGER',1,20)
LexToken(SEMICOLON,';',1,27)
LexToken(IDENT,'BEGIN',1,29)
LexToken(IDENT,'x',1,35)
LexToken(ASSIGN_OP,':=',1,37)
LexToken(NUMBER,'10',1,40)
LexToken(IDENT,'END',1,43)
LexToken(IDENT,'test',1,47)
LexToken(PERIOD,'.',1,51)
处理关键字的正确方法是在 IDENT 标记的规则中识别它们:
# =============================================================================
# Lexer rules
# =============================================================================
# Keywords
keywords = ( 'RETURN', 'IF', 'THEN', 'VAR', 'MODULE', 'BEGIN', 'END' )
tokens = keywords + (
# Constants
'NUMBER',
...
和
def t_IDENT(t):
    r'[a-zA-Z][a-zA-Z0-9]*'
    # The raw string above is this token's regular expression (it must stay
    # the first statement of the function).
    # Reserved words match the identifier pattern too, so retag them here:
    # the keyword's text doubles as its token type name.
    if t.value in keywords:  # is this a keyword
        t.type = t.value
    # Always return the token; returning inside the `if` would silently
    # drop every ordinary identifier.
    return t
同样的词法分析器测试现在可以给出正确的结果:
LexToken(MODULE,'MODULE',1,0)
LexToken(IDENT,'test',1,7)
LexToken(SEMICOLON,';',1,11)
LexToken(VAR,'VAR',1,13)
LexToken(IDENT,'x',1,17)
LexToken(COLON,':',1,18)
LexToken(IDENT,'INTEGER',1,20)
LexToken(SEMICOLON,';',1,27)
LexToken(BEGIN,'BEGIN',1,29)
LexToken(IDENT,'x',1,35)
LexToken(ASSIGN_OP,':=',1,37)
LexToken(NUMBER,'10',1,40)
LexToken(END,'END',1,43)
LexToken(IDENT,'test',1,47)
LexToken(PERIOD,'.',1,51)
并且解析没有错误。