使用 antlr2 解析 edifact 文件时,如果属性值包含关键字就会出错
Error when trying to parse an edifact-file with antlr2 when an attribute value contains a keyword
我接到了一项吃力不讨好的任务:修复一个用于解析 edifact 文件的旧 antlr2 解析器中的错误。不幸的是,我对 antlr2 和解析器都完全不熟悉,始终没能让它正常工作。
edifact 文件如下所示:
ABC+Name+Surname+zip+city+street+country+1961219++0037141008'
XYZ+Company+++XYZ+zip+street'
LMN+20081010+1100'
有几种不同的段,每个段都以关键字开头,例如 XYZ 或 ABC。关键字后面是各个属性值,属性值之间用“+”分隔,属性值可以为空。每个段以 ' 结尾。
问题是,只要数据属性包含关键字,解析器就会抛出错误:
unexpected token: XYZ
XYZ+Company+++XYZ+zip+street'
这是语法文件的摘录:
// $ANTLR 2.7.6
// Parser rule for one XYZ segment: the literal "XYZ", then data elements separated by
// ELT_SEP, then SEG_TERM. Every element is accepted as either ANUM or NUM. The first and
// the sixth element are mandatory (no `?`); the others, including the trailing seventh
// element together with its separator, are optional.
// The `!` suffix on ELT_SEP / SEG_TERM excludes those tokens from AST construction.
// Each {...}? is a validating semantic predicate: when the call yields false the parser
// raises a SemanticException. lq / lq_ are defined outside this excerpt — presumably they
// check the element against the maximum length given as first argument; TODO confirm.
xyz: "XYZ" ELT_SEP!
(xyz1_1a:ANUM|xyz1_1b:NUM) {lq(90,xyz1_1a,xyz1_1b,"XYZ1-1"+LQ90)}? ELT_SEP!
(xyz1_2a:ANUM|xyz1_2b:NUM)? {lq_(90,xyz1_2a,xyz1_2b,"XYZ1-2"+LQ90)}? ELT_SEP!
(xyz1_3a:ANUM|xyz1_3b:NUM)? {lq_(90,xyz1_3a,xyz1_3b,"XYZ1-3"+LQ90)}? ELT_SEP!
(xyz2a:ANUM|xyz2b:NUM)? {lq_(3,xyz2a,xyz2b,"XYZ2"+LQ3)}? ELT_SEP!
(xyz3a:ANUM|xyz3b:NUM)? {lq_(6,xyz3a,xyz3b,"XYZ3"+LQ6)}? ELT_SEP!
(xyz4a:ANUM|xyz4b:NUM) {lq(30,xyz4a,xyz4b,"XYZ4"+LQ30)}?
(ELT_SEP! (xyz5a:ANUM|xyz5b:NUM)?)? {lq_(46,xyz5a,xyz5b,"XYZ5"+LQ46)}? SEG_TERM!
{
// Leave the rule without building a record when skipNachricht() returns true.
if (skipNachricht()) return;
Xyz xyz = new Xyz();
// nn(a, b) is defined elsewhere — presumably it picks whichever of the two labels
// actually matched; TODO confirm. Only one label of each ANUM/NUM pair can be set.
xyz.xyz1_1 = getText(nn(xyz1_1a, xyz1_1b));
xyz.xyz1_2 = getText(nn(xyz1_2a, xyz1_2b));
xyz.xyz1_3 = getText(nn(xyz1_3a, xyz1_3b));
xyz.xyz2 = getText(nn(xyz2a, xyz2b));
xyz.xyz3 = getText(nn(xyz3a, xyz3b));
xyz.xyz4 = getText(nn(xyz4a, xyz4b));
xyz.xyz5 = getText(nn(xyz5a, xyz5b));
// Pass the populated record on to the segment handler.
handleXyz(xyz);
}
;
/*
* Lexer
*/
// Lexer that splits EDIFACT input into separator, terminator and value tokens.
class EdifactLexer extends Lexer;
options {
    k=2;                            // two characters of lookahead
    filter=true;                    // skip input that matches no non-protected rule
    // Latin: 8-bit character range. The octal escapes had been lost, leaving the
    // invalid range ''..'7' (an empty character literal is not legal).
    charVocabulary = '\3'..'\377';
}
// A stand-alone ','. (A comma between two digit groups is matched inside NUM instead.)
DEZ_SEP
    : ','
    ;
// Element separator: a single '+'.
ELT_SEP
    : '+'
    ;
// Segment terminator: a single apostrophe.
SEG_TERM
    : '\''
    ;
// Value token. Three alternatives, each guarded by a syntactic predicate ( (...) => ):
//  1. digits, optionally followed by ',' and more digits. The predicate additionally
//     requires a '+' or '\'' right after the number; that character is only looked at,
//     not consumed. The token keeps the type NUM.
//  2. one or more ESCAPED sequences or characters other than ? + ' , CR LF.
//     The token type is changed to ANUM.
//  3. a WRONGLY_ESCAPED sequence. The token type is changed to WRONGLY_ESCAPED.
NUM: (('0'..'9')+ (',' ('0'..'9')+)? ('+' | '\''))
=> ('0'..'9')+ (',' ('0'..'9')+)?
{
//System.out.println("num_: " + getText());
}
|
((ESCAPED | ~('?' | '+' | '\'' | ',' | '\r' | '\n'))+ )
=> ( ESCAPED | ~('?' | '+' | '\'' | ',' | '\r' | '\n'))+
{
$setType(ANUM);
//System.out.println("anum: " + getText());
}
|
(WRONGLY_ESCAPED) => WRONGLY_ESCAPED
{$setType(WRONGLY_ESCAPED); }
;
// Protected rule: it is only invoked from other lexer rules (NUM in this grammar), never
// tried on its own by nextToken(). Matches '?' followed by one character that is NOT one
// of ? : + ' , — i.e. a '?' that does not start any sequence accepted by ESCAPED.
protected
WRONGLY_ESCAPED: '?' ~('?' | ':' | '+' | '\'' | ',')
{
//System.out.println("Found wrong_escaped: " + getText());
}
;
// Protected rule (only invoked from other lexer rules): '?' followed by one of , ? ' : +.
// Each branch calls $setText with just the second character, so the leading '?' is meant
// to be dropped from the text.
// NOTE(review): confirm how $setText behaves when ESCAPED is matched repeatedly inside
// the (...)+ loop of NUM's ANUM alternative.
protected
ESCAPED: '?'
( ',' {$setText(","); }
| '?' {$setText("?"); }
| '\'' {$setText("'"); }
| ':' {$setText(":"); }
| '+' {$setText("+"); }
)
{
//System.out.println("Found escaped: " + getText());
}
;
// Line break in any of the three conventions; the lexer's line bookkeeping is updated via
// newline() and the token is discarded (Token.SKIP) so the parser never sees it.
NEWLINE
    : ( "\r\n"  // DOS
      | '\r'    // MAC
      | '\n'    // Unix
      )
      {
          newline();
          $setType(Token.SKIP);
      }
    ;
非常感谢任何帮助:)。
这可能不是最好的解决方案,但我终于找到了解决问题的方法。
所以,如果有人遇到类似问题,这是我的解决方案:
我写了一个方法,如果当前标记类型匹配我的任何关键字,它将标记类型更改为 ANUM:
/**
 * Retypes the next lookahead token to ANUM when its current type is contained in
 * mKeywordList; any other token is left untouched.
 *
 * @throws TokenStreamException declared because LT(1) may have to fetch a token
 * @throws SemanticException declared by the original signature
 */
void ckt() throws TokenStreamException, SemanticException {
    final boolean isKeyword = mKeywordList.contains(LT(1).getType());
    if (!isKeyword) {
        return;
    }
    LT(1).setType(ANUM);
}
在解析器规则中,每次尝试访问 ANUM 标记之前都会调用该方法:
// Parser rule for one XYZ segment, with the keyword workaround applied: the action
// {ckt();} is placed in front of every ANUM|NUM element so that ckt() runs before the
// parser decides on that element.
// Otherwise the rule is: literal "XYZ", elements separated by ELT_SEP, then SEG_TERM.
// The first and sixth element are mandatory (no `?`); the others are optional.
// The `!` suffix excludes ELT_SEP / SEG_TERM from AST construction. Each {...}? is a
// validating semantic predicate (SemanticException when false). lq / lq_ are defined
// outside this excerpt — presumably maximum-length checks; TODO confirm.
xyz: "XYZ" ELT_SEP!
{ckt();}(xyz1_1a:ANUM|xyz1_1b:NUM) {lq(90,xyz1_1a,xyz1_1b,"XYZ1-1"+LQ90)}? ELT_SEP!
{ckt();}(xyz1_2a:ANUM|xyz1_2b:NUM)? {lq_(90,xyz1_2a,xyz1_2b,"XYZ1-2"+LQ90)}? ELT_SEP!
{ckt();}(xyz1_3a:ANUM|xyz1_3b:NUM)? {lq_(90,xyz1_3a,xyz1_3b,"XYZ1-3"+LQ90)}? ELT_SEP!
{ckt();}(xyz2a:ANUM|xyz2b:NUM)? {lq_(3,xyz2a,xyz2b,"XYZ2"+LQ3)}? ELT_SEP!
{ckt();}(xyz3a:ANUM|xyz3b:NUM)? {lq_(6,xyz3a,xyz3b,"XYZ3"+LQ6)}? ELT_SEP!
{ckt();}(xyz4a:ANUM|xyz4b:NUM) {lq(30,xyz4a,xyz4b,"XYZ4"+LQ30)}?
(ELT_SEP! {ckt();}(xyz5a:ANUM|xyz5b:NUM)?)? {lq_(46,xyz5a,xyz5b,"XYZ5"+LQ46)}? SEG_TERM!
{
// Leave the rule without building a record when skipNachricht() returns true.
if (skipNachricht()) return;
Xyz xyz = new Xyz();
// nn(a, b) is defined elsewhere — presumably it picks whichever of the two labels
// actually matched; TODO confirm.
xyz.xyz1_1 = getText(nn(xyz1_1a, xyz1_1b));
xyz.xyz1_2 = getText(nn(xyz1_2a, xyz1_2b));
xyz.xyz1_3 = getText(nn(xyz1_3a, xyz1_3b));
xyz.xyz2 = getText(nn(xyz2a, xyz2b));
xyz.xyz3 = getText(nn(xyz3a, xyz3b));
xyz.xyz4 = getText(nn(xyz4a, xyz4b));
xyz.xyz5 = getText(nn(xyz5a, xyz5b));
// Pass the populated record on to the segment handler.
handleXyz(xyz);
}
;
我接到了一项吃力不讨好的任务:修复一个用于解析 edifact 文件的旧 antlr2 解析器中的错误。不幸的是,我对 antlr2 和解析器都完全不熟悉,始终没能让它正常工作。
edifact 文件如下所示:
ABC+Name+Surname+zip+city+street+country+1961219++0037141008'
XYZ+Company+++XYZ+zip+street'
LMN+20081010+1100'
有几种不同的段,每个段都以关键字开头,例如 XYZ 或 ABC。关键字后面是各个属性值,属性值之间用“+”分隔,属性值可以为空。每个段以 ' 结尾。
问题是,只要某个属性值包含关键字,解析器就会抛出错误:
unexpected token: XYZ
XYZ+Company+++XYZ+zip+street'
这是语法文件的摘录:
// $ANTLR 2.7.6
// Parser rule for one XYZ segment: the literal "XYZ", then data elements separated by
// ELT_SEP, then SEG_TERM. Every element is accepted as either ANUM or NUM. The first and
// the sixth element are mandatory (no `?`); the others, including the trailing seventh
// element together with its separator, are optional.
// The `!` suffix on ELT_SEP / SEG_TERM excludes those tokens from AST construction.
// Each {...}? is a validating semantic predicate: when the call yields false the parser
// raises a SemanticException. lq / lq_ are defined outside this excerpt — presumably they
// check the element against the maximum length given as first argument; TODO confirm.
xyz: "XYZ" ELT_SEP!
(xyz1_1a:ANUM|xyz1_1b:NUM) {lq(90,xyz1_1a,xyz1_1b,"XYZ1-1"+LQ90)}? ELT_SEP!
(xyz1_2a:ANUM|xyz1_2b:NUM)? {lq_(90,xyz1_2a,xyz1_2b,"XYZ1-2"+LQ90)}? ELT_SEP!
(xyz1_3a:ANUM|xyz1_3b:NUM)? {lq_(90,xyz1_3a,xyz1_3b,"XYZ1-3"+LQ90)}? ELT_SEP!
(xyz2a:ANUM|xyz2b:NUM)? {lq_(3,xyz2a,xyz2b,"XYZ2"+LQ3)}? ELT_SEP!
(xyz3a:ANUM|xyz3b:NUM)? {lq_(6,xyz3a,xyz3b,"XYZ3"+LQ6)}? ELT_SEP!
(xyz4a:ANUM|xyz4b:NUM) {lq(30,xyz4a,xyz4b,"XYZ4"+LQ30)}?
(ELT_SEP! (xyz5a:ANUM|xyz5b:NUM)?)? {lq_(46,xyz5a,xyz5b,"XYZ5"+LQ46)}? SEG_TERM!
{
// Leave the rule without building a record when skipNachricht() returns true.
if (skipNachricht()) return;
Xyz xyz = new Xyz();
// nn(a, b) is defined elsewhere — presumably it picks whichever of the two labels
// actually matched; TODO confirm. Only one label of each ANUM/NUM pair can be set.
xyz.xyz1_1 = getText(nn(xyz1_1a, xyz1_1b));
xyz.xyz1_2 = getText(nn(xyz1_2a, xyz1_2b));
xyz.xyz1_3 = getText(nn(xyz1_3a, xyz1_3b));
xyz.xyz2 = getText(nn(xyz2a, xyz2b));
xyz.xyz3 = getText(nn(xyz3a, xyz3b));
xyz.xyz4 = getText(nn(xyz4a, xyz4b));
xyz.xyz5 = getText(nn(xyz5a, xyz5b));
// Pass the populated record on to the segment handler.
handleXyz(xyz);
}
;
/*
* Lexer
*/
// Lexer that splits EDIFACT input into separator, terminator and value tokens.
class EdifactLexer extends Lexer;
options {
    k=2;                            // two characters of lookahead
    filter=true;                    // skip input that matches no non-protected rule
    // Latin: 8-bit character range. The octal escapes had been lost, leaving the
    // invalid range ''..'7' (an empty character literal is not legal).
    charVocabulary = '\3'..'\377';
}
// A stand-alone ','. (A comma between two digit groups is matched inside NUM instead.)
DEZ_SEP
    : ','
    ;
// Element separator: a single '+'.
ELT_SEP
    : '+'
    ;
// Segment terminator: a single apostrophe.
SEG_TERM
    : '\''
    ;
// Value token. Three alternatives, each guarded by a syntactic predicate ( (...) => ):
//  1. digits, optionally followed by ',' and more digits. The predicate additionally
//     requires a '+' or '\'' right after the number; that character is only looked at,
//     not consumed. The token keeps the type NUM.
//  2. one or more ESCAPED sequences or characters other than ? + ' , CR LF.
//     The token type is changed to ANUM.
//  3. a WRONGLY_ESCAPED sequence. The token type is changed to WRONGLY_ESCAPED.
NUM: (('0'..'9')+ (',' ('0'..'9')+)? ('+' | '\''))
=> ('0'..'9')+ (',' ('0'..'9')+)?
{
//System.out.println("num_: " + getText());
}
|
((ESCAPED | ~('?' | '+' | '\'' | ',' | '\r' | '\n'))+ )
=> ( ESCAPED | ~('?' | '+' | '\'' | ',' | '\r' | '\n'))+
{
$setType(ANUM);
//System.out.println("anum: " + getText());
}
|
(WRONGLY_ESCAPED) => WRONGLY_ESCAPED
{$setType(WRONGLY_ESCAPED); }
;
// Protected rule: it is only invoked from other lexer rules (NUM in this grammar), never
// tried on its own by nextToken(). Matches '?' followed by one character that is NOT one
// of ? : + ' , — i.e. a '?' that does not start any sequence accepted by ESCAPED.
protected
WRONGLY_ESCAPED: '?' ~('?' | ':' | '+' | '\'' | ',')
{
//System.out.println("Found wrong_escaped: " + getText());
}
;
// Protected rule (only invoked from other lexer rules): '?' followed by one of , ? ' : +.
// Each branch calls $setText with just the second character, so the leading '?' is meant
// to be dropped from the text.
// NOTE(review): confirm how $setText behaves when ESCAPED is matched repeatedly inside
// the (...)+ loop of NUM's ANUM alternative.
protected
ESCAPED: '?'
( ',' {$setText(","); }
| '?' {$setText("?"); }
| '\'' {$setText("'"); }
| ':' {$setText(":"); }
| '+' {$setText("+"); }
)
{
//System.out.println("Found escaped: " + getText());
}
;
// Line break in any of the three conventions; the lexer's line bookkeeping is updated via
// newline() and the token is discarded (Token.SKIP) so the parser never sees it.
NEWLINE
    : ( "\r\n"  // DOS
      | '\r'    // MAC
      | '\n'    // Unix
      )
      {
          newline();
          $setType(Token.SKIP);
      }
    ;
非常感谢任何帮助:)。
这可能不是最好的解决方案,但我终于找到了解决问题的方法。 所以,如果有人遇到类似问题,这是我的解决方案:
我写了一个方法,如果当前标记类型匹配我的任何关键字,它将标记类型更改为 ANUM:
/**
 * Retypes the next lookahead token to ANUM when its current type is contained in
 * mKeywordList; any other token is left untouched.
 *
 * @throws TokenStreamException declared because LT(1) may have to fetch a token
 * @throws SemanticException declared by the original signature
 */
void ckt() throws TokenStreamException, SemanticException {
    final boolean isKeyword = mKeywordList.contains(LT(1).getType());
    if (!isKeyword) {
        return;
    }
    LT(1).setType(ANUM);
}
在解析器规则中,每次尝试访问 ANUM 标记之前都会调用该方法:
xyz: "XYZ" ELT_SEP!
{ckt();}(xyz1_1a:ANUM|xyz1_1b:NUM) {lq(90,xyz1_1a,xyz1_1b,"XYZ1-1"+LQ90)}? ELT_SEP!
{ckt();}(xyz1_2a:ANUM|xyz1_2b:NUM)? {lq_(90,xyz1_2a,xyz1_2b,"XYZ1-2"+LQ90)}? ELT_SEP!
{ckt();}(xyz1_3a:ANUM|xyz1_3b:NUM)? {lq_(90,xyz1_3a,xyz1_3b,"XYZ1-3"+LQ90)}? ELT_SEP!
{ckt();}(xyz2a:ANUM|xyz2b:NUM)? {lq_(3,xyz2a,xyz2b,"XYZ2"+LQ3)}? ELT_SEP!
{ckt();}(xyz3a:ANUM|xyz3b:NUM)? {lq_(6,xyz3a,xyz3b,"XYZ3"+LQ6)}? ELT_SEP!
{ckt();}(xyz4a:ANUM|xyz4b:NUM) {lq(30,xyz4a,xyz4b,"XYZ4"+LQ30)}?
(ELT_SEP! {ckt();}(xyz5a:ANUM|xyz5b:NUM)?)? {lq_(46,xyz5a,xyz5b,"XYZ5"+LQ46)}? SEG_TERM!
{
if (skipNachricht()) return;
Xyz xyz = new Xyz();
xyz.xyz1_1 = getText(nn(xyz1_1a, xyz1_1b));
xyz.xyz1_2 = getText(nn(xyz1_2a, xyz1_2b));
xyz.xyz1_3 = getText(nn(xyz1_3a, xyz1_3b));
xyz.xyz2 = getText(nn(xyz2a, xyz2b));
xyz.xyz3 = getText(nn(xyz3a, xyz3b));
xyz.xyz4 = getText(nn(xyz4a, xyz4b));
xyz.xyz5 = getText(nn(xyz5a, xyz5b));
handleXyz(xyz);
}
;