BISON + FLEX 语法 - 为什么标记被连接在一起

BISON + FLEX grammar - why tokens are being concatenated together

我想了解为什么 BISON 按照以下规则连接两个标记

stmt:
  declaration                 { ... }
  | assignment                { ... }
  | exp                       { ... }
  | ID ';'  <-- this rule     { ...       
                                fprintf(stderr, "\n my id is '%s'", );
                                ... 

如果你检查输出就会明白我的意思。我 运行 我的解析器并将字符 ab; 输入程序。根据我的野牛语法,这应该被解析为 ID 后跟 ;。在某种程度上,这就是发生的事情。

但是,当我尝试使用规则 ID ';'</code> 变量时,程序向我输出 <code>ab; 而不是 ab

运行正在运行程序

ab;                                   <-- this my input to the program

#TOKEN 294[ID] yytext -> ab
Next token is token "identifier" (1.1: )
Shifting token "identifier" (1.1: )
Entering state 5
Reading a token:
#TOKEN 59[] yytext -> ;
Next token is token ';' (1.1: )
Shifting token ';' (1.1: )
Entering state 16
Reducing stack by rule 6 (line 133):
    = token "identifier" (1.1: )     <-- first token which is 'ab'
    = token ';' (1.1: )              <-- second token which is ';'

[stmt] 4:
 my id is 'ab;'                        <-- the issue! This should be 'ab' not 'ab;'   
ERROR: No such ID ab; found
-> $$ = nterm stmt (1.1: )
Stack now 0 1
Entering state 10
Reducing stack by rule 2 (line 126):
    = nterm prog (1.1: )
    = nterm stmt (1.1: )
-> $$ = nterm prog (1.1: )
Stack now 0
Entering state 1
Reading a token:

语法

%{
#include <stdio.h>
#include <string>
#include <map>
#include <math.h>
#include "noname-parse.h"
#include "noname-types.h"

extern int yylex(void);
extern void yyerror(const char *error_msg);
extern void division_by_zero(YYLTYPE &yylloc);


std::map<std::string, symrec*> symbol_table;
std::map<std::string, symrec*>::iterator symbol_table_it;
%}

//////////////////////////////////////////////////
///////////* Bison declarations.  *///////////////
//////////////////////////////////////////////////

%union {

  char* id_v;
  double double_v;
  long long_v;

  symrecv symrecv;
  char* error_msg;
};

%{

  bool symbol_exist(const char* key) {
    std::string skey = key;
    symbol_table_it = symbol_table.find(skey);
    return  (symbol_table_it != symbol_table.end());
  }

  void symbol_insert(const char* key, symrecv symrecv) {
    std::string skey = key;
    symbol_table[skey] = symrecv;
  }

  symrecv symbol_retrieve(const char* key) {
    std::string skey = key;
    return symbol_table[skey];
  }

  void print_stmt(symrecv sym) {

    if (sym->type == TYPE_LONG) {
      fprintf(stderr, "%d", sym->value.intv);

    } else if (sym->type == TYPE_DOUBLE) {
      fprintf(stderr, "%lf", sym->value.doublev);

    } else {
      fprintf(stderr, "print not implemented for type %d", sym->type);
    }
  }
%}

%token LINE_BREAK            "line_break"             
// %token ';'              "stmt_sep"           
%token LETTER                "letter"         
%token DIGIT                 "digit"         
%token DIGITS                "digits"         
%token DARROW                "darrow"         
%token ELSE                  "else"       
%token FALSE                 "false"         
%token IF                    "if"     
%token IN                    "in"     
%token LET                   "let"       
%token LOOP                  "loop"       
%token THEN                  "then"       
%token WHILE                 "while"         
%token BREAK                 "break"         
%token CASE                  "case"       
%token NEW                   "new"       
%token NOT                   "not"       
%token TRUE                  "true"       
%token NEWLINE               "newline"           
%token NOTNEWLINE            "notnewline"             
%token WHITESPACE            "whitespace"             
%token LE                    "le"     
%token ASSIGN                "assign"         
%token NULLCH                "nullch"         
%token BACKSLASH             "backslash"             
%token STAR                  "star"       
%token NOTSTAR               "notstar"           
%token LEFTPAREN             "leftparen"             
%token NOTLEFTPAREN          "notleftparen"               
%token RIGHTPAREN            "rightparen"             
%token NOTRIGHTPAREN         "notrightparen"                 
%token LINE_COMMENT          "line_comment"               
%token START_COMMENT         "start_comment"                 
%token END_COMMENT           "end_comment"               
%token QUOTES                "quotes"         
%token ERROR                 "error"

%token <id_v> ID             "identifier"
%token <double_v> DOUBLE     "double"
%token <long_v> LONG         "long"
%type  <symrecv> assignment  "assignment"
%type  <symrecv> declaration "declaration"
%type  <symrecv> exp         "expression"
%type  <symrecv> stmt        "statement"

%left '-' '+'
%left '*' '/'
%left LET ID 
%right '^'        /* exponentiation */
%precedence NEG   /* negation--unary minus */

%start prog

%% 

//////////////////////////////////////////////////
///////////* The grammar follows. *///////////////
//////////////////////////////////////////////////

prog:
  %empty
  | prog stmt
;

stmt:
  declaration        { fprintf(stderr, "\n[stmt] 2: "); print_stmt(); }
  | assignment       { fprintf(stderr, "\n[stmt] 3: "); print_stmt(); }
  | exp              { fprintf(stderr, "\n[stmt] 1: "); print_stmt(); }
  | ID ';'           { fprintf(stderr, "\n[stmt] 4: "); 

    fprintf(stderr, "\n my id is '%s'", );

    $$ = (symrec *) malloc (sizeof (symrec));

    if (!symbol_exist()) {

      char buf[1024];
      sprintf(buf, "No such ID %s found", );
      yyerror(buf);

    } else {

      $$->name = ;
      $$->value.doublev = symbol_retrieve()->value.doublev;
      printf("\nID %s -> %lf", , $$->value.doublev);
    }
  }
  | error            { printf("%d:%d", @1.first_column, @1.last_column); }
;

assignment:
  ID ASSIGN exp ';' {

    $$ = (symrec *) malloc (sizeof (symrec));

    if (!symbol_exist()) {

      char buf[1024];
      sprintf(buf, "No such ID %s found", );
      yyerror(buf);

    } else {

      $$->name = ;
      $$->type = ->type;
      $$->value.doublev = ->value.doublev;
      symbol_insert(, $$);
      // printf("\nID %s -> %lf", , $$->value.doublev);
      printf("\n[assignment]");
    }
  }
  | LET ID ASSIGN exp ';' {

    $$ = (symrec *) malloc (sizeof (symrec));

    if (symbol_exist()) {

      char buf[1024];
      sprintf(buf, "Cannot redefine ID %s", );
      yyerror(buf);

    } else {

      $$->name = ;
      $$->type = ->type;
      $$->value.doublev = ->value.doublev;
      symbol_insert(, $$);
      // printf("\nID %s -> %lf", , $$->value.doublev);
      printf("\n[assignment]");
    }
  }
;

declaration:
  LET ID ';' {

    $$ = (symrec *) malloc (sizeof (symrec));

    if (symbol_exist()) {

      char buf[1024];
      sprintf(buf, "Cannot redefine ID %s", );
      yyerror(buf);

    } else {

      $$->name = ;
      // $$->type = ->type == TYPE_DOUBLE || ->type == TYPE_DOUBLE ? TYPE_DOUBLE : ->type;
      symbol_insert(, $$);
      // $$->value.doublev = symbol_table_it->second->value.doublev;
      // printf("\nID %s -> %lf", , $$->value.doublev);
      printf("\n[declaration]");
    }
  }
;

exp:
  LONG {
    $$ = (symrec *) malloc (sizeof (symrec));
    $$->name = (char*) "__annon";
    $$->type = TYPE_LONG;
    $$->value.intv = ;
    printf("\nexp %ld", );
  }
  | DOUBLE {
    $$ = (symrec *) malloc (sizeof (symrec));
    $$->name = (char*) "__annon";
    $$->type = TYPE_DOUBLE;
    $$->value.doublev = ;
    printf("\nexp %lf", );
  }
  | exp '+' exp        {
      // $$ =  + ;
      $$ = (symrec *) malloc (sizeof (symrec));
      $$->name = (char*) "__annon";
      $$->type = ->type == TYPE_DOUBLE || ->type == TYPE_DOUBLE ? TYPE_DOUBLE : ->type;
      $$->value.doublev = ->value.doublev + ->value.doublev;
      printf("\nexp + exp %lf %lf", ->value.doublev, ->value.doublev);
    }
  | exp '-' exp        {
      // $$ =  - ;
      $$ = (symrec *) malloc (sizeof (symrec));
      $$->name = (char*) "__annon";
      $$->type = ->type == TYPE_DOUBLE || ->type == TYPE_DOUBLE ? TYPE_DOUBLE : ->type;
      $$->value.doublev = ->value.doublev - ->value.doublev;
      printf("\nexp - exp %lf %lf", ->value.doublev, ->value.doublev);
    }
  | exp '*' exp        {
      // $$ =  * ;
      $$ = (symrec *) malloc (sizeof (symrec));
      $$->name = (char*) "__annon";
      $$->type = ->type == TYPE_DOUBLE || ->type == TYPE_DOUBLE ? TYPE_DOUBLE : ->type;
      $$->value.doublev = ->value.doublev * ->value.doublev;
      printf("\nexp * exp %lf %lf", ->value.doublev, ->value.doublev);
    }
  | exp '/' exp {
      $$ = (symrec *) malloc (sizeof (symrec));
      $$->name = (char*) "__annon";
      $$->type = ->type == TYPE_DOUBLE || ->type == TYPE_DOUBLE ? TYPE_DOUBLE : ->type;

      if (->value.doublev) {
        // $$ =  / ;
        $$->value.doublev = ->value.doublev / ->value.doublev;
      } else {
        // $$ = ;
        $$->value.doublev = ->value.doublev;
        division_by_zero(@3);
      }
      printf("\nexp / exp %lf %lf", ->value.doublev, ->value.doublev);
    }
  | '-' exp  %prec NEG {
      /**
        * The %prec simply instructs Bison that the rule ‘| '-' exp’ 
        * has the same precedence as NEG—in this case the next-to-highest
        */
      // $$ = -(->value.doublev);
      $$ = (symrec *) malloc (sizeof (symrec));
      $$->name = (char*) "__annon";
      $$->type = ->type;
      $$->value.doublev = -->value.doublev;
      printf("\nexp ^ exp %lf", ->value.doublev);
    }
  | exp '^' exp        {
      //$$ = pow(->value.doublev, ->value.doublev);
      $$ = (symrec *) malloc (sizeof (symrec));
      $$->name = (char*) "__annon";
      $$->type = ->type;
      $$->value.doublev = pow(->value.doublev, ->value.doublev);
      printf("\nexp ^ exp %lf %lf", ->value.doublev, ->value.doublev);
    }
  | '(' exp ')'        {
      // $$ = ->value.doublev;
      $$ = (symrec *) malloc (sizeof (symrec));
      $$->name = (char*) "__annon";
      $$->type = ->type;
      $$->value.doublev = ->value.doublev;
      printf("\n(exp) %lf", ->value.doublev);
    }
  | error                 { printf("\nERROR on exp rule"); }
  ;
%%

词法分析器

%{
  #include "stdio.h"
  #include "stdlib.h"
  #include "lexer-utilities.h"
  #include "noname-parse.h"
  #include "noname-types.h"

  int num_lines = 0, num_chars = 0;
  extern YYSTYPE yylval;
  extern void yyerror(char const *s);

  extern int curr_lineno;
  extern int verbose_flag;

  unsigned int comment = 0;
%}

%option noyywrap 
  // %option noyywrap nounput batch debug yylineno
  // %option warn noyywrap nodefault yylineno reentrant bison-bridge 

%x COMMENT
%x STRING

LINE_BREAK      \n
LETTER          [a-zA-Z]
ALPHA           [a-zA-Z$_]
DIGIT           [0-9]
DIGITS          {DIGIT}+
LONG            {DIGIT}+
DOUBLE          {DIGIT}+(\.{DIGIT}+)?
ID              {ALPHA}({ALPHA}|{DIGIT})*

ELSE            [eE][lL][sS][eE]
FALSE           f[aA][lL][sS][eE]
IF              [iI][fF]
IN              [iI][nN]
LET             [lL][eE][tT]
LOOP            [lL][oO][oO][pP]
THEN            [tT][hH][eE][nN]
WHILE           [wW][hH][iI][lL][eE]
BREAK           [bB][rR][eE][aA][kK]
CASE            [cC][aA][sS][eE]
NEW             [nN][eE][wW]
NOT             [nN][oO][tT]
TRUE            t[rR][uU][eE]
NEWLINE         [\n]
NOTNEWLINE      [^\n]
WHITESPACE      [ \t\r\f\v]+
ASSIGN          =
LE              <=
DARROW          =>
NULLCH          [[=13=]]
BACKSLASH       [\]
STAR            [*]
NOTSTAR         [^*]
LEFTPAREN       [(]
NOTLEFTPAREN    [^(]
RIGHTPAREN      [)]
NOTRIGHTPAREN   [^)]

LINE_COMMENT    "--"
START_COMMENT   "/*"
END_COMMENT     "*/"

QUOTES          \"


%%

{LINE_BREAK}                    {
                                  ++num_chars;
                                  ++num_lines;
                                }

{START_COMMENT} {
  comment++;
  BEGIN(COMMENT);
}

<COMMENT><<EOF>> {
  yylval.error_msg = "EOF in comment";
  BEGIN(INITIAL);
  return (ERROR);
}

<COMMENT>{BACKSLASH}(.|{NEWLINE}) {
  backslash_common();
};

<COMMENT>{BACKSLASH}               ;

<COMMENT>{START_COMMENT} {
  comment++;
}

<COMMENT>{END_COMMENT} {
  comment--;
  if (comment == 0) {
    BEGIN(INITIAL);
  }
}

<COMMENT>.                      { ++num_chars; }

<INITIAL>{END_COMMENT} {
  yylval.error_msg = "Unmatched */";
  return (ERROR);
}

<*>{WHITESPACE}                  { ++num_chars; }
<INITIAL>{ASSIGN}                { return (ASSIGN); }
<INITIAL>{ELSE}                  { return (ELSE); }
<INITIAL>{IF}                    { return (IF); }
<INITIAL>{IN}                    { return (IN); }
<INITIAL>{LET}                   { return (LET); }
<INITIAL>{THEN}                  { return (THEN); }
<INITIAL>{WHILE}                 { return (WHILE); }
<INITIAL>{CASE}                  { return (CASE); }
<INITIAL>{NEW}                   { return (NEW); }
<INITIAL>{NOT}                   { return (NOT); }
<INITIAL>{ID}      {
  yylval.id_v = yytext;
  return (ID); }
<INITIAL>{LONG}     {
  yylval.long_v = atoi(yytext);
  return (LONG); }
<INITIAL>{DOUBLE}  {
  yylval.double_v = atof(yytext);
  return (DOUBLE); }

<INITIAL>","                     { return int(','); }
<INITIAL>":"                     { return int(':'); }
<INITIAL>"{"                     { return int('{'); }
<INITIAL>"}"                     { return int('}'); }
<INITIAL>"+"                     { return int('+'); }
<INITIAL>"-"                     { return int('-'); }
<INITIAL>"*"                     { return int('*'); }
<INITIAL>"/"                     { return int('/'); }
<INITIAL>"<"                     { return int('<'); }
<INITIAL>"~"                     { return int('~'); }
<INITIAL>"."                     { return int('.'); }
<INITIAL>"@"                     { return int('@'); }
<INITIAL>"("                     { return int('('); }
<INITIAL>")"                     { return int(')'); }
<INITIAL>"&"                     { return int('&'); }
<INITIAL>";"                     { return int(';'); }

<INITIAL>. {
    printf("lexer error '%s'", yytext);
    yylval.error_msg = yytext; return 0; 
  }

%%

这个伸缩动作不正确:

  yylval.id_v = yytext;

yytext 指向内部工作缓冲区。每次调用扫描仪时,其内容都会发生变化。所以如果你想保留组成令牌的字符串,你必须将字符串复制到你自己的存储中,例如使用 strdup。 (完成后不要忘记释放分配的存储空间。)