字符串文字中换行符的换行计数器 [更新]

Question

注意：这是我之前的同一个问题，但我将其简化为所需的结构。如果仍然不好看，请告诉我格式有什么问题，以供日后参考。

我有一个课程项目，需要为 TIPS 编写一个词法分析器。我在统计显示在屏幕左侧的行号时出了问题。目前有一条正则表达式规则，每当遇到 \n 或 \r 时就递增 line_number 变量。我的问题是它不会统计字符串中的换行符。程序能够按预期显示带换行的字符串，但不会增加计数器。

期望的输出：

如上段所述，我需要换行计数器来识别字符串文字中的换行符号。
我需要最后的行号为 20，如预期输出所示，而不是 17，如下面的实际输出所示。

到目前为止我做了什么：

如果在字符串中找到 \n 或 \r，则在字符串文字正则表达式中添加额外的 if 语句，然后增加行计数器。
将行计数器正则表达式放在规则文件中的不同位置，以确保它不会被另一个正则表达式覆盖。

错误：我没有收到任何错误。

以下是我在这个问题中引用的相关正则表达式的片段。另外，问题的底部是实际和预期的输出。

来自规则文件的片段

 /* STRING LITERAL REGEX
  * A single-quoted literal whose body is any mix of
  *   - characters other than ' and \ (the negated class also matches a raw
  *     newline, so a literal may span several lines), or
  *   - a backslash followed by any character, including a newline.
  * The whole lexeme, quotes included, must be at most 80 characters,
  * otherwise it is reported as TOK_UNKNOWN.
  * This action does not modify line_number. */
[']([^'\\]|\\(.|\n))*[']       { if(yyleng <= 80)
                                    {
                                        return TOK_STRINGLIT; 
                                    }
                                    else
                                    {
                                        return TOK_UNKNOWN;
                                    }
                                }

 /* REGEX TO COUNT NEW LINES
  * Fires only for a \r or \n that is matched as a token of its own. */
[\r\n]         { line_number++; }

预期输出

line: 16, lexeme: |'This string
has
newlines
inside of it'|, length: 43, token: 4003
line: 20, lexeme: |&|, length: 1, token: 6000
ERROR: unknown token

实际输出

line: 16, lexeme: |'This string
has
newlines 
inside of it'|, length: 43, token: 4003
line: 17, lexeme: |&|, length: 1, token: 6000 
ERROR: unknown token

可复制文件

LEXER.H

//*****************************************************************************
// CSE 4713 / 6713 Project - List of tokens for TIPS
//*****************************************************************************
// Integer token codes shared by the scanner rules and the driver.
// Codes are grouped by category, each category starting at its own base value.

#ifndef LEXER_H
#define LEXER_H

// List of token codes

// Keywords (1000-series)
#define TOK_BEGIN    1000
#define TOK_BREAK    1001
#define TOK_CONTINUE 1002
#define TOK_DOWNTO   1003
#define TOK_ELSE     1004
#define TOK_END      1005
#define TOK_FOR      1006
#define TOK_IF       1007
#define TOK_LET      1008
#define TOK_PROGRAM  1009
#define TOK_READ     1010
// NOTE(review): 1011 is not assigned to any keyword here - confirm the gap is intentional
#define TOK_THEN     1012
#define TOK_TO       1013
#define TOK_VAR      1014
#define TOK_WHILE    1015
#define TOK_WRITE    1016

// Datatype Specifiers (1100-series)
#define TOK_INTEGER  1100
#define TOK_REAL     1101

// Punctuation (2000-series)
#define TOK_SEMICOLON  2000
#define TOK_COLON      2001
#define TOK_OPENPAREN  2002
#define TOK_CLOSEPAREN 2003
#define TOK_OPENBRACE  2004
#define TOK_CLOSEBRACE 2005

// Operators (3000-series)
#define TOK_PLUS        3000
#define TOK_MINUS       3001
#define TOK_MULTIPLY    3002
#define TOK_DIVIDE      3003
#define TOK_ASSIGN      3004
#define TOK_EQUALTO     3005
#define TOK_LESSTHAN    3006
#define TOK_GREATERTHAN 3007
#define TOK_NOTEQUALTO  3008
#define TOK_MOD         3009
#define TOK_NOT         3010
#define TOK_OR          3011
#define TOK_AND         3012

// Useful abstractions (literals, identifiers and scanner status codes)
#define TOK_IDENT       4000  // identifier
#define TOK_INTLIT      4001  // integer literal
#define TOK_FLOATLIT    4002  // floating point literal
#define TOK_STRINGLIT   4003  // string literal
#define TOK_EOF         5000  // end of file
#define TOK_EOF_SL      5001  // end of file while parsing a string literal
#define TOK_UNKNOWN     6000  // unknown lexeme

#endif

DRIVER.CPP

//*****************************************************************************
// CSE 4713 / 6713 Project Part 1 - Lexical Analyzer Driver
// Fall 2020
//*****************************************************************************

#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#endif

#include <stdio.h>
#include "lexer.h"


// Declare the scanner's globals with C linkage; nothing is defined in this
// file (line_number is defined in rules.l)
extern "C"
{
extern FILE *yyin;         // input stream
extern FILE *yyout;        // output stream
extern int   yyleng;       // length of current lexeme
extern char *yytext;       // text of current lexeme
extern int   yylex();      // the generated lexical analyzer

extern int   line_number;  // current line number of the input
}

// Do the analysis: run the scanner over the chosen input and report every token.
//
// argv[1], when present, names the input file; otherwise stdin is scanned.
// Each token is printed with the current line_number, its lexeme, its length
// and its token code, followed by an error line for TOK_UNKNOWN / TOK_EOF_SL.
// Returns 0 once TOK_EOF is seen, or -1 if the input file cannot be opened.
int main( int argc, char* argv[] ) {
  int   token;               // hold each token code
  FILE *opened_file = NULL;  // file opened here; stays NULL when reading stdin

  // Set the input stream
  if (argc > 1) {
    printf("INFO: Using the file %s for input\n", argv[1]);
    opened_file = fopen(argv[1], "r");
    if (!opened_file) {
      printf("   ERROR: input file not found\n");
      return (-1);
    }
    yyin = opened_file;
  }
  else {
    printf("INFO: Using stdin for input, use EOF to end input\n");
    printf("      Windows EOF is Ctrl+z, Linux EOF is Ctrl+d\n");
    yyin = stdin;
  }

  // Set the output stream
  yyout = stdout;
  
  // Do the lexical parsing
  token = yylex();
  while( token != TOK_EOF ) 
  {
    // What did we find?
    // line_number is read after yylex() returns, i.e. its value once the
    // whole lexeme has been scanned.
    fprintf(yyout, "line: %d, lexeme: |%s|, length: %d, token: %d\n", 
                        line_number, yytext, yyleng, token);
    
    // Is it an error?
    if( token == TOK_UNKNOWN )
      fprintf(yyout,"   ERROR: unknown token\n");
    if( token == TOK_EOF_SL )
      fprintf(yyout,"   ERROR: end of file while in a string literal\n");
    
    // Get the next token
    token = yylex();
  }

  // Release the input file if this function opened one (never close stdin)
  if (opened_file)
    fclose(opened_file);

  return 0;
}

RULES.L

/******************************************************************* 
Starting point your rules.l file for TIPS
Name: Stephanie Schisler                NetID: sas880
Course: CSE 4713                        Assignment: Part 1
Programming Environment: WSL C++
Purpose of File: Contains the rules for the project.
*******************************************************************/
%option noyywrap
%{
#include "lexer.h"

// global variable to hold current line number being read
int line_number = 1;

%}

%%

 /* Keywords */ 
BEGIN           { return TOK_BEGIN; }
BREAK           { return TOK_BREAK; }
CONTINUE        { return TOK_CONTINUE; }
DOWNTO          { return TOK_DOWNTO; }
ELSE            { return TOK_ELSE; }
END             { return TOK_END; }
FOR             { return TOK_FOR; }
IF              { return TOK_IF; }
LET             { return TOK_LET; }
PROGRAM         { return TOK_PROGRAM; }
READ            { return TOK_READ; }
THEN            { return TOK_THEN; }
TO              { return TOK_TO; }
VAR             { return TOK_VAR; }
WHILE           { return TOK_WHILE; }
WRITE           { return TOK_WRITE; }

 /* Datatype Specifiers */
INTEGER         { return TOK_INTEGER; }
REAL            { return TOK_REAL; }

 /* Punctuation */
\;           { return TOK_SEMICOLON; }
\:           { return TOK_COLON; }
\(          { return TOK_OPENPAREN; }
\)          { return TOK_CLOSEPAREN; }
\{          { return TOK_OPENBRACE; }
\}          { return TOK_CLOSEBRACE; }

 /* Operators */
\+          { return TOK_PLUS; }
-           { return TOK_MINUS; }
\*          { return TOK_MULTIPLY; }
\/          { return TOK_DIVIDE; }
\:=          { return TOK_ASSIGN; }
\=           { return TOK_EQUALTO; }
\<           { return TOK_LESSTHAN; }
\>           { return TOK_GREATERTHAN; }
\<>          { return TOK_NOTEQUALTO; }
MOD         { return TOK_MOD; }
NOT         { return TOK_NOT; }
OR          { return TOK_OR; }
AND         { return TOK_AND; }

 /* Abstractions */
[A-Z][0-9A-Z]{0,7}           { return TOK_IDENT; }       
[0-9]+                       { return TOK_INTLIT; }      
[0-9]+[.]?[0-9]+             { return TOK_FLOATLIT; }

 /* String literal: a single-quoted run of characters other than ' and \
  * (the negated class also matches raw newlines, so a literal may span
  * lines), or a backslash followed by any character including a newline.
  * Lexemes longer than 80 characters, quotes included, are TOK_UNKNOWN.
  * This action does not modify line_number. */
[']([^'\\]|\\(.|\n))*[']       { if(yyleng <= 80)
                                    {
                                        return TOK_STRINGLIT; 
                                    }
                                    else
                                    {
                                        return TOK_UNKNOWN;
                                    }
                                }

 /* NOTE(review): the pattern below appears to have lost backslashes in
  * transcription; its intended form cannot be determined from here, so it
  * is left unchanged - confirm against the original rules.l */
"\[[^"\]|\(.|\n)]*|'\[[^'\]|\(.|\n)]*            { return TOK_EOF_SL; }

 /* Count new lines: fires only for a \r or \n matched as its own token */
[\r\n]         { line_number++; }

 /* Eat any whitespace */
[\t ]*

 /* Found an unknown character */

.         { return TOK_UNKNOWN; }

 /* Recognize end of file */

<<EOF>>   { return TOK_EOF; }

输入文件


ABCDEFGH
ABCDEFGHIJ
AB123 123AB A123ZZ  SUM  IFFINESS
AB_123
ab_123
123 
3219012894910
12.132 
.123
0.132
-123 -12.324
%%%%%%%
^
'This is a string'
'This string    has tabs        inside of it.'
'This string
 has 
 newlines
 inside of it'
&

正确的输出


ABCDEFGH
ABCDEFGHIJ
AB123 123AB A123ZZ  SUM  IFFINESS
AB_123
ab_123
123 
3219012894910
12.132 
.123
0.132
-123 -12.324
%%%%%%%
^
'This is a string'
'This string    has tabs        inside of it.'
'This string
 has 
 newlines
 inside of it'
&

Makefile

###############################################################################
# CSE 4713 / 6713 Project Part 1 - Lexical Analyzer (flex)
#
# 'make'        build executable file
# 'make clean'  removes all intermediate (lex.yy.c and *.o) and executable files
#
# This makefile purposely avoids macros to make the rules more clear.
# For more information about makefiles:
#      http://www.cs.colby.edu/maxwell/courses/tutorials/maketutor/
#      http://www.cs.swarthmore.edu/~newhall/unixhelp/howto_makefiles.html
#      http://www.gnu.org/software/make/manual/make.html
#
# NOTE: every recipe line must begin with a TAB character, not spaces.
###############################################################################

# 'clean' names an action, not a file, so it must always run when requested
.PHONY: clean

# Link the scanner and the driver into the executable
lex.exe: lex.yy.o driver.o
	g++ -g -o lex.exe lex.yy.o driver.o

# Compile the C++ driver
driver.o: driver.cpp lexer.h
	g++ -g -o driver.o -c driver.cpp

# Compile the flex-generated scanner as C
lex.yy.o: lex.yy.c lexer.h
	gcc -g -o lex.yy.o -c lex.yy.c

# Generate the scanner source from the flex rules
lex.yy.c: rules.l lexer.h
	flex -o lex.yy.c rules.l

clean: 
	$(RM) *.o lex.yy.c lex.exe

Answer 1

你的行计数器没有被字符串中的换行符增加，因为你对字符串模式的操作没有改变行计数器。

(F)lex 词法分析器根据您提供的模式将输入划分为标记，并针对每个标记执行关联的操作。模式不会在其他标记的内部进行匹配：那样会导致混乱。（例如，tiffany 并不包含 if 标记，它是一个不可分割的标识符。）

从使用 flex 构建的词法分析器中获取准确行数的最简单方法是包含选项

 %option yylineno

在您的序言中（即 flex 输入文件中第一个 %% 之前的部分）。一旦这样做，flex 就会为您处理一切，yylineno 将始终包含当前的行号。（它保存的是标记结束处的行号，这一点很重要：如果您想知道多行标记从哪一行开始，还需要多做一点（非常少的）工作。）

可能有人告诉您不要使用 yylineno。（就个人而言，我认为这样的作业限制是错误的，但我和讲师们的看法并不总是一致。）如果是这种情况，您需要自己完成 flex 本来会自动为您做的事情，即重新扫描任何可能包含换行符的标记，统计其中换行符的个数（如果有的话）：

[']([^'\\]|\\(.|\n))*[']       { /* Rescan the matched lexeme and count every newline it contains */
                                 for (const char* p = yytext; *p; ++p) {
                                   if (*p == '\n') ++line_number;
                                 }
                                 /* Rest of the string action */ 
                                 ...

或者，您也可以使用开始条件（start condition）来处理字符串字面量。例如，您可以将该代码更改为：

%x STRING_LITERAL
%%
[']                  { /* Opening quote: keep it in yytext and switch to the string state */
                       yymore(); BEGIN(STRING_LITERAL); }
<STRING_LITERAL>{
  [^'\\\n]+         { /* Ordinary characters: anything except quote, backslash, newline */
                       yymore(); }
  \\?\r?\n          { /* A line ending, optionally preceded by a backslash and/or \r */
                       ++line_number; yymore(); }
  \\.               { /* Backslash escape of any non-newline character */
                       yymore(); }
  [']                { BEGIN(INITIAL); /* Return to the normal scan */
                       /* At this point, yytext and yyleng refer to the
                        * entire token, including the ' marks. So you can
                        * now do exactly what you did in your string literal
                        * action. But see below for some comments.
                        */
                        ...
                      }
}
   /* Other lexical rules continue here */

查看 flex manual section on start conditions 以获取有关启动条件如何工作的详细说明。

请注意，除了最后一个实际接受令牌的操作外，字符串令牌中的所有操作都包含对 yymore() 的调用。 yymore() 是一个特殊的 (f)lex 动作，它告诉分析器令牌尚未完成，下一个模式将匹配当前令牌的另一部分。

关于您的代码的一些旁注：

您的行尾模式 [\r\n] 匹配单个 \r 或单个 \n。Windows 的行结尾实际上是双字符序列 \r\n，所以如果遇到这样的行结尾，line_number 会被递增两次。（不过，您不太可能遇到这种情况，除非您以二进制模式打开输入文件，因为在文本模式下标准库在读取该行时应该会去掉 \r。）同时匹配 Windows 和 Unix 行尾序列的正确模式是
```
 \r?\n
```
这将匹配整个 line-ending。有时您会看到更精细的图案，如果您的讲师关心“永不再见的历史文物博物馆”，他们可能会要求您处理本世纪未曾使用过的约定，例如 pre-OS-X Apple 约定只使用 \r。我的建议是不要接受它。只需计算 \n s，无论它们前面是否有 \r。这在您可能遇到的任何系统上都是正确的。
您对过长字符串的测试不精确（我想我们已经在另一个问题中提到了这个事实）。首先，它把引号也计入了字符串文字的长度，这可能是不正确的。其次，它按转义序列本身的长度来计数，而不是按转义序列最终表示的那一个字符来计数。这将导致 a 被计为一个字符，而语义上等价的 \x61 却被计为四个字符，很可能使完全有效的、长度不足 80 个字符的文字被您的词法扫描器拒绝。如果您使用上面建议的开始条件方案来统计行数，还可以通过单独维护一个多余字符计数来修正文字的计算长度。（您不能在标记内部更改 yyleng，因为 yymore() 依赖于它能够正确维护 yyleng 和 yytext。请参阅上面链接的手册。）

字符串文字中换行符的换行计数器 [更新]

New line counter for new line characters in string literals [Updated]

c++

regex

lexical-analysis

flex-lexer