如何处理 c 中的换行符和分隔符间距?
How do I handle the newline, and separator spacing in c?
实际的文本文件只是用来测试 lex 和解析的随机内容。上面的图片是结果,控制台在运行时给我。在绿色中,它在应该是换行符或分隔符时调用标识符,因此不需要任何东西。在红色中它不识别分隔符,在黄色中它根本不读取 something.something。我假设它与之前的 c 有关系;没有被正确分开。
所以我的问题是如何正确地分隔标记,并识别换行符,或者我做错了什么。下面是我用来进行分离和标记化的代码。
#define _CRT_SECURE_NO_WARNINGS
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#define BUFFER_SIZE 1024
// Returns 'true' if the character is a DELIMITER.
bool isDelimiter(char ch)
{
if (isspace (ch))
return (true);
return (false);
}
// Returns 'true' if the character is a SEPERATOR.
bool isSeperator(char str)
{
if (str == ',' || str == ';' || str == '>' ||
str == '<' || str == '(' || str == ')' || str == '[' || str == ']' ||
str == '{' || str == '}' || str == '.' )
return (true);
return (false);
}
// Returns 'true' if the character is an OPERATOR.
bool isOperator(char ch)
{
if (ch == '+' || ch == '-' || ch == '*' ||
ch == '/' || ch == '>' || ch == '<' ||
ch == '=')
return (true);
return (false);
}
// Returns 'true' if the string is a VALID IDENTIFIER.
bool validIdentifier(char* str)
{
if (str[0] == '0' || str[0] == '1' || str[0] == '2' ||
str[0] == '3' || str[0] == '4' || str[0] == '5' ||
str[0] == '6' || str[0] == '7' || str[0] == '8' ||
str[0] == '9' )
return (false);
return (true);
}
// Returns 'true' if the string is a KEYWORD.
bool isKeyword(char* str)
{
if (!strcmp(str, "if") || !strcmp(str, "else") ||
!strcmp(str, "while") || !strcmp(str, "do") ||
!strcmp(str, "break") || !strcmp(str, "elem") ||
!strcmp(str, "lout") || !strcmp(str, "file") ||
!strcmp(str, "console") || !strcmp(str, "read") ||
!strcmp(str, "write") || !strcmp(str, "mark") ||
!strcmp(str, "emblemnize") || !strcmp(str, "lin") ||
!strcmp(str, "send") || !strcmp(str, "dint") ||
!strcmp(str, "continue") || !strcmp(str, "int")
|| !strcmp(str, "double") || !strcmp(str, "float")
|| !strcmp(str, "return") || !strcmp(str, "char")
|| !strcmp(str, "case") || !strcmp(str, "char")
|| !strcmp(str, "sizeof") || !strcmp(str, "long")
|| !strcmp(str, "short") || !strcmp(str, "typedef")
|| !strcmp(str, "switch") || !strcmp(str, "unsigned")
|| !strcmp(str, "void") || !strcmp(str, "static")
|| !strcmp(str, "struct") || !strcmp(str, "goto"))
return (true);
return (false);
}
// Returns 'true' if the string is an INTEGER.
bool isInteger(char* str)
{
int i, len = strlen(str);
if (len == 0)
return (false);
for (i = 0; i < len; i++) {
if (str[i] != '0' && str[i] != '1' && str[i] != '2'
&& str[i] != '3' && str[i] != '4' && str[i] != '5'
&& str[i] != '6' && str[i] != '7' && str[i] != '8'
&& str[i] != '9' || (str[i] == '-' && i > 0))
return (false);
}
return (true);
}
// Returns 'true' if the string is a REAL NUMBER.
bool isRealNumber(char* str)
{
int i, len = strlen(str);
bool hasDecimal = false;
if (len == 0)
return (false);
for (i = 0; i < len; i++) {
if (str[i] != '0' && str[i] != '1' && str[i] != '2'
&& str[i] != '3' && str[i] != '4' && str[i] != '5'
&& str[i] != '6' && str[i] != '7' && str[i] != '8'
&& str[i] != '9' && str[i] != '.' ||
(str[i] == '-' && i > 0))
return (false);
if (str[i] == '.')
hasDecimal = true;
}
return (hasDecimal);
}
// Extracts the SUBSTRING.
char* subString(char* str, int left, int right)
{
int i;
char* subStr = (char*)malloc(sizeof(char) * (right - left + 2));
for (i = left; i <= right; i++)
subStr[i - left] = str[i];
subStr[right - left + 1] = '[=10=]';
return (subStr);
}
// Parsing the input STRING.
void parse(char* str)
{
int left = 0, right = 0;
int len = strlen(str);
while (right <= len && left <= right)
{
if (isDelimiter(str[right]) == false)
right++;
if (isDelimiter(str[right]) == true && left == right)
{
if (isOperator(str[right]) == true)
printf("'%c' IS A OPERATOR\n", str[right]);
right++;
left = right;
}
if (isDelimiter(str[right]) == true && left == right)
{
if (isDelimiter(str[right]) == true)
printf("'%c' IS A DELIMITER\n", str[right]);
right++;
left = right;
}
if (isSeperator(str[right]) == true && left == right)
{
//needed to recognize seperator to the right
if (isSeperator(str[right]) == true)
printf("'%c' IS A SEPERATOR\n", str[right]);
right++;
left = right;
//needed to recognize seperator to the left
if (isSeperator(str[right]) == true)
printf("'%c' IS A SEPERATOR\n", str[left]);
right++;
left = right;
}
else if (isDelimiter(str[right]) == true && left != right
|| (right == len && left != right)) {
char* subStr = subString(str, left, right - 1);
if (isKeyword(subStr) == true)
printf("'%s' IS A KEYWORD\n", subStr);
else if (isInteger(subStr) == true)
printf("'%s' IS AN INTEGER\n", subStr);
else if (isRealNumber(subStr) == true)
printf("'%s' IS A REAL NUMBER\n", subStr);
else if (validIdentifier(subStr) == true
&& isDelimiter(str[right - 1]) == false
&& isSeperator(str[right - 1]) == false)
printf("'%s' IS A VALID IDENTIFIER\n", subStr);
left = right;
}
}
return;
}
int main(int argc, char *argv)
{
/* declare a file pointer */
FILE *file;
char *buffer;
long numbytes;
/* open an existing file for reading */
file = fopen("Text.txt", "r");
/* quit if the file does not exist */
if (file == NULL)
return 1;
/* Get the number of bytes */
fseek(file, 0L, SEEK_END);
numbytes = ftell(file);
/* reset the file position indicator to
the beginning of the file */
fseek(file, 0L, SEEK_SET);
/* grab sufficient memory for the
buffer to hold the text */
buffer = (char*)calloc(numbytes, sizeof(char));
/* memory error */
if (buffer == NULL)
return 1;
/* copy all the text into the buffer */
fread(buffer, sizeof(char), numbytes, file);
/* confirm we have read the file by
outputing it to the console */
printf(" The file called Text.txt contains this text \n \n %s \n\n", buffer);
parse(buffer); // calling the parse function
fclose(file);
/* free the memory we used for the buffer */
free(buffer);
return 0;
}
看起来问题是您的 isDelimiter
函数没有获取所有可能的值。如果您将其更改为使用 isspace()
,它将匹配所有形式的空格。
bool isDelimiter(char ch)
{
if (isspace(ch))
return (true);
return (false);
}
例如,这是一个非常简单的状态机,可以让您了解我的意思。它可以处于两种状态之一 - INSIDE_IDENTIFIER 或 OUTSIDE_IDENTIFIER - 它会根据正在查看的字符类型在这两种状态之间切换。
#define OUTSIDE_IDENTIFIER (0)
#define INSIDE_IDENTIFIER (1)
void parse(char *str)
{
char *ch;
int state=OUTSIDE_IDENTIFIER;
char buffer[1000];
char *pos=buffer;
for(ch=str;*ch!='[=11=]';ch++)
{
switch(state)
{
case INSIDE_IDENTIFIER:
if(isOperator(*ch))
{
*pos='[=11=]';
printf("Identifier[%s]\n",buffer);
printf("Operator[%c]\n",*ch);
state=OUTSIDE_IDENTIFIER;
}
else if(isDelimiter(*ch))
{
*pos='[=11=]';
printf("Identifier[%s]\n",buffer);
printf("Delimiter[%c]\n",*ch);
state=OUTSIDE_IDENTIFIER;
}
else if(isspace(*ch))
{
*pos='[=11=]';
printf("Identifier[%s]\n",buffer);
printf("Space[%d]\n",*ch);
state=OUTSIDE_IDENTIFIER;
}
else
{
*pos=*ch;
pos++;
}
break;
case OUTSIDE_IDENTIFIER:
default:
if(isOperator(*ch))
{
printf("Operator[%c]\n",*ch);
}
else if(isDelimiter(*ch))
{
printf("Delimiter[%c]\n",*ch);
}
else if(isSeperator(*ch))
{
printf("Seperator[%c]\n",*ch);
}
else
{
state = INSIDE_IDENTIFIER;
pos=buffer;
*pos=*ch;
pos++;
}
break;
}
}
}
实际的文本文件只是用来测试 lex 和解析的随机内容。上面的图片是结果,控制台在运行时给我。在绿色中,它在应该是换行符或分隔符时调用标识符,因此不需要任何东西。在红色中它不识别分隔符,在黄色中它根本不读取 something.something。我假设它与之前的 c 有关系;没有被正确分开。
所以我的问题是如何正确地分隔标记,并识别换行符,或者我做错了什么。下面是我用来进行分离和标记化的代码。
#define _CRT_SECURE_NO_WARNINGS
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#define BUFFER_SIZE 1024
// Returns 'true' if the character is a DELIMITER.
bool isDelimiter(char ch)
{
if (isspace (ch))
return (true);
return (false);
}
// Returns 'true' if the character is a SEPERATOR.
bool isSeperator(char str)
{
if (str == ',' || str == ';' || str == '>' ||
str == '<' || str == '(' || str == ')' || str == '[' || str == ']' ||
str == '{' || str == '}' || str == '.' )
return (true);
return (false);
}
// Returns 'true' if the character is an OPERATOR.
bool isOperator(char ch)
{
if (ch == '+' || ch == '-' || ch == '*' ||
ch == '/' || ch == '>' || ch == '<' ||
ch == '=')
return (true);
return (false);
}
// Returns 'true' if the string is a VALID IDENTIFIER.
bool validIdentifier(char* str)
{
if (str[0] == '0' || str[0] == '1' || str[0] == '2' ||
str[0] == '3' || str[0] == '4' || str[0] == '5' ||
str[0] == '6' || str[0] == '7' || str[0] == '8' ||
str[0] == '9' )
return (false);
return (true);
}
// Returns 'true' if the string is a KEYWORD.
bool isKeyword(char* str)
{
if (!strcmp(str, "if") || !strcmp(str, "else") ||
!strcmp(str, "while") || !strcmp(str, "do") ||
!strcmp(str, "break") || !strcmp(str, "elem") ||
!strcmp(str, "lout") || !strcmp(str, "file") ||
!strcmp(str, "console") || !strcmp(str, "read") ||
!strcmp(str, "write") || !strcmp(str, "mark") ||
!strcmp(str, "emblemnize") || !strcmp(str, "lin") ||
!strcmp(str, "send") || !strcmp(str, "dint") ||
!strcmp(str, "continue") || !strcmp(str, "int")
|| !strcmp(str, "double") || !strcmp(str, "float")
|| !strcmp(str, "return") || !strcmp(str, "char")
|| !strcmp(str, "case") || !strcmp(str, "char")
|| !strcmp(str, "sizeof") || !strcmp(str, "long")
|| !strcmp(str, "short") || !strcmp(str, "typedef")
|| !strcmp(str, "switch") || !strcmp(str, "unsigned")
|| !strcmp(str, "void") || !strcmp(str, "static")
|| !strcmp(str, "struct") || !strcmp(str, "goto"))
return (true);
return (false);
}
// Returns 'true' if the string is an INTEGER.
bool isInteger(char* str)
{
int i, len = strlen(str);
if (len == 0)
return (false);
for (i = 0; i < len; i++) {
if (str[i] != '0' && str[i] != '1' && str[i] != '2'
&& str[i] != '3' && str[i] != '4' && str[i] != '5'
&& str[i] != '6' && str[i] != '7' && str[i] != '8'
&& str[i] != '9' || (str[i] == '-' && i > 0))
return (false);
}
return (true);
}
// Returns 'true' if the string is a REAL NUMBER.
bool isRealNumber(char* str)
{
int i, len = strlen(str);
bool hasDecimal = false;
if (len == 0)
return (false);
for (i = 0; i < len; i++) {
if (str[i] != '0' && str[i] != '1' && str[i] != '2'
&& str[i] != '3' && str[i] != '4' && str[i] != '5'
&& str[i] != '6' && str[i] != '7' && str[i] != '8'
&& str[i] != '9' && str[i] != '.' ||
(str[i] == '-' && i > 0))
return (false);
if (str[i] == '.')
hasDecimal = true;
}
return (hasDecimal);
}
// Extracts the SUBSTRING.
char* subString(char* str, int left, int right)
{
int i;
char* subStr = (char*)malloc(sizeof(char) * (right - left + 2));
for (i = left; i <= right; i++)
subStr[i - left] = str[i];
subStr[right - left + 1] = '[=10=]';
return (subStr);
}
// Parsing the input STRING.
void parse(char* str)
{
int left = 0, right = 0;
int len = strlen(str);
while (right <= len && left <= right)
{
if (isDelimiter(str[right]) == false)
right++;
if (isDelimiter(str[right]) == true && left == right)
{
if (isOperator(str[right]) == true)
printf("'%c' IS A OPERATOR\n", str[right]);
right++;
left = right;
}
if (isDelimiter(str[right]) == true && left == right)
{
if (isDelimiter(str[right]) == true)
printf("'%c' IS A DELIMITER\n", str[right]);
right++;
left = right;
}
if (isSeperator(str[right]) == true && left == right)
{
//needed to recognize seperator to the right
if (isSeperator(str[right]) == true)
printf("'%c' IS A SEPERATOR\n", str[right]);
right++;
left = right;
//needed to recognize seperator to the left
if (isSeperator(str[right]) == true)
printf("'%c' IS A SEPERATOR\n", str[left]);
right++;
left = right;
}
else if (isDelimiter(str[right]) == true && left != right
|| (right == len && left != right)) {
char* subStr = subString(str, left, right - 1);
if (isKeyword(subStr) == true)
printf("'%s' IS A KEYWORD\n", subStr);
else if (isInteger(subStr) == true)
printf("'%s' IS AN INTEGER\n", subStr);
else if (isRealNumber(subStr) == true)
printf("'%s' IS A REAL NUMBER\n", subStr);
else if (validIdentifier(subStr) == true
&& isDelimiter(str[right - 1]) == false
&& isSeperator(str[right - 1]) == false)
printf("'%s' IS A VALID IDENTIFIER\n", subStr);
left = right;
}
}
return;
}
int main(int argc, char *argv)
{
/* declare a file pointer */
FILE *file;
char *buffer;
long numbytes;
/* open an existing file for reading */
file = fopen("Text.txt", "r");
/* quit if the file does not exist */
if (file == NULL)
return 1;
/* Get the number of bytes */
fseek(file, 0L, SEEK_END);
numbytes = ftell(file);
/* reset the file position indicator to
the beginning of the file */
fseek(file, 0L, SEEK_SET);
/* grab sufficient memory for the
buffer to hold the text */
buffer = (char*)calloc(numbytes, sizeof(char));
/* memory error */
if (buffer == NULL)
return 1;
/* copy all the text into the buffer */
fread(buffer, sizeof(char), numbytes, file);
/* confirm we have read the file by
outputing it to the console */
printf(" The file called Text.txt contains this text \n \n %s \n\n", buffer);
parse(buffer); // calling the parse function
fclose(file);
/* free the memory we used for the buffer */
free(buffer);
return 0;
}
看起来问题是您的 isDelimiter
函数没有获取所有可能的值。如果您将其更改为使用 isspace()
,它将匹配所有形式的空格。
bool isDelimiter(char ch)
{
if (isspace(ch))
return (true);
return (false);
}
例如,这是一个非常简单的状态机,可以让您了解我的意思。它可以处于两种状态之一 - INSIDE_IDENTIFIER 或 OUTSIDE_IDENTIFIER - 它会根据正在查看的字符类型在这两种状态之间切换。
#define OUTSIDE_IDENTIFIER (0)
#define INSIDE_IDENTIFIER (1)
void parse(char *str)
{
char *ch;
int state=OUTSIDE_IDENTIFIER;
char buffer[1000];
char *pos=buffer;
for(ch=str;*ch!='[=11=]';ch++)
{
switch(state)
{
case INSIDE_IDENTIFIER:
if(isOperator(*ch))
{
*pos='[=11=]';
printf("Identifier[%s]\n",buffer);
printf("Operator[%c]\n",*ch);
state=OUTSIDE_IDENTIFIER;
}
else if(isDelimiter(*ch))
{
*pos='[=11=]';
printf("Identifier[%s]\n",buffer);
printf("Delimiter[%c]\n",*ch);
state=OUTSIDE_IDENTIFIER;
}
else if(isspace(*ch))
{
*pos='[=11=]';
printf("Identifier[%s]\n",buffer);
printf("Space[%d]\n",*ch);
state=OUTSIDE_IDENTIFIER;
}
else
{
*pos=*ch;
pos++;
}
break;
case OUTSIDE_IDENTIFIER:
default:
if(isOperator(*ch))
{
printf("Operator[%c]\n",*ch);
}
else if(isDelimiter(*ch))
{
printf("Delimiter[%c]\n",*ch);
}
else if(isSeperator(*ch))
{
printf("Seperator[%c]\n",*ch);
}
else
{
state = INSIDE_IDENTIFIER;
pos=buffer;
*pos=*ch;
pos++;
}
break;
}
}
}