如何对 shell 输入进行分词(tokenize)?
How to tokenize shell input?
我自己写了一个 shell,它可以执行带参数的简单命令管道:
$ ls | wc -l
84
但是使用 awk 是行不通的:
$ ls | awk '{print }'
awk: cmd. line:1: '{print
awk: cmd. line:1: ^ invalid char ''' in expression
处理命令行的相关代码是:
/*
 * Entry point of the shell.
 *
 * Installs the SIGINT and SIGCHLD handlers, hands the first PATH entry to
 * findless(), then reads command lines from stdin until "exit" or end of
 * input. "cd" and "checkEnv" are handled in this process; every other line
 * is split on spaces in a forked child and passed to exec_arguments().
 *
 * Returns 0 when the read loop ends; exits with status 1 if the SIGCHLD
 * handler cannot be installed or the PATH copy cannot be allocated.
 */
int main(int argc, char *argv[]) {
    char linecopy[BUFFER_LEN];
    char *params[100];
    char *token;
    char *less_token;
    int i = 0;
    char *tokenstr;
    char *search = " ";
    int isBackground = 0;
    int built_in_command = 0;
    int fd[2];
    int status = 0;
    struct passwd *pw;
    struct timeval time_start;
    struct timeval time_end;
    sigset_t my_sig;
    pid_t pid_temp;
    char *pathValue;
    char *path_strdup;
    struct sigaction sa, osa;
    char *p;
    char *array[40];
    char line[BUFFER_LEN];
    size_t length;
    int ki;
    int ret;
    struct sigaction less_sa;

    (void)argc;
    err_setarg0(argv[0]);
    pid_temp = 0; /* To please the compiler */

    sa.sa_sigaction = sighandler;
    sigemptyset(&sa.sa_mask); /* sa is a local: the mask must be set explicitly */
    sa.sa_flags = SA_SIGINFO;
    sigaction(SIGINT, &sa, &osa);

    less_sa.sa_handler = &handle_sigchld;
    sigemptyset(&less_sa.sa_mask);
    less_sa.sa_flags = SA_RESTART | SA_NOCLDSTOP;
    if (sigaction(SIGCHLD, &less_sa, 0) == -1) {
        perror(0);
        exit(1);
    }

    /* get the PATH environment to find if less is installed */
    pathValue = getenv("PATH");
    if (!pathValue) {
        printf("'%s' is not set.\n", "PATH");
    }
    /* An unset PATH is treated like an empty one; strdup(NULL) would crash. */
    path_strdup = strdup(pathValue != NULL ? pathValue : "");
    if (path_strdup == NULL) {
        perror("strdup");
        exit(1);
    }
    less_token = strtok(path_strdup, ":");
    ret = 1;
    ret = findless(less_token, ret);
    free(path_strdup);

    while (1) {
        i = 0;
        printf("$ ");
        fflush(stdout);
        if (!fgets(line, BUFFER_LEN, stdin)) {
            putchar('\n');
            break;
        }
        if (AllWhiteSpace(line))
            continue;

        /* Keep an untouched copy: strtok() below chops `line` up in place. */
        strncpy(linecopy, line, BUFFER_LEN);

        length = strlen(line);
        if (length > 0 && line[length - 1] == '\n') {
            line[length - 1] = '\0';
        }
        if (strcmp(line, "exit") == 0) {
            break;
        }

        if (StartsWith(line, "cd")) {
            built_in_command = 1;
            tokenstr = NULL;
            if (strstr(line, " ") != NULL) {
                /* Start a fresh scan of this line; the first word is "cd". */
                strtok(line, search);
                tokenstr = strtok(NULL, search);
            }
            if (tokenstr == NULL) {
                /* No directory given: go to the user's home directory. */
                pw = getpwuid(getuid());
                if (pw == NULL) {
                    perror("Failed looking up home directory");
                } else if (chdir(pw->pw_dir) == -1) {
                    perror("Failed changing to homedirectory\n");
                }
            } else if (chdir(tokenstr) == -1) {
                perror("Failed changing directory\n");
            }
        }

        /* Leave room for the terminating NULL stored below. */
        token = strtok(line, " ");
        while (token != NULL && i < (int)(sizeof params / sizeof params[0]) - 1) {
            params[i] = token;
            token = strtok(NULL, " ");
            i++;
        }

        if (StartsWith(line, "checkEnv")) {
            built_in_command = 1;
            checkEnv(ret);
        }

        if (0 == built_in_command) { /*Not a built in command, so let execute it*/
            params[i] = NULL;

            /* `line` has been cut up by strtok(); look for '&' in the copy. */
            isBackground = (strchr(linecopy, '&') != NULL) ? 1 : 0;

            if (isBackground == 1) { /*If backgroundprocess*/
                if (pipe(fd) == -1) { /*(two new file descriptors)*/
                    perror("Failed creating pipe\n");
                }
                pid_temp = fork();
            } else { /*If foreground process*/
                gettimeofday(&time_start, NULL);
                if (1 == isSignal) { /*If using signaldetection*/
                    sigemptyset(&my_sig);        /*empty and initialising a signal set*/
                    sigaddset(&my_sig, SIGCHLD); /*Adds signal to a signal set (my_sig)*/
                    sigprocmask(SIG_BLOCK, &my_sig, NULL);
                }
                pid_temp = fork();
                foreground = pid_temp; /*Set pid for foreground process*/
            }

            if (0 > pid_temp) {
                perror("fork");
            } else if (0 == pid_temp) {
                /*Child process*/
                if (1 == isBackground) { /*Backgroundprocess*/
                    dup2(fd[STDIN_FILENO], STDIN_FILENO);
                    close(fd[0]);
                    close(fd[1]);
                }
                length = strlen(linecopy);
                if (length > 0 && linecopy[length - 1] == '\n')
                    linecopy[length - 1] = '\0';

                /* array[0] is left NULL; the words start at index 1. */
                ki = 1;
                array[0] = NULL;
                p = strtok(linecopy, " ");
                while (p != NULL && ki < (int)(sizeof array / sizeof array[0]) - 1) {
                    array[ki++] = p;
                    p = strtok(NULL, " ");
                }
                array[ki] = NULL;
                exec_arguments(ki, array);
                corpse_collector();
                /* NOTE(review): if corpse_collector() returns, the child carries
                 * on into the code below and the read loop - confirm it exits. */
            }

            if (0 == isBackground) { /*Foregroundprocess*/
                if (pid_temp >= 0) { /* nothing to wait for when fork() failed */
                    waitpid(foreground, &status, 0); /*Waiting*/
                }
                /*Foregroundprocess terminated*/
                gettimeofday(&time_end, NULL);
                if (1 == isSignal) { /*If using signaldetection*/
                    if (sigprocmask(SIG_UNBLOCK, &my_sig, NULL) == -1) {
                        perror("sigprocmask");
                    }
                    Janitor(SIGCHLD);
                }
            } else {
                close(fd[0]);
                close(fd[1]);
            }
        }
        built_in_command = 0;         /*Reset*/
        memset(line, 0, sizeof line); /*Reset*/
    }
    return (0);
}
完整程序见原文中的链接(here)。
如何让我的管道工作?当我调试它时,它看起来像这样:
$ ./a.out
$ ls | wc -l
Before exec_arguments: (5) {(null)} {ls} {|} {wc} {-l}
84
27363: child 27364 status 0x0000
$ ls | awk '{print }'
Before exec_arguments: (6) {(null)} {ls} {|} {awk} {'{print} {}'}
awk: cmd. line:1: '{print
awk: cmd. line:1: ^ invalid char ''' in expression
27374: child 27375 status 0x0100
$
"Solution"
我的“解决方案”是在遇到 awk 时做特殊处理:把 awk 后面的内容改用单引号作为分隔符来切分。这也许并不理想,但它能让 awk 在管道中工作:
$ ls | awk '{print }'
alias.h
a.out
Boot1.asm
Boot1.bin
boot.asm
boot.bin
bootl.asm
bootload.asm
bootload.bin
bootloader
bootloader.asm
我执行标记化的新代码是
/*
 * Split `input` into params[1..]; params[0] stays NULL.
 * Words are normally separated by spaces, but the word that follows "awk"
 * is cut at the next single quote instead, so a quoted awk script stays in
 * one piece.
 */
params[0] = NULL;
i = 1;
j = 1;
token = strtok(input, " ");
while (token != NULL) {
    /* Delimiter used to cut the token that comes after this one. */
    const char *next_delim = " ";

    if (awk == 1) {
        /* This token is the awk script itself. */
        s = concat("awk ", token);
        printf("s is %s", s);
        awk = 0;
    } else if (strcmp(token, "awk") == 0) {
        awk = 1;
        next_delim = "\'";
    }
    params[i++] = token;
    token = strtok(NULL, next_delim);
}
params[i] = NULL;
printf("ki %d", i);
/*dump_argv("Before exec_arguments", i, params);*/
exec_arguments(i, params);
corpse_collector();
free(input);
更新
根据 tripleee 的回答,我尝试按照那段伪代码来实现引号的解析与去除。下面是我目前的实现,它可以编译,并且对某些输入有效。我把伪代码中的 push
理解为栈操作(希望没有理解错),所以在项目中添加了一个存放 char *
的栈,看起来可以工作。
/*
 * Split `input` into shell-style words, honouring quotes, and store them in
 * `params` starting at index `i`.
 *
 * Words are separated by unquoted blanks. Text inside '...' is taken
 * literally; text inside "..." is literal except that \" and \\ yield " and \.
 * The quote characters themselves are dropped, so  awk '{print}'  yields the
 * two words  awk  and  {print}.  A quote that is never closed runs to the end
 * of the line. Each quoted section ends at its own closing quote, so several
 * quoted words on one line are kept apart.
 *
 * input   the whole command line; it is rewritten in place and the stored
 *         words point into it, so it must stay alive as long as `params` is
 *         used. NOTE(review): assumes the caller passes the untokenized
 *         line (not a buffer already cut up by strtok) - confirm at call site.
 * token   only compared against NULL: NULL means there is nothing to parse.
 *         The scan itself works on `input`, because strtok() cannot tell a
 *         quoted blank from a separator.
 * params  receives the words; always terminated with a NULL entry.
 * i       index of the first free slot in `params`.
 *
 * Returns the index of the terminating NULL entry.
 *
 * Limitation: after quote removal a quoted "|" looks the same as an unquoted
 * one to whoever consumes `params`.
 */
int handleToken(char input[BUFFER_LEN], char *token, char *params[100], int i) {
    enum { MAX_PARAMS = 100 };
    char *rd = input; /* next character to read */
    char *wr = input; /* next position to write; never ahead of rd */
    char quote;       /* active quote character, or '\0' outside quotes */
    char c;

    if (token != NULL && input != NULL) {
        /* Stop one short of the array size to keep room for the NULL. */
        while (i < MAX_PARAMS - 1) {
            while (*rd == ' ' || *rd == '\t' || *rd == '\n') {
                rd++;
            }
            if (*rd == '\0') {
                break;
            }
            params[i++] = wr;
            quote = '\0';
            while (*rd != '\0') {
                c = *rd++;
                if (quote != '\0') {
                    if (c == quote) {
                        quote = '\0';
                    } else if (c == '\\' && quote == '"' &&
                               (*rd == '"' || *rd == '\\')) {
                        *wr++ = *rd++;
                    } else {
                        *wr++ = c;
                    }
                } else if (c == '\'' || c == '"') {
                    quote = c;
                } else if (c == ' ' || c == '\t' || c == '\n') {
                    break;
                } else {
                    *wr++ = c;
                }
            }
            /* Safe: wr is at or before the terminator / consumed blank. */
            *wr++ = '\0';
        }
    }
    params[i] = NULL;
    return i;
}
/* double-quoted is similar but more complex */
测试
$ echo 'foo bar'
Before exec_arguments: (3) {(null)} {echo} {foo bar}
foo bar
2901: child 2922 status 0x0000
Execution time 1.872 ms
但这还行不通:
$ echo 'a b' | awk '{print }'
Before exec_arguments: (3) {(null)} {echo} {a b' | awk '{print }}
a b' | awk '{print }
2901: child 2993 status 0x0000
Execution time 0.734 ms
我认为当你将它作为参数传递给 execvp 时,它与转义引号有关。
我写了一个小程序来检查把带引号的参数传给 execvp 时的行为,例如代码的结构如下:
/* The single quotes here are part of the argument string handed to awk. */
char *cmd2[] = { "/usr/bin/awk", " \' { print } \'", 0 };
execvp(cmd2[0], cmd2);
抛出的错误与您获得的错误相似
awk: ^ invalid char ''' in expression
转义 '(如 \') 或不转义对输出的结果没有任何影响
但是当上面的改为(单引号改为转义双引号)
/* Same call, but the argument string contains double quotes instead. */
char *cmd2[] = { "/usr/bin/awk", " \" { print } \"", 0 };
execvp(cmd2[0], cmd2);
一切顺利。 (以上命令将回显您在 shell 中键入的任何内容)
所以我猜你需要解析 awk 的参数并查找单引号,然后构造一个新的命令字符串,把这些引号替换为 \"。我猜你的错误是因为把用户输入的参数原样("as is")传了过去,而没有做上述格式处理。
希望对您有所帮助
您的 shell 应该在解析后去除引号。脚本周围的引号不是 Awk 语言的一部分;它们的目的是保护 Awk 脚本不被 shell 以任何方式解析。正确的最终结果是
/* Argument vector for awk once the quotes around the script are removed. */
char *cmd[] = {
    "/usr/bin/awk",
    "{ print }",
    NULL
};
shell 的完整解析器需要处理递归结构,但带引号的字符串只需要对您的代码进行少量修改。基本上,在伪代码中
while token:
    if state == regular:
        if token.startswith("'"):
            state := single_quoted_string
            redo
        elsif token.startswith("\""):
            state := double_quoted_string
            redo
        # else
        push parsed, token
        token := next_token
    elsif state == single_quoted_string:
        end_quote := indexof("'")
        push parsed, substr(token+1, end_quote-1)  # omit quotes
        token := end_quote + 1
        state := regular
    else:
        # double-quoted is similar but more complex
我自己写了一个 shell,它可以执行带参数的简单命令管道:
$ ls | wc -l
84
但是使用 awk 是行不通的:
$ ls | awk '{print }'
awk: cmd. line:1: '{print
awk: cmd. line:1: ^ invalid char ''' in expression
处理命令行的相关代码是:
/*
 * Entry point of the shell.
 *
 * Installs the SIGINT and SIGCHLD handlers, hands the first PATH entry to
 * findless(), then reads command lines from stdin until "exit" or end of
 * input. "cd" and "checkEnv" are handled in this process; every other line
 * is split on spaces in a forked child and passed to exec_arguments().
 *
 * Returns 0 when the read loop ends; exits with status 1 if the SIGCHLD
 * handler cannot be installed or the PATH copy cannot be allocated.
 */
int main(int argc, char *argv[]) {
    char linecopy[BUFFER_LEN];
    char *params[100];
    char *token;
    char *less_token;
    int i = 0;
    char *tokenstr;
    char *search = " ";
    int isBackground = 0;
    int built_in_command = 0;
    int fd[2];
    int status = 0;
    struct passwd *pw;
    struct timeval time_start;
    struct timeval time_end;
    sigset_t my_sig;
    pid_t pid_temp;
    char *pathValue;
    char *path_strdup;
    struct sigaction sa, osa;
    char *p;
    char *array[40];
    char line[BUFFER_LEN];
    size_t length;
    int ki;
    int ret;
    struct sigaction less_sa;

    (void)argc;
    err_setarg0(argv[0]);
    pid_temp = 0; /* To please the compiler */

    sa.sa_sigaction = sighandler;
    sigemptyset(&sa.sa_mask); /* sa is a local: the mask must be set explicitly */
    sa.sa_flags = SA_SIGINFO;
    sigaction(SIGINT, &sa, &osa);

    less_sa.sa_handler = &handle_sigchld;
    sigemptyset(&less_sa.sa_mask);
    less_sa.sa_flags = SA_RESTART | SA_NOCLDSTOP;
    if (sigaction(SIGCHLD, &less_sa, 0) == -1) {
        perror(0);
        exit(1);
    }

    /* get the PATH environment to find if less is installed */
    pathValue = getenv("PATH");
    if (!pathValue) {
        printf("'%s' is not set.\n", "PATH");
    }
    /* An unset PATH is treated like an empty one; strdup(NULL) would crash. */
    path_strdup = strdup(pathValue != NULL ? pathValue : "");
    if (path_strdup == NULL) {
        perror("strdup");
        exit(1);
    }
    less_token = strtok(path_strdup, ":");
    ret = 1;
    ret = findless(less_token, ret);
    free(path_strdup);

    while (1) {
        i = 0;
        printf("$ ");
        fflush(stdout);
        if (!fgets(line, BUFFER_LEN, stdin)) {
            putchar('\n');
            break;
        }
        if (AllWhiteSpace(line))
            continue;

        /* Keep an untouched copy: strtok() below chops `line` up in place. */
        strncpy(linecopy, line, BUFFER_LEN);

        length = strlen(line);
        if (length > 0 && line[length - 1] == '\n') {
            line[length - 1] = '\0';
        }
        if (strcmp(line, "exit") == 0) {
            break;
        }

        if (StartsWith(line, "cd")) {
            built_in_command = 1;
            tokenstr = NULL;
            if (strstr(line, " ") != NULL) {
                /* Start a fresh scan of this line; the first word is "cd". */
                strtok(line, search);
                tokenstr = strtok(NULL, search);
            }
            if (tokenstr == NULL) {
                /* No directory given: go to the user's home directory. */
                pw = getpwuid(getuid());
                if (pw == NULL) {
                    perror("Failed looking up home directory");
                } else if (chdir(pw->pw_dir) == -1) {
                    perror("Failed changing to homedirectory\n");
                }
            } else if (chdir(tokenstr) == -1) {
                perror("Failed changing directory\n");
            }
        }

        /* Leave room for the terminating NULL stored below. */
        token = strtok(line, " ");
        while (token != NULL && i < (int)(sizeof params / sizeof params[0]) - 1) {
            params[i] = token;
            token = strtok(NULL, " ");
            i++;
        }

        if (StartsWith(line, "checkEnv")) {
            built_in_command = 1;
            checkEnv(ret);
        }

        if (0 == built_in_command) { /*Not a built in command, so let execute it*/
            params[i] = NULL;

            /* `line` has been cut up by strtok(); look for '&' in the copy. */
            isBackground = (strchr(linecopy, '&') != NULL) ? 1 : 0;

            if (isBackground == 1) { /*If backgroundprocess*/
                if (pipe(fd) == -1) { /*(two new file descriptors)*/
                    perror("Failed creating pipe\n");
                }
                pid_temp = fork();
            } else { /*If foreground process*/
                gettimeofday(&time_start, NULL);
                if (1 == isSignal) { /*If using signaldetection*/
                    sigemptyset(&my_sig);        /*empty and initialising a signal set*/
                    sigaddset(&my_sig, SIGCHLD); /*Adds signal to a signal set (my_sig)*/
                    sigprocmask(SIG_BLOCK, &my_sig, NULL);
                }
                pid_temp = fork();
                foreground = pid_temp; /*Set pid for foreground process*/
            }

            if (0 > pid_temp) {
                perror("fork");
            } else if (0 == pid_temp) {
                /*Child process*/
                if (1 == isBackground) { /*Backgroundprocess*/
                    dup2(fd[STDIN_FILENO], STDIN_FILENO);
                    close(fd[0]);
                    close(fd[1]);
                }
                length = strlen(linecopy);
                if (length > 0 && linecopy[length - 1] == '\n')
                    linecopy[length - 1] = '\0';

                /* array[0] is left NULL; the words start at index 1. */
                ki = 1;
                array[0] = NULL;
                p = strtok(linecopy, " ");
                while (p != NULL && ki < (int)(sizeof array / sizeof array[0]) - 1) {
                    array[ki++] = p;
                    p = strtok(NULL, " ");
                }
                array[ki] = NULL;
                exec_arguments(ki, array);
                corpse_collector();
                /* NOTE(review): if corpse_collector() returns, the child carries
                 * on into the code below and the read loop - confirm it exits. */
            }

            if (0 == isBackground) { /*Foregroundprocess*/
                if (pid_temp >= 0) { /* nothing to wait for when fork() failed */
                    waitpid(foreground, &status, 0); /*Waiting*/
                }
                /*Foregroundprocess terminated*/
                gettimeofday(&time_end, NULL);
                if (1 == isSignal) { /*If using signaldetection*/
                    if (sigprocmask(SIG_UNBLOCK, &my_sig, NULL) == -1) {
                        perror("sigprocmask");
                    }
                    Janitor(SIGCHLD);
                }
            } else {
                close(fd[0]);
                close(fd[1]);
            }
        }
        built_in_command = 0;         /*Reset*/
        memset(line, 0, sizeof line); /*Reset*/
    }
    return (0);
}
完整程序见原文中的链接(here)。
如何让我的管道工作?当我调试它时,它看起来像这样:
$ ./a.out
$ ls | wc -l
Before exec_arguments: (5) {(null)} {ls} {|} {wc} {-l}
84
27363: child 27364 status 0x0000
$ ls | awk '{print }'
Before exec_arguments: (6) {(null)} {ls} {|} {awk} {'{print} {}'}
awk: cmd. line:1: '{print
awk: cmd. line:1: ^ invalid char ''' in expression
27374: child 27375 status 0x0100
$
"Solution"
我的“解决方案”是在遇到 awk 时做特殊处理:把 awk 后面的内容改用单引号作为分隔符来切分。这也许并不理想,但它能让 awk 在管道中工作:
$ ls | awk '{print }'
alias.h
a.out
Boot1.asm
Boot1.bin
boot.asm
boot.bin
bootl.asm
bootload.asm
bootload.bin
bootloader
bootloader.asm
我执行标记化的新代码是
/*
 * Split `input` into params[1..]; params[0] stays NULL.
 * Words are normally separated by spaces, but the word that follows "awk"
 * is cut at the next single quote instead, so a quoted awk script stays in
 * one piece.
 */
params[0] = NULL;
i = 1;
j = 1;
token = strtok(input, " ");
while (token != NULL) {
    /* Delimiter used to cut the token that comes after this one. */
    const char *next_delim = " ";

    if (awk == 1) {
        /* This token is the awk script itself. */
        s = concat("awk ", token);
        printf("s is %s", s);
        awk = 0;
    } else if (strcmp(token, "awk") == 0) {
        awk = 1;
        next_delim = "\'";
    }
    params[i++] = token;
    token = strtok(NULL, next_delim);
}
params[i] = NULL;
printf("ki %d", i);
/*dump_argv("Before exec_arguments", i, params);*/
exec_arguments(i, params);
corpse_collector();
free(input);
更新
根据 tripleee 的回答,我尝试按照那段伪代码来实现引号的解析与去除。下面是我目前的实现,它可以编译,并且对某些输入有效。我把伪代码中的 push
理解为栈操作(希望没有理解错),所以在项目中添加了一个存放 char *
的栈,看起来可以工作。
/*
 * Split `input` into shell-style words, honouring quotes, and store them in
 * `params` starting at index `i`.
 *
 * Words are separated by unquoted blanks. Text inside '...' is taken
 * literally; text inside "..." is literal except that \" and \\ yield " and \.
 * The quote characters themselves are dropped, so  awk '{print}'  yields the
 * two words  awk  and  {print}.  A quote that is never closed runs to the end
 * of the line. Each quoted section ends at its own closing quote, so several
 * quoted words on one line are kept apart.
 *
 * input   the whole command line; it is rewritten in place and the stored
 *         words point into it, so it must stay alive as long as `params` is
 *         used. NOTE(review): assumes the caller passes the untokenized
 *         line (not a buffer already cut up by strtok) - confirm at call site.
 * token   only compared against NULL: NULL means there is nothing to parse.
 *         The scan itself works on `input`, because strtok() cannot tell a
 *         quoted blank from a separator.
 * params  receives the words; always terminated with a NULL entry.
 * i       index of the first free slot in `params`.
 *
 * Returns the index of the terminating NULL entry.
 *
 * Limitation: after quote removal a quoted "|" looks the same as an unquoted
 * one to whoever consumes `params`.
 */
int handleToken(char input[BUFFER_LEN], char *token, char *params[100], int i) {
    enum { MAX_PARAMS = 100 };
    char *rd = input; /* next character to read */
    char *wr = input; /* next position to write; never ahead of rd */
    char quote;       /* active quote character, or '\0' outside quotes */
    char c;

    if (token != NULL && input != NULL) {
        /* Stop one short of the array size to keep room for the NULL. */
        while (i < MAX_PARAMS - 1) {
            while (*rd == ' ' || *rd == '\t' || *rd == '\n') {
                rd++;
            }
            if (*rd == '\0') {
                break;
            }
            params[i++] = wr;
            quote = '\0';
            while (*rd != '\0') {
                c = *rd++;
                if (quote != '\0') {
                    if (c == quote) {
                        quote = '\0';
                    } else if (c == '\\' && quote == '"' &&
                               (*rd == '"' || *rd == '\\')) {
                        *wr++ = *rd++;
                    } else {
                        *wr++ = c;
                    }
                } else if (c == '\'' || c == '"') {
                    quote = c;
                } else if (c == ' ' || c == '\t' || c == '\n') {
                    break;
                } else {
                    *wr++ = c;
                }
            }
            /* Safe: wr is at or before the terminator / consumed blank. */
            *wr++ = '\0';
        }
    }
    params[i] = NULL;
    return i;
}
/* double-quoted is similar but more complex */
测试
$ echo 'foo bar'
Before exec_arguments: (3) {(null)} {echo} {foo bar}
foo bar
2901: child 2922 status 0x0000
Execution time 1.872 ms
但这还行不通:
$ echo 'a b' | awk '{print }'
Before exec_arguments: (3) {(null)} {echo} {a b' | awk '{print }}
a b' | awk '{print }
2901: child 2993 status 0x0000
Execution time 0.734 ms
我认为当你将它作为参数传递给 execvp 时,它与转义引号有关。
我写了一个小程序来检查把带引号的参数传给 execvp 时的行为,例如代码的结构如下:
/* The single quotes here are part of the argument string handed to awk. */
char *cmd2[] = { "/usr/bin/awk", " \' { print } \'", 0 };
execvp(cmd2[0], cmd2);
抛出的错误与您获得的错误相似
awk: ^ invalid char ''' in expression
转义 '(如 \') 或不转义对输出的结果没有任何影响
但是当上面的改为(单引号改为转义双引号)
/* Same call, but the argument string contains double quotes instead. */
char *cmd2[] = { "/usr/bin/awk", " \" { print } \"", 0 };
execvp(cmd2[0], cmd2);
一切顺利。 (以上命令将回显您在 shell 中键入的任何内容)
所以我猜你需要解析 awk 的参数并查找单引号,然后构造一个新的命令字符串,把这些引号替换为 \"。我猜你的错误是因为把用户输入的参数原样("as is")传了过去,而没有做上述格式处理。
希望对您有所帮助
您的 shell 应该在解析后去除引号。脚本周围的引号不是 Awk 语言的一部分;它们的目的是保护 Awk 脚本不被 shell 以任何方式解析。正确的最终结果是
/* Argument vector for awk once the quotes around the script are removed. */
char *cmd[] = {
    "/usr/bin/awk",
    "{ print }",
    NULL
};
shell 的完整解析器需要处理递归结构,但带引号的字符串只需要对您的代码进行少量修改。基本上,在伪代码中
while token:
    if state == regular:
        if token.startswith("'"):
            state := single_quoted_string
            redo
        elsif token.startswith("\""):
            state := double_quoted_string
            redo
        # else
        push parsed, token
        token := next_token
    elsif state == single_quoted_string:
        end_quote := indexof("'")
        push parsed, substr(token+1, end_quote-1)  # omit quotes
        token := end_quote + 1
        state := regular
    else:
        # double-quoted is similar but more complex