如何对 shell 输入进行分词(tokenize)?

How to tokenize shell input?

我自己写了一个 shell,它可以执行带参数的简单命令管道:

$ ls | wc -l
84

但是使用 awk 是行不通的:

$ ls | awk '{print $1}'
awk: cmd. line:1: '{print
awk: cmd. line:1: ^ invalid char ''' in expression

处理命令行的相关代码是:

/*
 * Entry point of the shell.
 *
 * Installs the SIGINT and SIGCHLD handlers, hands the first PATH entry to
 * findless(), then reads command lines from stdin until EOF or "exit".
 * "cd" and "checkEnv" are handled in-process; every other line is split on
 * single spaces and passed to exec_arguments() in a forked child.  A line
 * containing '&' is started in the background (the parent does not wait).
 *
 * Tokenisation is still purely space based: quoted arguments (for example
 * an awk script in single quotes) are NOT kept together by this function.
 *
 * Returns 0 when the read loop ends; exits with status 1 if the SIGCHLD
 * handler cannot be installed.
 */
int main(int argc, char *argv[]) {

    char linecopy[BUFFER_LEN];
    char* params[100];
    char *token;
    char *less_token;
    int i=0;
    char *tokenstr;
    char *search = " ";
    int isBackground = 0;
    int built_in_command = 0;
    int fd[2];
/*    long time;*/
    int status = 0;
    struct passwd *pw;
    const char *homedir;
    struct timeval time_start;
    struct timeval time_end;
    sigset_t my_sig;
    pid_t pid_temp;
    char * pathValue;
    char * path_strdup;
    struct sigaction sa, osa;
    char *p;
    char *array[40];
    char line[BUFFER_LEN];
    size_t length;
    int ki;
    int ret;
    struct sigaction less_sa;
    /* argc-argc is always 0; presumably written this way so that argc is
       referenced and the compiler does not warn about an unused parameter. */
    err_setarg0(argv[argc-argc]);
    pid_temp = 0; /* To please the compiler */

    sa.sa_sigaction = sighandler;
    sigemptyset(&sa.sa_mask);   /* sa_mask must not be left uninitialised */
    sa.sa_flags = SA_SIGINFO;
    sigaction(SIGINT, &sa, &osa);

    less_sa.sa_handler = &handle_sigchld;
    sigemptyset(&less_sa.sa_mask);
    less_sa.sa_flags = SA_RESTART | SA_NOCLDSTOP;
    if (sigaction(SIGCHLD, &less_sa, 0) == -1) {
        perror(0);
        exit(1);
    }

    /* get the PATH environment to find if less is installed */
    ret = 1;
    pathValue = getenv("PATH");
    if (! pathValue) {
        /* Without PATH there is nothing to search; ret keeps its initial
           value of 1 and strdup(NULL) is never attempted. */
        printf ("'%s' is not set.\n", "PATH");
    }
    else {
        path_strdup = strdup(pathValue);
        if (path_strdup == NULL) {
            perror("strdup");
        }
        else {
            /* strtok modifies its argument, hence the private copy of PATH */
            less_token = strtok(path_strdup, ":");
            ret = findless(less_token, ret);
            free(path_strdup);
        }
    }
    while(1) {
        i = 0;
        printf("$ ");
        fflush(stdout);
        if (!fgets(line, BUFFER_LEN, stdin))
        {
            putchar('\n');
            break;
        }

        if (AllWhiteSpace(line))
            continue;

        /* Untouched copy of the line: `line` itself is chopped up by strtok
           below, the child tokenises this copy instead. */
        strncpy(linecopy, line, BUFFER_LEN);
        linecopy[BUFFER_LEN - 1] = '\0';

        length = strlen(line);
        if (length > 0 && line[length - 1] == '\n') {
            line[length - 1] = '\0';
        }
        if(strcmp(line, "exit")==0) {
            break;
        }
        if(StartsWith(line, "cd")) {
            built_in_command=1;
            tokenstr = NULL;
            if(strstr(line, " ") != NULL) {
                /* Start a fresh tokenisation of this line: the first token
                   is the "cd" word itself, the second one the directory.
                   Calling strtok(NULL, ...) without this first call would
                   continue scanning whatever string was tokenised last. */
                (void) strtok(line, search);
                tokenstr = strtok(NULL, search);
            }
            if(tokenstr == NULL) {
                /* No directory given: change to the home directory */
                pw = getpwuid(getuid());
                homedir = (pw != NULL) ? pw->pw_dir : NULL;

                if (homedir == NULL || chdir(homedir)==-1) {
                    perror("Failed changing to homedirectory\n");
                }
            } else {
                if (chdir(tokenstr)==-1)    {
                    perror("Failed changing directory\n");
                }

            }
        }
        /* NOTE(review): params is filled and NULL-terminated here but is not
           consumed anywhere else in this function. */
        token = strtok(line," ");
        while(token!=NULL && i < (int)(sizeof params / sizeof params[0]) - 1) {
            params[i]=token;
            token = strtok(NULL," ");
            i++;
        }
        if(StartsWith(line, "checkEnv")) {
            built_in_command=1;
            checkEnv(ret);
        }
        if(0==built_in_command) {   /*Not a built in command, so let execute it*/

            params[i]=NULL;

            /* Look for '&' in the intact copy: `line` now contains embedded
               terminators and, beyond the text just read, bytes that were
               never written in this iteration. */
            isBackground = (strchr(linecopy, '&') != NULL) ? 1 : 0;

            if (isBackground == 1)  {   /*If backgroundprocess*/

                if (pipe(fd)==-1)   {  /*(two new file descriptors)*/
                    perror("Failed creating pipe\n");
                }

                pid_temp = fork();
            }
            else if (isBackground == 0) {   /*If foreground process*/
                gettimeofday(&time_start, NULL);

                if (1 == isSignal)  {   /*If using signaldetection*/
                    sigemptyset(&my_sig); /*empty and initialising a signal set*/
                    sigaddset(&my_sig, SIGCHLD);    /*Adds signal to a signal set (my_sig)*/
                    /*http://pubs.opengroup.org/onlinepubs/7908799/xsh/sigprocmask.html*/
                    sigprocmask(SIG_BLOCK, &my_sig, NULL);
                }

                pid_temp = fork();
                foreground = pid_temp;  /*Set pid for foreground process*/
            }
            if (0<pid_temp) {
                /*Parent process*/
            }
            else if (0>pid_temp)    {
                /*Error*/
                perror("fork");
            }
            else    {
                /*Child process*/
                if (1 == isBackground)  {   /*Backgroundprocess*/
                    /* stdin of a background job is the read end of the pipe */
                    dup2(fd[STDIN_FILENO], STDIN_FILENO);
                    close(fd[0]);
                    close(fd[1]);
                }

                length = strlen(linecopy);
                if (length > 0 && linecopy[length - 1] == '\n')
                    linecopy[length - 1] = '\0';

                /*printf("Command line: %s\n", linecopy);*/
                /* Slot 0 stays NULL and the words start at index 1; this is
                   the layout handed to exec_arguments(). */
                ki = 1;
                p = strtok(linecopy, " ");

                array[0] = NULL;
                while (p != NULL && ki < (int)(sizeof array / sizeof array[0]) - 1)
                {
                    array[ki++] = p;
                    p = strtok(NULL, " ");
                }
                array[ki] = NULL;
                /*dump_argv("Before exec_arguments", ki, array);*/
                exec_arguments(ki, array);
                corpse_collector();
                /* NOTE(review): nothing here terminates the child; if
                   corpse_collector() returns, the child continues with the
                   code below and the read loop - confirm this is intended. */


            }
            if (0 == isBackground) {    /*Foregroundprocess*/
                waitpid(foreground, &status, 0);    /*Waiting*/
                /*Foregroundprocess terminated*/

                gettimeofday(&time_end, NULL);
/*                time = (time_end.tv_sec  - time_start.tv_sec) * 1000000 +
                       time_end.tv_usec - time_start.tv_usec;*/
                /*printf("Execution time %ld.%03ld ms\n", time / 1000, time % 1000);*/

                if (1 == isSignal)  {   /*If using signaldetection*/
                    int a = sigprocmask(SIG_UNBLOCK, &my_sig, NULL);
                    /*http://man7.org/linux/man-pages/man2/sigprocmask.2.html*/
                    if (0 == a) {
                        /*Sigprocmask was successfull*/
                    }
                    else    {
                        /*Sigprocmask was not successfull, return=-1*/
                    }
                    Janitor(SIGCHLD);
                }
            }
            else if (1==isBackground)   {
                close(fd[0]);
                close(fd[1]);
            }
        }
        built_in_command = 0;   /*Reset*/
        memset(line, 0, sizeof line); /*Reset*/
    }
    return (0);
}

完整的程序可以从这里(here)获取。

如何让我的管道工作?当我调试它时,它看起来像这样:

$ ./a.out 
$ ls | wc -l
Before exec_arguments: (5) {(null)} {ls} {|} {wc} {-l}
84
27363: child 27364 status 0x0000
$ ls | awk '{print $1}'
Before exec_arguments: (6) {(null)} {ls} {|} {awk} {'{print} {$1}'}
awk: cmd. line:1: '{print
awk: cmd. line:1: ^ invalid char ''' in expression
27374: child 27375 status 0x0100
$ 

"Solution"

我的"solution"是在遇到 awk 时强制让循环走一条特殊的分支。这也许并不理想,但它能让 awk 在管道中正常工作:

$ ls | awk '{print $1}'
alias.h
a.out
Boot1.asm
Boot1.bin
boot.asm
boot.bin
bootl.asm
bootload.asm
bootload.bin
bootloader
bootloader.asm

我执行标记化的新代码是

        /* Split `input` in place on spaces and collect the words in params,
           starting at index 1 (params[0] is left NULL).  The word following
           "awk" is special-cased so that a single-quoted script survives as
           one argument. */
        token = strtok(input, " ");
        i = 1;
        j=1;
        params[0] = NULL;
        while (token != NULL)
        {
            if(awk == 1) {
                /* This token is the awk script picked up by the "'" split
                   below; it is stored without its surrounding quotes. */
                /* NOTE(review): s is only printed here; if concat()
                   allocates its result, it is never released in this
                   fragment - confirm. */
                s = concat("awk ", token);
                printf("s is %s", s);
                params[i++] = token;
                /* back to ordinary space-separated words */
                token = strtok(NULL, " ");
                awk = 0;
                continue;

            }
            if (strcmp(token, "awk") == 0) {
                params[i++] = token;
                awk = 1;
                /* Use the single quote as delimiter for the next token:
                   strtok skips the opening quote and stops at the closing
                   one, so the script's inner spaces are preserved.  This
                   only works when the quoted script directly follows
                   "awk". */
                token = strtok(NULL, "\'");
                continue;
            }

            params[i++] = token;
            token = strtok(NULL, " ");
        }
        /* NULL-terminate the argument list before handing it over */
        params[i] = NULL;
        printf("ki %d", i);
        /*dump_argv("Before exec_arguments", i, params);*/
        exec_arguments(i, params);
        corpse_collector();
        /* every params entry pointed into input, so it is released last */
        free(input);

更新

根据 tripleee 的回答,我可以用那段伪代码来实现解析并去除引号。下面是我目前写出的代码,它可以编译,并且对某些输入有效。我把伪代码里的 push 理解为栈操作(希望我没有误解),所以我在项目中添加了一个存放 char * 的栈,看起来是可行的。

#ifndef BUFFER_LEN
/* Fallback only: the real value is defined elsewhere in the program.  The
   size in the parameter declaration below does not affect the generated
   code, because an array parameter is adjusted to a pointer. */
#define BUFFER_LEN 1024
#endif

/* Returns non-zero when c separates two words on the command line. */
static int isBlankChar(char c) {
    return c == ' ' || c == '\t' || c == '\n';
}

/*
 * Split a command line into arguments, honouring shell-style quotes.
 *
 * input  - the complete, unmodified command line.  It is rewritten in place:
 *          quote characters are removed and each argument is terminated
 *          with '\0'.  The entries stored in params point into this buffer,
 *          so it must outlive them.
 * token  - the first word that still has to be processed (typically the
 *          first word of the line).  It is only used to find the position
 *          in input where scanning starts; when it is NULL nothing is
 *          parsed.  If its text does not occur in input, scanning starts at
 *          the beginning of input.
 * params - receives the arguments from index i onwards and is always
 *          NULL-terminated (at most 100 entries including the terminator).
 * i      - index of the first free slot in params.
 *
 * Quoting rules:
 *   'text'  everything up to the next single quote is taken literally;
 *   "text"  like single quotes, except that \" and \\ stand for a literal
 *           double quote and backslash;
 *   quoted and unquoted pieces that touch each other form one argument.
 * An unterminated quote extends to the end of the line.  Operators such as
 * '|' are only recognised as separate arguments when surrounded by blanks,
 * and a quoted "|" cannot be told apart from an unquoted one afterwards.
 *
 * Unlike the previous version this function neither calls strtok() nor
 * pushes tokens onto the auxiliary stack, and it no longer stores the
 * address of a local buffer in params.
 *
 * Returns the index of the terminating NULL entry in params.
 */
int handleToken(char input[BUFFER_LEN], char *token, char *params[100], int i) {

    enum { MAX_PARAMS = 100 };
    char *rd;   /* next character to read from input */
    char *wr;   /* next position to write the unquoted argument to */

    if (params == NULL || i < 0 || i >= MAX_PARAMS) {
        return i;   /* no room even for the terminating NULL */
    }

    if (input != NULL && token != NULL) {
        rd = strstr(input, token);
        if (rd == NULL) {
            rd = input;
        }

        while (i < MAX_PARAMS - 1) {
            while (isBlankChar(*rd)) {
                rd++;
            }
            if (*rd == '\0') {
                break;
            }

            /* The argument is compacted in place: wr never overtakes rd
               because removing quotes can only shorten the text. */
            wr = rd;
            params[i++] = wr;

            while (*rd != '\0' && !isBlankChar(*rd)) {
                if (*rd == '\'') {
                    rd++;                               /* opening quote */
                    while (*rd != '\0' && *rd != '\'') {
                        *wr++ = *rd++;
                    }
                    if (*rd == '\'') {
                        rd++;                           /* closing quote */
                    }
                }
                else if (*rd == '"') {
                    rd++;                               /* opening quote */
                    while (*rd != '\0' && *rd != '"') {
                        if (*rd == '\\' && (rd[1] == '"' || rd[1] == '\\')) {
                            rd++;                       /* drop the escape */
                        }
                        *wr++ = *rd++;
                    }
                    if (*rd == '"') {
                        rd++;                           /* closing quote */
                    }
                }
                else {
                    *wr++ = *rd++;
                }
            }

            /* Step over the separator first: terminating the argument may
               overwrite exactly that character. */
            if (*rd != '\0') {
                rd++;
            }
            *wr = '\0';
        }
    }

    params[i] = NULL;
    return i;
}
/* double-quoted is similar but more complex */

测试

$ echo 'foo bar'
Before exec_arguments: (3) {(null)} {echo} {foo bar}
foo bar
2901: child 2922 status 0x0000
Execution time 1.872 ms

但这还行不通:

$ echo 'a b' | awk '{print $1}'
Before exec_arguments: (3) {(null)} {echo} {a b' | awk '{print $1}}
a b' | awk '{print $1}
2901: child 2993 status 0x0000
Execution time 0.734 ms

我认为当你将它作为参数传递给 execvp 时,它与转义引号有关。

我写了一个小程序来检查把带引号的参数传递给 execvp 的机制,例如代码的写法如下:

  /* The single quotes are part of the argument string itself, so awk
     receives them as script text (there is no shell here to strip them). */
  char *cmd2[] = { "/usr/bin/awk", " \' { print $0 } \'",  0 };
  execvp (cmd2[0], cmd2) ;

抛出的错误与您获得的错误相似

 awk:  ^ invalid char ''' in expression

转义 '(如 \') 或不转义对输出的结果没有任何影响

但是当上面的改为(单引号改为转义双引号)

/* Same call with the single quotes replaced by escaped double quotes; awk
   now receives the double-quote characters as part of the script text. */
char *cmd2[] = { "/usr/bin/awk", " \" { print $0 } \"",  0 };
execvp (cmd2[0], cmd2) ;

一切顺利。 (以上命令将回显您在 shell 中键入的任何内容)

所以我猜你需要解析 awk 的参数并查找单引号,然后创建一个新的 cmd 字符串,把这些引号替换为 \"。我猜你的错误之所以出现,是因为你把用户输入的参数原样("as is")传递,而没有进行(如上所述的)格式化。

希望对您有所帮助

您的 shell 应该在解析后去除引号。脚本周围的引号不是 Awk 语言的一部分;它们的目的是保护 Awk 脚本不被 shell 以任何方式解析。正确的最终结果是

/* Argument vector as it must look after the shell has parsed the line: the
   awk script is one argument and carries no surrounding quote characters. */
char *cmd[] = { "/usr/bin/awk", "{ print $0 }", NULL };

shell 的完整解析器需要处理递归结构,但带引号的字符串只需要对您的代码进行少量修改。基本上,在伪代码中

while token:
  if state == regular:
    if token.startswith("'"):
      state := single_quoted_string
      redo
    elsif token.startswith("\""):
      state := double_quoted_string
      redo
    # else
    push parsed, token
    token := next_token
  elsif state == single_quoted_string:
    end_quote := indexof("'")
    push parsed, substr(token+1, end_quote-1) # omit quotes
    token := end_quote + 1
    state := regular
  else:
    # double-quoted is similar but more complex