将 shell 引用的字符串解析为与 execv 兼容的参数向量

Parse shell quoted string into execv compatible argument vector

假设我有如下字符串

echo "foo" "bar\"blub""baz" "'" "\"" foo\ bar "\\" '\'' """"       Lots" "of\ whitespace

现在我想通过调用 Unix.execv 来执行上面的字符串,效果就像把它直接输入到 shell 中一样。如果我没有弄错,shell 会把上面的内容解析为如下 OCaml 列表:

["echo"; "foo"; "bar\"blubbaz"; "'"; "\""; "foo bar"; "\\"; "'"; ""; "Lots of whitespace"]

有哪个库可以把原始字符串转换成上面这样解析后的列表?

最终我想把结果列表交给Unix.execvpe。还有 Unix.open_process_full 可以使用 /bin/sh 处理我的原始字符串,但我发现在不使用 /bin/sh 的情况下直接调用外部程序时,我的应用程序快了 16%。现在我希望能够接受更多的输入字符串,包括引号和转义符。

我必须自己编写解析器吗?

POSIX 提供了 wordexp 函数,但封装该函数并不能解决我的问题,因为 wordexp 做的事情比我想要的更多(命令替换、glob 展开、波浪号(tilde)展开以及环境变量替换)。

我只想解决引号和转义问题。

我用 ocamllex 写了一个解决这个问题的方案,贴在这里供有类似需求的人参考。它应该很容易扩展,以支持当前尚未支持的其他转义字符和 shell 特性。

{
  (* Raised when a backslash is followed by a character that the lexer
     does not accept as an escape; carries the two-character lexeme. *)
  exception UnknownShellEscape of string

  (* Raised when a character cannot be matched by any rule of the
     current lexer state; carries that character. *)
  exception UnmatchedChar of char

  (* Return a fresh buffer whose contents are initialised to [str]. *)
  let buf_from_str str =
    let fresh = Buffer.create 16 in
    let () = Buffer.add_string fresh str in
    fresh
}

(* A run of characters that may appear in a bare word: anything except
   the two quote characters, the backslash and the separators below. *)
let safechars = [^ '"' '\'' '\\' ' ' '\t' '\n']+
(* A run of word separators.  Newline is included so that the trailing
   newline of the input does not become part of the last argument. *)
let space = [ ' ' '\t' '\n' ]+

(* Entry state, used between words.
   argv holds the words completed so far, most recent first.
   Separators are skipped; the first character of a word selects the
   state that will collect it (unquoted, double quoted, single quoted).
   At end of input the accumulated words are returned in input order.
   Raises UnknownShellEscape for a backslash followed by an unsupported
   character and UnmatchedChar for any other unmatched character. *)
rule shell_command argv = parse
 | space         { shell_command argv lexbuf }
 | safechars     { uquote argv (buf_from_str (Lexing.lexeme lexbuf)) lexbuf }
 | '\\' '"'      { uquote argv (buf_from_str "\"") lexbuf }
 | '\\' '\''     { uquote argv (buf_from_str "'") lexbuf }
 | '\\' '\\'     { uquote argv (buf_from_str "\\") lexbuf }
 | '\\' ' '      { uquote argv (buf_from_str " ") lexbuf }
 | '\\' _ as c   { raise (UnknownShellEscape c) }
 | '"'           { dquote argv (Buffer.create 16) lexbuf }
 | '\''          { squote argv (Buffer.create 16) lexbuf }
 | _ as c        { raise (UnmatchedChar c) }
 | eof           { List.rev argv }
(* Unquoted state: buf holds the part of the current word read so far.
   A separator or end of input finishes the word and pushes it on argv.
   A backslash may escape a double quote, a single quote, a backslash
   or a space; any other escape raises UnknownShellEscape.
   A quote character switches to the matching quoted state while
   keeping the same buffer, so adjacent pieces join into one word. *)
and uquote argv buf = parse
 | (space|eof) { shell_command ((Buffer.contents buf)::argv) lexbuf }
 | '\\' '"'    { Buffer.add_string buf "\""; uquote argv buf lexbuf }
 | '\\' '\''   { Buffer.add_string buf "'"; uquote argv buf lexbuf }
 | '\\' '\\'   { Buffer.add_string buf "\\"; uquote argv buf lexbuf }
 | '\\' ' '    { Buffer.add_string buf " "; uquote argv buf lexbuf }
 | '\\' _ as c { raise (UnknownShellEscape c) }
 | '"'         { dquote argv buf lexbuf }
 | '\''        { squote argv buf lexbuf }
 | safechars   { Buffer.add_string buf (Lexing.lexeme lexbuf); uquote argv buf lexbuf }
 | _ as c      { raise (UnmatchedChar c) }
(* Double quoted state: buf holds the part of the current word read so
   far.  The closing quote either ends the word (when followed by a
   separator or end of input), reopens a quoted section immediately, or
   falls back to the unquoted state.
   Only an escaped double quote and an escaped backslash are accepted
   here; any other escape raises UnknownShellEscape.
   Reaching end of input before the closing quote raises UnmatchedChar
   carrying the double quote character. *)
and dquote argv buf = parse
 | '"' (space|eof) { shell_command ((Buffer.contents buf)::argv) lexbuf }
 | '"' '"'         { dquote argv buf lexbuf }
 | '"' '\''        { squote argv buf lexbuf }
 | '"'             { uquote argv buf lexbuf }
 | '\\' '"'        { Buffer.add_string buf "\""; dquote argv buf lexbuf }
 | '\\' '\\'       { Buffer.add_string buf "\\"; dquote argv buf lexbuf }
 | '\\' _ as c     { raise (UnknownShellEscape c) }
 | [^ '"' '\\' ]+  { Buffer.add_string buf (Lexing.lexeme lexbuf); dquote argv buf lexbuf }
 | _ as c          { raise (UnmatchedChar c) }
 | eof             { raise (UnmatchedChar '"') }
(* Single quoted state: buf holds the part of the current word read so
   far.  Every character up to the closing quote is taken literally;
   no escapes are recognised.  The closing quote either ends the word
   (when followed by a separator or end of input), reopens a quoted
   section immediately, or falls back to the unquoted state.
   Reaching end of input before the closing quote raises UnmatchedChar
   carrying the single quote character. *)
and squote argv buf = parse
 | '\'' (space|eof) { shell_command ((Buffer.contents buf)::argv) lexbuf }
 | '\'' '\''        { squote argv buf lexbuf }
 | '\'' '"'         { dquote argv buf lexbuf }
 | '\''             { uquote argv buf lexbuf }
 | [^ '\'' ]+       { Buffer.add_string buf (Lexing.lexeme lexbuf); squote argv buf lexbuf }
 | eof              { raise (UnmatchedChar '\'') }

{
  (* Lex the file named by the first command line argument, or standard
     input when no argument is given, and print each resulting word on
     its own line. *)
  let main () =
    let source =
      if Array.length Sys.argv <= 1
      then stdin
      else open_in Sys.argv.(1)
    in
    let words = shell_command [] (Lexing.from_channel source) in
    List.iter (fun word -> Printf.printf "%s\n" word) words

  (* Printexc.print reports any uncaught exception before re-raising it. *)
  let () = Printexc.print main ()
}

试一试运行:

$ ocamllex test.mll
$ echo 'echo "foo" "bar\"blub""baz" "'\''" "\"" foo\ bar '\
> '"\\" """"'\'''\'''\'''\''""       Lots" "of\ whitespace' \
> | ocaml test.ml
echo
foo
bar"blubbaz
'
"
foo bar
\

Lots of whitespace

成功! \o/