更新解析器以在带引号的字符串中接受括号

Update a parser to admit parentheses within quoted strings

我需要更新解析器以接受这些新功能,但我无法一次管理所有这些功能:

(看源代码示例更容易理解这些要求)

我目前的代码,包括检查,如下:

Godbolt 链接: https://godbolt.org/z/5d6o53n9h

#include <boost/fusion/adapted/struct/adapt_struct.hpp>
#include <boost/spirit/include/qi.hpp>

namespace script
{
    /// One line of the script after parsing: its classification plus the
    /// textual arguments that were captured for it.
    struct Command
    {
        /// Kind of line. Enumerators keep their implicit values 0..6.
        enum Type
        {
            NONE,
            WRITE_LOG,
            INSERT_LABEL,
            START_PROCESS,
            END_PROCESS,
            COMMENT,
            FAIL
        };

        Type type = NONE;              ///< A default-constructed command is NONE.
        std::vector<std::string> args; ///< Arguments in source order.
    };

    /// Result of parsing a whole script, one entry per line.
    using Commands = std::vector<Command>;
} // namespace script

// Adapt script::Command as a Boost.Fusion sequence with the members in the
// order listed here (type first, then args).
BOOST_FUSION_ADAPT_STRUCT(script::Command, type, args)

namespace script
{
    namespace qi = boost::spirit::qi;

    // Qi grammar that parses a complete script into Commands, producing one
    // Command per line. Lines are separated by eol and the whole input must be
    // consumed (eoi).
    template <typename It>
    class Parser : public qi::grammar<It, Commands()>
    {
    private:
        // Maps a command keyword to its Command::Type.
        qi::symbols<char, Command::Type> type;
        // Per-line rules; they skip blanks (qi::blank_type), never line breaks.
        qi::rule<It, Command(), qi::blank_type> none, command, comment, fail;//"fail" accepts any line, so it must be the last alternative tried
        qi::rule<It, Commands()> start;

    public:
        Parser() : Parser::base_type(start)
        {
            using namespace qi;//NOTE: "as_string" is necessary for every argument because args is a std::vector<std::string>
            // Attribute-only parser yielding an empty argument list.
            auto empty_args = copy(attr(std::vector<std::string>{}));

            type.add
                ("WriteLog", Command::WRITE_LOG)
                ("InsertLabel", Command::INSERT_LABEL)
                ("StartProcess", Command::START_PROCESS)
                ("EndProcess", Command::END_PROCESS);

            // Empty or blank-only line: the look-ahead checks for end of line /
            // end of input without consuming it.
            none = omit[*blank] >> &(eol | eoi)
                >> attr(Command::NONE)
                >> empty_args;//no arguments

            // keyword '(' arg {',' arg} ')'. An argument is any run of characters
            // other than '(' ')' ',' CR LF, so a parenthesis inside a quoted
            // argument breaks the match, and quote characters are kept as part
            // of the captured text.
            command = type >> '('
                >> as_string[lexeme[+~char_("(),\r\n")]] % ',' >> ')';

            // "//" followed by the rest of the line, stored as a single argument.
            comment = lit("//")
                >> attr(Command::COMMENT)
                >> as_string[lexeme[*~char_("\r\n")]];

            // Catch-all: swallow the rest of the line and flag it as FAIL.
            fail = omit[*~char_("\r\n")]
                >> attr(Command::FAIL)
                >> empty_args;//no arguments

            // Alternatives are tried left to right for every line.
            start = skip(blank)[(none | command | comment | fail) % eol] >> eoi;
        }
    };

    /// Parses the complete contents of `in` as a script.
    ///
    /// @param in  Stream to read; its skipws flag is cleared as a side effect.
    /// @return    One Command per input line.
    /// @throws std::runtime_error("command parse error") if the grammar does
    ///         not match.
    Commands parse(std::istream& in)
    {
        using StreamIt = boost::spirit::istream_iterator;
        static const Parser<StreamIt> grammar;

        // Whitespace must reach the grammar untouched: blanks and line breaks
        // are significant to it.
        in >> std::noskipws;

        StreamIt cursor(in);
        StreamIt const finish;

        Commands result;
        bool const matched = qi::parse(cursor, finish, grammar, result);
        if (!matched)
            throw std::runtime_error("command parse error");

        return result;
    }
}//namespace script

// Sample script parsed by main(): a comment, blank lines, well-formed calls,
// and two calls whose string argument is missing its closing quote.
std::stringstream ss{
R"(// just a comment

WriteLog("this is a log")
WriteLog("this is also (in another way) a log")
WriteLog("but this is just a fail)

StartProcess(17, "program.exe", True)
StartProcess(17, "this_is_a_fail.exe, True)
)"};

// Parses the sample script and reports whether the number of commands, their
// types and their argument counts match the expected values.
int main()
{
    using namespace script;

    try
    {
        auto const parsed = script::parse(ss);

        // Expected classification of every line, in input order.
        std::array const wanted_types{ Command::COMMENT, Command::NONE, Command::WRITE_LOG, Command::WRITE_LOG, Command::FAIL, Command::NONE, Command::START_PROCESS, Command::FAIL, Command::NONE };
        // Expected argument count of every line; -1 means "any number" and is
        // used for FAIL lines, whose arguments are irrelevant.
        std::array const wanted_argc{ 0, 0, 1, 1, -1, 0, 3, -1, 0 };

        auto const same_type = [](auto& cmd, auto& type) { return cmd.type == type; };
        auto const same_argc = [](auto& cmd, auto arg) { return arg == -1 || cmd.args.size() == arg; };

        bool const size_ok = (parsed.size() == 9);
        bool const types_ok = std::equal(parsed.begin(), parsed.end(),
                                         wanted_types.begin(), wanted_types.end(), same_type);
        bool const argc_ok = std::equal(parsed.begin(), parsed.end(),
                                        wanted_argc.begin(), wanted_argc.end(), same_argc);

        std::cout << std::boolalpha;
        std::cout << "size correct? " << size_ok << std::endl;
        std::cout << "types correct? " << types_ok << std::endl;
        std::cout << "arguments correct? " << argc_ok << std::endl;
    }
    catch (std::exception const& e)
    {
        std::cout << e.what() << "\n";
    }
}

如有任何帮助,我们将不胜感激。

你说你想在引用的字符串中使用括号。但是你甚至不支持带引号的字符串!

所以问题出在你的参数规则上。这甚至不存在。大概是这部分:

argument = +~char_("(),\r\n");
command = type >> '(' >> argument % ',' >> ')';

其中 argument 可能声明为

qi::rule<It, Argument()> argument;

事实上,以有组织的方式重写测试,这就是我们现在得到的:

Live On Compiler Explorer

static const Commands expected{
    {Command::COMMENT, {"just a comment"}},
    {Command::NONE, {}},
    {Command::WRITE_LOG, {"this is a log"}},
    {Command::WRITE_LOG, {"this is also (in another way) a log"}},
    {Command::FAIL, {}},
    {Command::NONE, {}},
    {Command::START_PROCESS, {"17", "program.exe", "True"}},
    {Command::FAIL, {}},
    {Command::NONE, {}},
};

try {
    auto parsed = script::parse(ss);
    fmt::print("Parsed all correct? {} -- {} parsed (vs. {} expected)\n",
               (parsed == expected), parsed.size(), expected.size());

    for (auto i = 0u; i < std::min(expected.size(), parsed.size()); ++i) {
        if (expected[i] != parsed[i]) {
            fmt::print("index #{} expected {}\n"
                       "          actual:  {}\n",
                       i, expected[i], parsed[i]);
        } else {
            fmt::print("index #{} CORRECT ({})\n", i, parsed[i]);
        }
    }
} catch (std::exception const& e) {
    fmt::print("Exception: {}\n", e.what());
}

输出:

Parsed all correct? false -- 9 parsed (vs. 9 expected)
index #0 CORRECT (Command(COMMENT, ["just a comment"]))
index #1 CORRECT (Command(NONE, []))
index #2 expected Command(WRITE_LOG, ["this is a log"])
          actual:  Command(WRITE_LOG, ["\"this is a log\""])
index #3 expected Command(WRITE_LOG, ["this is also (in another way) a log"])
          actual:  Command(FAIL, [])
index #4 expected Command(FAIL, [])
          actual:  Command(WRITE_LOG, ["\"but this is just a fail"])
index #5 CORRECT (Command(NONE, []))
index #6 expected Command(START_PROCESS, ["17", "program.exe", "True"])
          actual:  Command(START_PROCESS, ["17", "\"program.exe\"", "True"])
index #7 expected Command(FAIL, [])
          actual:  Command(START_PROCESS, ["17", "\"this_is_a_fail.exe", "True"])
index #8 CORRECT (Command(NONE, []))

如您所见,按照我的预期,原来的代码对带引号的字符串的处理同样是错误的。这是因为引号属于语言层面的语法结构。在 AST(解析结果)中,您并不关心它在代码里具体是怎么写的。例如,"hello\ world\041" 也可能等价于 "hello world!",所以两者都应该产生参数值 hello world!。

所以,让我们按照我们说的去做:

argument = quoted_string | number | boolean | raw_string;

我们可以添加一些规则:

// notice these are lexemes (no internal skipping):
qi::rule<It, Argument()> argument, quoted_string, number, boolean, raw_string;

并定义它们:

quoted_string = '"' >> *~char_('"') >> '"';
number        = raw[double_];
boolean       = raw[bool_];
raw_string    = +~char_("(),\r\n");
argument      = quoted_string | number | boolean | raw_string;

If you want to allow escaped quotes, use something like this:

 // A backslash followed by any character yields that character; note that the
 // backslash itself has to be escaped in the C++ character literal ('\\').
 quoted_string = '"' >> *('\\' >> char_ | ~char_('"')) >> '"';

现在,我想说您可能希望 Argument 类似于 variant<double, std::string, bool>,而不仅仅是 std::string

仅此更改,所有问题几乎都消失了:Live On Compiler Explorer:

Parsed all correct? false -- 9 parsed (vs. 9 expected)
index #0 CORRECT (Command(COMMENT, ["just a comment"]))
index #1 CORRECT (Command(NONE, []))
index #2 CORRECT (Command(WRITE_LOG, ["this is a log"]))
index #3 CORRECT (Command(WRITE_LOG, ["this is also (in another way) a log"]))
index #4 CORRECT (Command(FAIL, []))
index #5 CORRECT (Command(NONE, []))
index #6 CORRECT (Command(START_PROCESS, ["17", "program.exe", "True"]))
index #7 expected Command(FAIL, [])
          actual:  Command(START_PROCESS, ["17", "this_is_a_fail.exe, True)\n\"this_is_a_fail.exe", "True"])
index #8 CORRECT (Command(NONE, []))

现在,索引 #7 看起来非常古怪,但这实际上是 Spirit 中一个众所周知的现象¹。启用 BOOST_SPIRIT_DEBUG 可以看到:

  <argument>
    <try>"this_is_a_fail.exe,</try>
    <quoted_string>
      <try>"this_is_a_fail.exe,</try>
      <fail/>
    </quoted_string>
    <number>
      <try>"this_is_a_fail.exe,</try>
      <fail/>
    </number>
    <boolean>
      <try>"this_is_a_fail.exe,</try>
      <fail/>
    </boolean>
    <raw_string>
      <try>"this_is_a_fail.exe,</try>
      <success>, True)</success>
      <attributes>[[t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e, ,,  , T, r, u, e, ), ", t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e]]</attributes>
    </raw_string>
    <success>, True)</success>
    <attributes>[[t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e, ,,  , T, r, u, e, ), ", t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e]]</attributes>
  </argument>

因此,该字符串被接受为原始字符串,即使它以 " 开头。这很容易修复,但我们甚至不需要。我们可以只应用 qi::hold 来避免重复:

argument = qi::hold[quoted_string] | number | boolean | raw_string;

结果:

actual:  Command(START_PROCESS, ["17", "\"this_is_a_fail.exe", "True"])

但是,如果您预计它会失败,请解决其他问题:

raw_string    = +~char_("\"(),\r\n"); // note the \"

Note: In the off-chance you really only require it to not start with a quote:

raw_string    = !lit('"') >> +~char_("(),\r\n");

I guess by now you see the problem with a "loose rule" like that, so I don't recommend it.

You could express the requirement another way though, saying "if an argument starts with '"' then it MUST be a quoted_string". Use an expectation point there:

// Expectation points (>): once the opening quote has matched, a missing closing
// quote throws instead of backtracking. The backslash must be written '\\'.
quoted_string = '"' > *('\\' >> char_ | ~char_('"')) > '"';

This has the effect that failure to parse a complete quoted_string will throw an expectation_failed exception.

总结/清单

这就是我们最终得到的:

Live On Compiler Explorer

//#define BOOST_SPIRIT_DEBUG
#include <boost/fusion/adapted/struct/adapt_struct.hpp>
#include <boost/spirit/include/qi.hpp>
#include <fmt/ranges.h>

namespace script {
    // Arguments are stored as plain text; quotes are not part of the value.
    using Argument = std::string;
    using Arguments = std::vector<Argument>;

    // One parsed script line: its classification plus its arguments.
    struct Command {
        // Kind of line. The order matters: Formatter::name() indexes its
        // name table with these values.
        enum Type {
            NONE,
            WRITE_LOG,
            INSERT_LABEL,
            START_PROCESS,
            END_PROCESS,
            COMMENT,
            FAIL
        };

        Type      type{Type::NONE};
        Arguments args;

        // Member-wise comparison; also provides the == / != used to compare
        // parsed results against the expected ones.
        auto operator<=>(Command const&) const = default;
    };

    // Result of parsing a whole script, one entry per line.
    using Commands = std::vector<Command>;
} // namespace script

// Adapt script::Command as a Boost.Fusion sequence with the members in the
// order listed here (type first, then args).
BOOST_FUSION_ADAPT_STRUCT(script::Command, type, args)

namespace script {
    namespace qi = boost::spirit::qi;

    /// Qi grammar for the script format; yields one Command per input line.
    ///
    /// Every line is tried, in this order, as: blank line (NONE), keyword call
    /// with arguments, `//` comment, and finally FAIL, which accepts whatever
    /// is left on the line so that parsing can continue with the next one.
    template <typename It> class Parser : public qi::grammar<It, Commands()> {
    public:
        Parser() : Parser::base_type(start) {
            using namespace qi;

            // Keyword -> command type lookup.
            type.add //
                ("WriteLog",     Command::WRITE_LOG)     //
                ("InsertLabel",  Command::INSERT_LABEL)  //
                ("StartProcess", Command::START_PROCESS) //
                ("EndProcess",   Command::END_PROCESS);  //

            // Empty or blank-only line; the look-ahead does not consume the
            // line break, which is left for the `% eol` separator in `start`.
            none = omit[*blank] >> &(eol | eoi) //
                >> attr(Command{Command::NONE, {}});

            // Quotes delimit the value and are not stored. A backslash makes
            // the following character literal; the backslash itself must be
            // written '\\' in C++ source.
            // NOTE(review): ~char_('"') also matches CR/LF, so an unterminated
            // quote scans ahead to the next '"' on a later line before failing.
            quoted_string = '"' >> *('\\' >> char_ | ~char_('"')) >> '"';
            number        = raw[double_];
            boolean       = raw[bool_];
            // Unquoted text must not contain a quote, so a half-quoted argument
            // cannot be accepted as a raw string.
            raw_string    = +~char_("\"(),\r\n");
            // hold[] discards characters a failed quoted_string already
            // appended, so they are not duplicated in the attribute.
            argument = qi::hold[quoted_string] | number | boolean | raw_string;

            command = type >> '(' >> argument % ',' >> ')';

            // as_string turns the rest of the line into one single argument.
            comment = "//"                             //
                >> attr(Command::COMMENT)              //
                >> as_string[lexeme[*~char_("\r\n")]]; //

            // Catch-all: swallow the rest of the line and flag it as FAIL.
            fail = omit[*~char_("\r\n")] >> attr(Command{Command::FAIL, {}});

            line  = none | command | comment | fail; // keep fail last
            start = skip(blank)[line % eol] >> eoi;

            BOOST_SPIRIT_DEBUG_NODES((start)(line)(fail)(comment)(command)(
                argument)(none)(quoted_string)(raw_string)(boolean)(number))
        }

    private:
        qi::symbols<char, Command::Type>         type;
        // Line-level rules skip blanks (never line breaks) between tokens.
        qi::rule<It, Command(), qi::blank_type>  line, none, command, comment, fail;
        // notice these are lexemes (no internal skipping):
        qi::rule<It, Argument()> argument, quoted_string, number, boolean, raw_string;
        qi::rule<It, Commands()> start;
    };

    /// Parses the complete contents of `in` as a script.
    ///
    /// @param in  Stream to read; its skipws flag is cleared as a side effect
    ///            because blanks and line breaks are significant to the grammar.
    /// @return    One Command per input line.
    /// @throws std::runtime_error("command parse error") if the grammar does
    ///         not match.
    Commands parse(std::istream& in)
    {
        using It = boost::spirit::istream_iterator;
        static const Parser<It> parser;

        Commands commands;

        It first{in >> std::noskipws};
        if (!qi::parse(first, It{}, parser, commands))
            throw std::runtime_error("command parse error");

        // Returning the named local (rather than `cond ? commands : throw ...`,
        // which is an lvalue expression) allows NRVO / implicit move instead of
        // copying the whole vector.
        return commands;
    }

    /// Shared implementation behind fmt::formatter<script::Command>; renders a
    /// command as `Command(<TYPE>, <args>)`.
    struct Formatter {
        /// Printable name of a command type. The table is in the declaration
        /// order of Command::Type; `.at()` throws std::out_of_range for a value
        /// outside of it.
        static constexpr auto name(script::Command::Type type) {
            return std::array{"NONE",          "WRITE_LOG",   "INSERT_LABEL",
                            "START_PROCESS", "END_PROCESS", "COMMENT",
                            "FAIL"}
                .at(static_cast<int>(type));
        }

        /// No format-spec options are supported, so nothing is consumed.
        /// constexpr because {fmt} evaluates parse() at compile time when it
        /// checks format strings.
        constexpr auto parse(auto& ctx) const { return ctx.begin(); }

        /// Writes the command to the context's output iterator.
        // NOTE(review): format_to is unqualified and found through ADL;
        // consider qualifying it if <format> ever ends up included as well.
        auto format(script::Command const& cmd, auto& ctx) const {
            return format_to(ctx.out(), "Command({}, {})", name(cmd.type), cmd.args);
        }
    };
} // namespace script

// Make script::Command printable with {fmt} by reusing script::Formatter.
template <> struct fmt::formatter<script::Command> : script::Formatter {};

// Sample script parsed by main(): a comment, blank lines, well-formed calls,
// and two calls whose string argument is missing its closing quote. The
// command lines are indented with four blanks inside the raw string.
std::stringstream ss{
    R"(// just a comment

    WriteLog("this is a log")
    WriteLog("this is also (in another way) a log")
    WriteLog("but this is just a fail)

    StartProcess(17, "program.exe", True)
    StartProcess(17, "this_is_a_fail.exe, True)
    )"};

// Parses the sample script and prints, entry by entry, whether the result
// matches the expected commands.
int main() {
    using namespace script;

    // Reference result: one entry per line of the sample script.
    static const Commands expected{
        {Command::COMMENT, {"just a comment"}},
        {Command::NONE, {}},
        {Command::WRITE_LOG, {"this is a log"}},
        {Command::WRITE_LOG, {"this is also (in another way) a log"}},
        {Command::FAIL, {}},
        {Command::NONE, {}},
        {Command::START_PROCESS, {"17", "program.exe", "True"}},
        {Command::FAIL, {}},
        {Command::NONE, {}},
    };

    try {
        auto const parsed = script::parse(ss);

        bool const all_match = (parsed == expected);
        fmt::print("Parsed all correct? {} -- {} parsed (vs. {} expected)\n",
                   all_match, parsed.size(), expected.size());

        // Only the entries present in both lists can be compared one by one.
        auto const comparable = std::min(expected.size(), parsed.size());
        for (auto idx = 0u; idx < comparable; ++idx) {
            auto const& want = expected[idx];
            auto const& got  = parsed[idx];

            if (want == got) {
                fmt::print("index #{} CORRECT ({})\n", idx, got);
                continue;
            }

            fmt::print("index #{} expected {}\n"
                       "          actual:  {}\n",
                       idx, want, got);
        }
    } catch (std::exception const& e) {
        fmt::print("Exception: {}\n", e.what());
    }
}

输出:

Parsed all correct? true -- 9 parsed (vs. 9 expected)
index #0 CORRECT (Command(COMMENT, ["just a comment"]))
index #1 CORRECT (Command(NONE, []))
index #2 CORRECT (Command(WRITE_LOG, ["this is a log"]))
index #3 CORRECT (Command(WRITE_LOG, ["this is also (in another way) a log"]))
index #4 CORRECT (Command(FAIL, []))
index #5 CORRECT (Command(NONE, []))
index #6 CORRECT (Command(START_PROCESS, ["17", "program.exe", "True"]))
index #7 CORRECT (Command(FAIL, []))
index #8 CORRECT (Command(NONE, []))

¹ 参见示例boost::spirit alternative parsers return duplicates(链接到另外三个同类)