如何在 boost::spirit::qi 中将某些语义动作排除在 AST 之外

Question

我尝试使用 boost::spirit::qi 解析大量文件。解析不是问题，但有些文件包含我想跳过的噪音。构建一个简单的解析器（不使用 boost::spirit::qi）验证我可以通过跳过行首不匹配规则的任何内容来避免噪音。因此，我正在寻找一种方法来编写基于行的解析器，该解析器在不匹配任何规则时跳过行。

下面的示例允许语法在完全不匹配的情况下跳过行，但是 'junk' 规则仍然插入一个空的 V() 实例，这是不希望的行为。在示例中使用 \r 而不是 \n 是有意的，因为我在文件中遇到了 \n、\r 和 \r\n。

#include <iostream>
#include <string>
#include <vector>
#include <boost/foreach.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/fusion/include/std_tuple.hpp>

namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
namespace phx = boost::phoenix;

// One parsed record: a tag string followed by three doubles.
using V = std::tuple<std::string, double, double, double>;

namespace client {
    // Grammar from the question. It is skipped with ascii::space and synthesizes a
    // std::vector<V>. It is kept exactly as posted because it demonstrates the problem:
    // per the question text and the posted output, a line matched by `junk` still
    // contributes a default-constructed V to the result.
    template <typename Iterator>
    struct VGrammar : qi::grammar<Iterator, std::vector<V>(), ascii::space_type> {
        VGrammar() : VGrammar::base_type(start) {
            using namespace qi;

            // A record: the literal "v" followed by three doubles. `>` is Qi's
            // expectation operator, so a missing double after "v" is reported as an
            // error instead of letting the alternative in `start` backtrack.
            v %= string("v") > double_ > double_ > double_;
            // One or more characters up to, but not including, an end-of-line.
            junk = +(char_ - eol);
            // One or more records or junk runs. `junk` has no attribute, yet the
            // alternative still yields an element for it (the unwanted empty V).
            start %= +(v | junk);

            // Rule names are what the error handler / debug output print.
            v.name("v");
            junk.name("junk");
            start.name("start");

            using phx::val;
            using phx::construct;

            // On an expectation failure inside `start`, print what was expected
            // (qi::_4) and the input text from the error position (qi::_3) to the
            // end of input (qi::_2), then make the parse fail.
            on_error<fail>(
                start,
                std::cout
                    << val("Error! Expecting \n\n'")
                    << qi::_4
                    << val("'\n\n here: \n\n'")
                    << construct<std::string>(qi::_3, qi::_2)
                    << val("'")
                    << std::endl
            );

            //debug(v);
            //debug(junk);
            //debug(start);
        }

        // Attribute-less, skipper-less rule for noise.
        qi::rule<Iterator> junk;
        //qi::rule<Iterator, qi::unused_type()> junk; // Doesn't work either
        //qi::rule<Iterator, qi::unused_type(), qi::unused_type()> junk; // Doesn't work either
        // Both attribute-carrying rules use the ascii::space skipper. The answers in this
        // file point out that this skipper also consumes the line breaks.
        qi::rule<Iterator, V(), ascii::space_type> v;
        qi::rule<Iterator, std::vector<V>(), ascii::space_type> start;
    };
} // namespace client

int main(int argc, char* argv[]) {
    using iterator_type = std::string::const_iterator;

    std::string input = "";
    input += "v 1 2 3\r";         // keep v 1 2 3
    input += "o a b c\r";         // parse as junk
    input += "v 4 5 6 v 7 8 9\r"; // keep v 4 5 6, but parse v 7 8 9 as junk
    input += "   v 10 11 12\r\r"; // parse as junk

    iterator_type iter = input.begin();
    const iterator_type end = input.end();
    std::vector<V> parsed_output;
    client::VGrammar<iterator_type> v_grammar;

    std::cout << "run" << std::endl;
    bool r = phrase_parse(iter, end, v_grammar, ascii::space, parsed_output);
    std::cout << "done ... r: " << (r ? "true" : "false") << ", iter==end: " << ((iter == end) ? "true" : "false") << std::endl;

    if (r && (iter == end)) {
        BOOST_FOREACH(V const& v_row, parsed_output) {
            std::cout << std::get<0>(v_row) << ", " << std::get<1>(v_row) << ", " << std::get<2>(v_row) << ", " << std::get<3>(v_row) << std::endl;
        }
    }

    return EXIT_SUCCESS;
}

这是示例的输出：

run
done ... r: true, iter==end: true
v, 1, 2, 3
, 0, 0, 0
v, 4, 5, 6
v, 7, 8, 9
v, 10, 11, 12

这才是我真正希望解析器返回的结果：

run
done ... r: true, iter==end: true
v, 1, 2, 3
v, 4, 5, 6

我现在的主要问题是防止 'junk' 规则添加一个空的 V() 对象。我该如何做到这一点？还是我想多了？

我已经尝试将 lit(junk) 添加到 start 规则中，因为 lit() 不会返回任何东西，但这无法编译。它失败并提示："static assertion failed: error_invalid_expression"。

我也曾尝试将 junk 规则的语义动作设置为 qi::unused_type()，但在这种情况下该规则仍然会创建一个空的 V()。

我知道以下问题，但它们没有解决这个特定问题。我之前已经尝试过 comment skipper，但看起来我必须在 skipper 中重新实现所有解析规则才能识别噪音。我的例子的灵感来自下面最后一个链接中的解决方案：

How to skip line/block/nested-block comments in Boost.Spirit?

How to parse entries followed by semicolon or newline (boost::spirit)?

版本信息：

Linux debian 4.9.0-7-amd64 #1 SMP Debian 4.9.110-3+deb9u2 (2018-08-13) x86_64 GNU/Linux
g++ (Debian 6.3.0-18+deb9u1) 6.3.0 20170516
#define BOOST_VERSION 106200

和：

Linux raspberrypi 4.14.24-v7+ #1097 SMP Mon Mar 5 16:42:05 GMT 2018 armv7l GNU/Linux
g++ (Raspbian 4.9.2-10+deb8u1) 4.9.2
#define BOOST_VERSION 106200

对于那些想知道的人：是的，我正在尝试解析类似于 Wavefront OBJ 文件的文件，并且我知道已经有很多可用的解析器。然而，我正在解析的数据是一个更大的数据结构的一部分，它也需要解析，所以构建一个新的解析器确实有意义。

Answer 1

你想要实现的是错误恢复。

不幸的是，Spirit 没有很好的方法来做到这一点（也有一些内部决策很难在外部做出）。但是，在您的情况下，通过语法重写很容易实现。

#include <iostream>
#include <string>
#include <vector>
#include <boost/foreach.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/fusion/include/std_tuple.hpp>

namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
namespace phx = boost::phoenix;

// One parsed record: a tag string followed by three doubles.
using V = std::tuple<std::string, double, double, double>;

namespace client {
    // Line-oriented rewrite of the grammar. It has no grammar-level skipper, so
    // end-of-line characters stay visible to the rules and can act as separators.
    template <typename Iterator>
    struct VGrammar : qi::grammar<Iterator, std::vector<V>()> {
        VGrammar() : VGrammar::base_type(start) {
            using namespace qi;

            // A record, parsed under a local `blank` skipper. The "v" tag itself is
            // wrapped in no_skip, so leading whitespace before it is not skipped.
            v = skip(blank)[no_skip[string("v")] > double_ > double_ > double_];
            // One or more characters up to, but not including, an end-of-line.
            junk = +(char_ - eol);
            // eol-separated lines; each line is a record and/or optional junk
            // (`||` is Qi's sequential-or operator).
            start = (v || -junk) % eol;

            // Rule names are what the error handler / debug output print.
            v.name("v");
            junk.name("junk");
            start.name("start");

            using phx::val;
            using phx::construct;

            // On an expectation failure inside `start`, print what was expected
            // (qi::_4) and the input text from the error position (qi::_3) to the
            // end of input (qi::_2), then make the parse fail.
            on_error<fail>(
                start,
                std::cout
                << val("Error! Expecting \n\n'")
                << qi::_4
                << val("'\n\n here: \n\n'")
                << construct<std::string>(qi::_3, qi::_2)
                << val("'")
                << std::endl
                );

            //debug(v);
            //debug(junk);
            //debug(start);
        }

        // Attribute-less rule for noise.
        qi::rule<Iterator> junk;
        //qi::rule<Iterator, qi::unused_type()> junk; // Doesn't work either
        //qi::rule<Iterator, qi::unused_type(), qi::unused_type()> junk; // Doesn't work either
        // No skipper in the rule signatures; `v` installs its own via skip(blank).
        qi::rule<Iterator, V()> v;
        qi::rule<Iterator, std::vector<V>()> start;
    };
} // namespace client

int main(int argc, char* argv[]) {
    using iterator_type = std::string::const_iterator;

    std::string input = "";
    input += "v 1 2 3\r";         // keep v 1 2 3
    input += "o a b c\r";         // parse as junk
    input += "v 4 5 6 v 7 8 9\r"; // keep v 4 5 6, but parse v 7 8 9 as junk
    input += "   v 10 11 12\r\r"; // parse as junk

    iterator_type iter = input.begin();
    const iterator_type end = input.end();
    std::vector<V> parsed_output;
    client::VGrammar<iterator_type> v_grammar;

    std::cout << "run" << std::endl;
    bool r = parse(iter, end, v_grammar, parsed_output);
    std::cout << "done ... r: " << (r ? "true" : "false") << ", iter==end: " << ((iter == end) ? "true" : "false") << std::endl;

    if (r && (iter == end)) {
        BOOST_FOREACH(V const& v_row, parsed_output) {
            std::cout << std::get<0>(v_row) << ", " << std::get<1>(v_row) << ", " << std::get<2>(v_row) << ", " << std::get<3>(v_row) << std::endl;
        }
    }

    return EXIT_SUCCESS;
}

Answer 2

> 我已经尝试将 lit(junk) 添加到 start 规则中，因为 lit() 不会返回任何东西，但这无法编译。它失败并提示："static assertion failed: error_invalid_expression"。

你要找的其实是 omit[junk]，但它不会带来任何区别，因为合成属性仍然会是 optional<>。

修理东西

首先，你需要让换行符具有意义。这意味着你不能把 space 用作 skipper，因为它会吃掉换行符。更糟糕的是，你还需要让前导空白也具有意义（例如，这样才能把最后一行当作垃圾丢弃）。这样一来，你甚至不能使用 qi::blank 作为 skipper。（参见 Boost spirit skipper issues）。

为了仍然能在 v 规则内部允许空白，只需要使用一个局部的 skipper（它不会吃掉换行符）：
```
// Look ahead for a literal "v" at the current position (so no leading blanks are
// accepted), then parse the record under a local `blank` skipper.
v %= &lit("v") >> skip(blank) [ string("v") > double_ > double_ > double_ ];
```
只有在确认没有意外的前导空白之后，它才会启用 skipper。

请注意，string("v") 这种方式有点多余，但这将我们引向第二个动机：
其次，我和你一样，倾向于避免使用语义动作。然而，这意味着你必须让规则反映你的数据结构。

在这个具体案例中，这意味着你可能应该把“跳过行”的逻辑反过来表达。如果把语法表达为 v 的直接重复，中间穿插的是 /任意内容/ 而不仅仅是 /换行/，会怎样？我会这样写：
```
// Rest of a line: zero or more characters up to, but not including, eol.
junk = *(char_ - eol);
// A line on which `v` does not match at the start (negative lookahead).
other = !v >> junk;

// Zero or more records; each is followed by trailing junk and then by one or more
// eol, with `other` lines allowed between consecutive eol.
start = *(v >> junk >> eol % other);
```
请注意
- 分隔符表达式现在本身也使用了 operator%（列表运算符）：(eol % other)。它巧妙地实现了这样的效果：只要换行符之间仅由 "other" 行（此处即任何不匹配 v 的内容，!v）隔开，它就会继续吃掉换行符。
- other 比 junk 更受限制，因为 junk 可能会吃掉 v，而 other 确保永远不会发生
- 因此 v >> junk 允许正确处理示例的第三行（具有 v 4 5 6 v 7 8 9\r 的行）

现在一切正常：Live On Coliru:

run
done ... r: true, iter==end: true
v, 1, 2, 3
v, 4, 5, 6

完善它

你可能已经意识到，上面的写法无法处理第一行不是 v 行的情况。让我们把这种情况加入示例，并确保它也能正常工作：

Live On Coliru:

//#define BOOST_SPIRIT_DEBUG
#include <iostream>
#include <string>
#include <vector>
#include <boost/foreach.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/fusion/include/std_tuple.hpp>

namespace qi = boost::spirit::qi;
namespace phx = boost::phoenix;

// One parsed record: a tag string followed by three doubles.
using V = std::tuple<std::string, double, double, double>;

namespace client {
    // Final grammar: a sequence of `v` records with arbitrary non-record lines in
    // between, including before the first record. It has no grammar-level skipper,
    // so line breaks and leading whitespace stay significant.
    template <typename Iterator>
    struct VGrammar : qi::grammar<Iterator, std::vector<V>()> {
        VGrammar() : VGrammar::base_type(start) {
            using namespace qi;

            // Look ahead for a literal "v" at the current position (so no leading
            // blanks are accepted), then parse the record under a local `blank` skipper.
            v %= &lit("v") >> skip(blank) [ string("v") > double_ > double_ > double_ ];
            // Rest of a line: zero or more characters up to, but not including, eol.
            junk = *(char_ - eol);
            // A line on which `v` does not match at the start (negative lookahead).
            other = !v >> junk;

            // Leading non-record line(s) first, then zero or more records. Each record
            // is followed by trailing junk and by one or more eol, with `other` lines
            // allowed between consecutive eol.
            start = 
                other >> eol % other >>
                *(v >> junk >> eol % other);

            // Names these rules and enables debug output for them when
            // BOOST_SPIRIT_DEBUG is defined. `other` is not in the list.
            BOOST_SPIRIT_DEBUG_NODES((v)(junk)(start))

            // On an expectation failure inside `start`, print what was expected
            // (qi::_4) and the input text from the error position (qi::_3) to the
            // end of input (qi::_2), then make the parse fail.
            on_error<fail>(
                start,
                std::cout
                    << phx::val("Error! Expecting \n\n'") << qi::_4
                    << "'\n\n here: \n\n'" << phx::construct<std::string>(qi::_3, qi::_2)
                    << "'\n"
            );
        }

      private:
        // Attribute-less rules for noise.
        qi::rule<Iterator> other, junk;
        qi::rule<Iterator, V()> v;
        qi::rule<Iterator, std::vector<V>()> start;
    };
} // namespace client

int main() {
    using iterator_type = std::string::const_iterator;

    std::string input = "";
    input += "o a b c\r";         // parse as junk
    input += "v 1 2 3\r";         // keep v 1 2 3
    input += "o a b c\r";         // parse as junk
    input += "v 4 5 6 v 7 8 9\r"; // keep v 4 5 6, but parse v 7 8 9 as junk
    input += "   v 10 11 12\r\r"; // parse as junk

    iterator_type iter = input.begin();
    const iterator_type end = input.end();
    std::vector<V> parsed_output;
    client::VGrammar<iterator_type> v_grammar;

    std::cout << "run" << std::endl;
    bool r = parse(iter, end, v_grammar, parsed_output);
    std::cout << "done ... r: " << (r ? "true" : "false") << ", iter==end: " << ((iter == end) ? "true" : "false") << std::endl;

    if (iter != end)
        std::cout << "Remaining unparsed: '" << std::string(iter, end) << "'\n";

    if (r) {
        BOOST_FOREACH(V const& v_row, parsed_output) {
            std::cout << std::get<0>(v_row) << ", " << std::get<1>(v_row) << ", " << std::get<2>(v_row) << ", " << std::get<3>(v_row) << std::endl;
        }
    }

    return EXIT_SUCCESS;
}

如何在 boost::spirit::qi 中将某些语义动作排除在 AST 之外

How can I keep certain semantic actions out of the AST in boost::spirit::qi

c++

boost-spirit

boost-spirit-qi

修理东西

完善它