如何在 Boost::Spirit::Lex 中使用指针作为 token 属性?

How to use a pointer as token attribute in Boost::Spirit::Lex?

我写了一个最小的例子来演示这个问题。它解析嵌套的数字列表,如 (1 2 3 (4 5) (6 (7 (8))))。我使用 spirit::lex 解析数字,使用 spirit::qi 解析列表,所以我这样编码:

using TokenTypes = boost::mpl::vector<Object*>;
using Iterator = std::string::iterator;

// Lexer for the nested number-list language. Tokens are built over
// std::string iterators and may carry an Object* attribute (see TokenTypes).
class Lexer : public lex::lexer<actor_lexer<token<Iterator, TokenTypes>>>
{
public:
  lex::token_def<> spaces;        // whitespace token without attribute; used to skip spaces
  lex::token_def<Object*> number; // number token; a Number is created on the heap and its pointer is the attribute

public:
  // Defined out of line; expected to register the token definitions.
  Lexer();
};

template<typename... Ts>
using Rule = qi::rule<Lexer::iterator_type, Ts...>;

// Qi grammar that consumes the tokens produced by Lexer and is declared with
// an Object* attribute.
class Parser : public qi::grammar<Lexer::iterator_type, Object*>
{
public:
  // The lexer instance whose token definitions the grammar refers to.
  Lexer lexer;

  Rule<Object*> list; // a parenthesised list of elements
  Rule<Object*> elem; // a single element: either a list or a number token

public:
  // Defined out of line; wires the rules together.
  Parser();
};

但是在Parser::Parser()中,我不能在语法表达式中使用Lexer::number:

// Builds the grammar with `elem` as the start rule. This excerpt only shows
// the statement that triggers the compile error quoted below.
Parser::Parser()
  : base_type(elem)
{
  // list = ... (definition omitted in this excerpt)

  // Using the Object*-typed token definition directly inside the Qi
  // expression is what the compiler rejects.
  elem %= list | lexer.number; // fails to compile!
}

Clang 错误消息(简短):

/usr/include/boost/spirit/home/qi/detail/assign_to.hpp:42:36: error: type 'Object *' cannot be used prior to '::' because it has no members
          : is_iter_range<typename C::value_type> {};
                                   ^
...
...
...

我不明白为什么这样会出错,因为当我使用其他标量类型(如 int 和 double)作为 token 属性时,它一直工作正常。

那么,如何使用指针类型作为token属性呢?

完整示例

#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/qi.hpp>
#include <iostream>
#include <string>
#include <vector>

// Abstract base class of every value the parser can produce.
class Object
{
public:
  // Virtual so that derived objects can be destroyed through an Object*.
  virtual ~Object() = default;

  // Writes the textual form of this object to `out`.
  virtual void print(std::ostream& out) = 0;
};

class Number : public Object
{
public:
  int64_t _val;

public:
  virtual void print(std::ostream& out) override { out << _val; }
};

class List : public Object
{
public:
  std::vector<Object*> _objs;

public:
  virtual void print(std::ostream& out) override
  {
    out << '(';
    for (auto&& i : _objs) {
      i->print(out);
      out << ' ';
    }
    out << ')';
  }
};

namespace qi = boost::spirit::qi;
namespace fu = boost::fusion;
namespace lex = boost::spirit::lex;

using lex::lexertl::actor_lexer;
using lex::lexertl::token;

using TokenTypes = boost::mpl::vector<Object*>;
using Iterator = std::string::iterator;

// Lexer for the nested number-list language. Tokens are built over
// std::string iterators and may carry an Object* attribute (see TokenTypes).
class Lexer : public lex::lexer<actor_lexer<token<Iterator, TokenTypes>>>
{
public:
  // Whitespace token without attribute; ignored by its semantic action.
  lex::token_def<> spaces;
  // Number token whose attribute is a pointer to a heap-allocated Number.
  lex::token_def<Object*> number;

public:
  // Registers the patterns and semantic actions (defined below).
  Lexer();
};

template<typename... Ts>
using Rule = qi::rule<Lexer::iterator_type, Ts...>;

// Qi grammar over the lexer's token stream, declared with an Object*
// attribute.
class Parser : public qi::grammar<Lexer::iterator_type, Object*>
{
public:
  // The lexer whose token definitions the rules refer to; also passed to
  // lex::tokenize_and_parse by main().
  Lexer lexer;

  // A parenthesised list; the List* local holds the list under construction.
  Rule<Object*, qi::locals<List*>> list;
  // A single element: a nested list or a number token.
  Rule<Object*> elem;

public:
  // Defines the rules (see below).
  Parser();
};

// Registers all token definitions: the two parenthesis characters, ignored
// whitespace, and numbers that are converted to heap-allocated Number
// objects inside the lexer's semantic action.
Lexer::Lexer()
{
  // Single-character tokens for the list delimiters.
  self += '(';
  self += ')';

  // Whitespace is matched, but the action marks the match as pass_ignore.
  spaces = R"(\s+)";
  self +=
    spaces[([](auto& start, auto& end, auto& matched, auto& id, auto& ctx) {
      matched = lex::pass_flags::pass_ignore;
    })];

  // A run of digits becomes a Number whose pointer is the token value.
  number = R"(\d+)";
  self +=
    number[([](auto& start, auto& end, auto& matched, auto& id, auto& ctx) {
      auto val = new Number();
      // Parse from a copy so the lexer's own `start` iterator is not advanced.
      auto iter = start;
      // The result of qi::parse is not checked here.
      qi::parse(iter, end, qi::long_long, val->_val);
      ctx.set_value(val);
    })];
}

// Defines the grammar with `elem` as the start rule:
//   list := '(' elem* ')'
//   elem := list | number
// The semantic actions build a heap-allocated List by hand.
Parser::Parser()
  : base_type(elem)
{
  // The lambdas use Qi's (attribute, context, pass) action signature.
  list = (         //
    qi::lit('(')[( //
      [](auto& attr, auto& ctx, bool& pass) {
        // Opening parenthesis: allocate the list and keep it in the rule local.
        fu::at_c<0>(ctx.locals) = new List();
      })]       //
    >> *(elem[( //
         [](auto& attr, auto& ctx, bool& pass) {
           // Each parsed element is appended to the list held in the local.
           List* list = fu::at_c<0>(ctx.locals);
           list->_objs.push_back(attr);
         })]) //
    >> ')'    //
    )[(       //
    [](auto& attr, auto& ctx, bool& pass) {
      // Whole list matched: store the list in the first element of
      // ctx.attributes (in Spirit.Qi this is the rule's result attribute).
      List* list = fu::at_c<0>(ctx.locals);
      fu::at_c<0>(ctx.attributes) = list;
    })];

  // This is the statement the question reports as failing to compile.
  elem %= list | lexer.number;
}

int
main(int argc, char* argv[])
{
  Parser parser;

  std::string line;
  while (std::getline(std::cin, line)) {
    auto begin = line.begin();
    Object* obj;
    lex::tokenize_and_parse(begin, line.end(), parser.lexer, parser, obj);
    obj->print(std::cout);
    std::cout << std::endl;
  }
}

找到解决方法:使用 std::size_t 和 reinterpret_cast 替换指针类型:

#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/qi.hpp>
#include <iostream>
#include <string>
#include <vector>

// Abstract base class of every value the parser can produce.
class Object
{
public:
  // Virtual so that derived objects can be destroyed through an Object*.
  virtual ~Object() = default;

  // Writes the textual form of this object to `out`.
  virtual void print(std::ostream& out) = 0;
};

class Number : public Object
{
public:
  int64_t _val;

public:
  virtual void print(std::ostream& out) override { out << _val; }
};

class List : public Object
{
public:
  std::vector<Object*> _objs;

public:
  virtual void print(std::ostream& out) override
  {
    out << '(';
    for (auto&& i : _objs) {
      i->print(out);
      out << ' ';
    }
    out << ')';
  }
};

namespace qi = boost::spirit::qi;
namespace fu = boost::fusion;
namespace lex = boost::spirit::lex;

using lex::lexertl::actor_lexer;
using lex::lexertl::token;

using TokenTypes = boost::mpl::vector<std::size_t>;
using Iterator = std::string::iterator;

// Lexer for the workaround: tokens carry a std::size_t attribute instead of
// an Object* (see TokenTypes).
class Lexer : public lex::lexer<actor_lexer<token<Iterator, TokenTypes>>>
{
public:
  // Whitespace token without attribute; ignored by its semantic action.
  lex::token_def<> spaces;
  // Number token; the std::size_t attribute holds a pointer value that the
  // parser casts back to Object*.
  lex::token_def<std::size_t> number; // use std::size_t instead

public:
  // Registers the patterns and semantic actions (defined below).
  Lexer();
};

template<typename... Ts>
using Rule = qi::rule<Lexer::iterator_type, Ts...>;

// Qi grammar over the lexer's token stream, declared with an Object*
// attribute.
class Parser : public qi::grammar<Lexer::iterator_type, Object*>
{
public:
  // The lexer whose token definitions the rules refer to; also passed to
  // lex::tokenize_and_parse by main().
  Lexer lexer;

  // A parenthesised list; the List* local holds the list under construction.
  Rule<Object*, qi::locals<List*>> list;
  // A single element: a nested list or a number token.
  Rule<Object*> elem;

public:
  // Defines the rules (see below).
  Parser();
};

// Registers all token definitions: the two parenthesis characters, ignored
// whitespace, and numbers. A number is converted to a heap-allocated Number
// whose address is smuggled through the std::size_t token attribute.
Lexer::Lexer()
{
  // Single-character tokens for the list delimiters.
  self += '(';
  self += ')';

  // Whitespace is matched, but the action marks the match as pass_ignore.
  spaces = R"(\s+)";
  self +=
    spaces[([](auto& start, auto& end, auto& matched, auto& id, auto& ctx) {
      matched = lex::pass_flags::pass_ignore;
    })];

  number = R"(\d+)";
  self +=
    number[([](auto& start, auto& end, auto& matched, auto& id, auto& ctx) {
      auto val = new Number();
      // Parse from a copy so the lexer's own `start` iterator is not advanced.
      auto iter = start;
      qi::parse(iter, end, qi::long_long, val->_val);
      // The parser casts the integer back to Object*, and a pointer/integer
      // round trip is only guaranteed to restore the pointer when both casts
      // use the same pointer type. Upcast to Object* first instead of
      // converting the Number* directly.
      Object* asObject = val;
      ctx.set_value(reinterpret_cast<std::size_t>(asObject)); // cast here
    })];
}

// Defines the grammar with `elem` as the start rule:
//   list := '(' elem* ')'
//   elem := list | number
// The semantic actions build a heap-allocated List by hand; number tokens
// arrive as std::size_t and are cast back to Object*.
Parser::Parser()
  : base_type(elem)
{
  // The lambdas use Qi's (attribute, context, pass) action signature.
  list = (         //
    qi::lit('(')[( //
      [](auto& attr, auto& ctx, bool& pass) {
        // Opening parenthesis: allocate the list and keep it in the rule local.
        fu::at_c<0>(ctx.locals) = new List();
      })]       //
    >> *(elem[( //
         [](auto& attr, auto& ctx, bool& pass) {
           // Each parsed element is appended to the list held in the local.
           List* list = fu::at_c<0>(ctx.locals);
           list->_objs.push_back(attr);
         })]) //
    >> ')'    //
    )[(       //
    [](auto& attr, auto& ctx, bool& pass) {
      // Whole list matched: store the list in the first element of
      // ctx.attributes (in Spirit.Qi this is the rule's result attribute).
      List* list = fu::at_c<0>(ctx.locals);
      fu::at_c<0>(ctx.attributes) = list;
    })];

  // The token's std::size_t attribute is wrapped in qi::omit and converted
  // back to a pointer by hand inside the action.
  elem %= list | qi::omit[lexer.number[([](auto& attr, auto& ctx, bool& pass) {
            fu::at_c<0>(ctx.attributes) = reinterpret_cast<Object*>(attr); // cast here
          })]];
}

int
main(int argc, char* argv[])
{
  Parser parser;

  std::string line;
  while (std::getline(std::cin, line)) {
    auto begin = line.begin();
    Object* obj;
    lex::tokenize_and_parse(begin, line.end(), parser.lexer, parser, obj);
    obj->print(std::cout);
    std::cout << std::endl;
  }
}

我觉得这真的很丑。谁有更好的解决方案???

好的。请别介意。阅读您的示例时(顺便称赞一下您提供了 self-contained 的示例!这节省了大量时间),我不禁觉得您不知何故恰好撞上了 Spirit Qi 中各种 anti-patterns 最糟糕的交集。

  1. 您正在使用多态 AST:

    • How can I use polymorphic attributes with boost::spirit::qi parsers?
  2. 您正在使用语义动作。通常这已经偏离了嵌入式语法的最佳适用范围,这就是我链接 126 answers to Boost Spirit: "Semantic actions are evil"? 的原因。

    然而,那还仅仅是针对 Qi 的语义动作而言。您在 Lex 中也使用了它们:

    self +=
        spaces[([](auto& start, auto& end, auto& matched, auto& id,
                   auto& ctx) { matched = lex::pass_flags::pass_ignore; })];
    

    而不使用 Phoenix 又让它变得更加复杂;例如本可以写成:

    self += spaces[lex::_pass = lex::pass_flags::pass_ignore];
    

    效果完全相同,但噪声减少了约 870%,邪恶魔法的数量相同。

  3. 另一个语义动作更是登峰造极:

    self += number[(
        [](auto& start, auto& end, auto& matched, auto& id, auto& ctx) {
            auto val = new Number();
            auto iter = start;
            qi::parse(iter, end, qi::long_long, val->_val);
            ctx.set_value(val);
        })];
    

    除了已经列出的所有问题之外,它还从 Lex 的语义动作中调用 Qi,简直是在字面意义上制造分形。当然,它真正想写成的是:

    self += number[lex::_val = phx::new_<Number>(/*magic*/)];
    

    但那种魔法并不存在。我的直觉是,问题在于 Lexer 根本不应该关心 AST 类型。就这一点而言,我觉得词法分析器可以(也应该)写成:

    using TokenTypes = boost::mpl::vector<uint64_t>;
    using Iterator = std::string::const_iterator; // NOTE const_
    
    // Lexer that knows nothing about AST types: a number token only carries
    // a uint64_t attribute.
    struct Lexer : lex::lexer<actor_lexer<token<Iterator, TokenTypes>>> {
        lex::token_def<>         spaces; // whitespace, no attribute
        lex::token_def<uint64_t> number; // run of digits, exposed as uint64_t
    
        // Initializes both patterns and registers all tokens.
        Lexer() : spaces{R"(\s+)"}, number{R"(\d+)"} {
            self += '(';
            self += ')';
            // Phoenix action: mark whitespace matches as pass_ignore.
            self += spaces[lex::_pass = lex::pass_flags::pass_ignore];
            self += number;
        }
    };
    

    也就是说,如果它确实有必要存在的话。

这就是结构评估。让我按照相同的思路对 Qi 语法进行简化,这样我们就可以对代码进行推理:

// Qi grammar over the lexer's token stream producing an Object*:
//   list := '(' elem* ')'
//   elem := list | number
// All semantic actions are Phoenix expressions.
struct Parser : qi::grammar<Lexer::iterator_type, Object*()> {
    Parser() : base_type(elem) {
        using namespace qi::labels;

        // Readable name for the qi::_a placeholder; `list` declares a single
        // local of type List (see qi::locals<List> below).
        static constexpr qi::_a_type _list{};
        // Lazy accessor for the _objs member of that local.
        const auto _objs = phx::bind(&List::_objs, _list);

        // Elements are appended to the local List; once the closing ')' has
        // matched, a heap List is constructed from the local.
        list = (                               //
            '(' >>                             //
            *(elem[phx::push_back(_objs, _1)]) //
            >> ')'                             //
            )[_val = phx::new_<List>(_list)];

        // A nested list is passed through; a number token's value is wrapped
        // in a new Number.
        elem                  //
            = list[_val = _1] //
            | lexer.number[_val = phx::new_<Number>(_1)];
    }

    Lexer lexer; // TODO FIXME excess scope

  private:
    using It = Lexer::iterator_type;
    qi::rule<It, Object*(), qi::locals<List>> list;
    qi::rule<It, Object*()>                    elem;
};

请注意我是如何使用本地 List 而不是 List* 来稍微减少内存泄漏的机会的。我想为了提高效率,你可以尝试让 Phoenix 为你做 move-semantics:

 [_val = phx::new_<List>(phx::static_cast_<List&&>(_list))];

但到了这一步,我不会指望所有这些表达式模板都能如你所愿,因此会改用更冗长的写法(即使假设使用 C++17):

phx::function move_new = [](List& l) { return new List(std::move(l)); };

list = (                               //
    '(' >>                             //
    *(elem[phx::push_back(_objs, _1)]) //
    >> ')'                             //
    )[_val = move_new(_list)];

现在我们得到了一个可行的演示:

Live On Coliru

// Runs the Lex+Qi parser over a fixed set of sample inputs and prints, for
// each one, the quoted input followed by either the parsed object or FAILED.
int main() {
    Parser parser;

    char const* const samples[] = {
        "",
        "42",
        "()",
        "(1 2 3)",
        "(1 (44 55 () 66) 3)",
    };

    for (char const* sample : samples) {
        std::string const line(sample);
        auto              cursor = line.begin();
        Object*           obj    = nullptr;

        bool const ok = lex::tokenize_and_parse(cursor, line.end(),
                                                parser.lexer, parser, obj);
        if (ok) {
            std::cout << std::quoted(line) << " -> ";
            obj->print(std::cout);
            // Frees only the root object.
            delete obj;
        } else {
            std::cout << std::quoted(line) << " -> FAILED";
        }
        std::cout << std::endl;
    }
}

打印

"" -> FAILED
"42" -> 42
"()" -> ()
"(1 2 3)" -> (1 2 3 )
"(1 (44 55 () 66) 3)" -> (1 (44 55 () 66 ) 3 )

请注意,这个简单的测试程序就已经泄漏了 11 个对象,总共 224 个字节。而这还没有算上 error-handling 或回溯规则带来的复杂性。

那是疯狂。您当然可以使用智能指针修复它,但这只会使一切变得更加复杂,同时确保性能会很差。

进一步简化

我会停止使用 Lex 和动态多态性:

不再使用 Lex:

Lex 在这里带来的唯一“价值”是跳过空格。Qi 完全能胜任这件事(该主题的各种变体请参阅 Boost spirit skipper issues),因此我们将改用 skip(space)[]:

Live On Coliru

#include <boost/phoenix.hpp>
#include <boost/spirit/include/qi.hpp>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

/// Abstract base of every AST node; a node knows how to print itself.
struct Object {
    /// Virtual so that nodes can be deleted through an Object*.
    virtual ~Object() = default;

    /// Writes the textual form of this node to @p out.
    virtual void print(std::ostream& out) const = 0;

    /// Stream insertion: forwards to print() and returns the stream.
    friend std::ostream& operator<<(std::ostream& os, Object const& o) {
        o.print(os);
        return os;
    }
};

/// Leaf AST node holding one unsigned integer literal.
struct Number : Object {
    /// Stores @p v unchanged; the default yields 0.
    Number(uint64_t v = 0) : _val(v) {}
    /// The literal's value. Unsigned, matching the constructor parameter, so
    /// construction no longer performs a silent unsigned-to-signed conversion.
    uint64_t     _val;
    /// Writes the value in decimal to @p out.
    virtual void print(std::ostream& out) const override { out << _val; }
};

struct List : Object {
    std::vector<Object*> _objs;

    virtual void print(std::ostream& out) const override {
        out << '(';
        for (auto&& el : _objs)
            out << ' ' << *el;
        out << ')';
    }
};

namespace qi  = boost::spirit::qi;
namespace phx = boost::phoenix;

// Lexer-free Qi grammar producing an Object*:
//   list  := '(' elem* ')'
//   elem  := list | unsigned integer
//   start := elem, parsed with qi::space as skipper
template <typename It>
struct Parser : qi::grammar<It, Object*()> {
    Parser() : Parser::base_type(start) {
        using namespace qi::labels;

        // Readable name for the qi::_a placeholder; `list` declares a single
        // local of type List (see qi::locals<List> below).
        static constexpr qi::_a_type _list{};
        // Lazy accessor for the _objs member of that local.
        const auto _objs = phx::bind(&List::_objs, _list);

        // Lazy function that move-constructs a heap List from the local one.
        phx::function move_new = [](List& l) { return new List(std::move(l)); };

        // Elements are appended to the local List; once the closing ')' has
        // matched, its contents are moved into a new heap List.
        list = (                               //
            '(' >>                             //
            *(elem[phx::push_back(_objs, _1)]) //
            >> ')'                             //
            )[_val = move_new(_list)];

        // A nested list is passed through; an unsigned integer is wrapped in
        // a new Number.
        elem                                          //
            = list[_val = _1]                         //
            | qi::uint_[_val = phx::new_<Number>(_1)] //
            ;

        // Entry point: installs the whitespace skipper for the inner rules.
        start = qi::skip(qi::space)[elem];
    }

  private:
    // Both rules are declared with qi::space_type as their skipper.
    qi::rule<It, Object*(), qi::space_type, qi::locals<List>> list;
    qi::rule<It, Object*(), qi::space_type> elem;

    // Declared without a skipper; it applies qi::skip(qi::space) itself.
    qi::rule<It, Object*()> start;
};

// Runs the lexer-free parser over a fixed set of sample inputs and prints,
// for each one, the quoted input followed by the parsed object or FAILED.
int main() {
    Parser<std::string::const_iterator> const parser;

    char const* const samples[] = {
        "",
        "42",
        "()",
        "(1 2 3)",
        "(1 (44 55 () 66) 3)",
    };

    for (char const* sample : samples) {
        std::string const line(sample);
        Object*           obj = nullptr;

        // qi::eoi makes the parse fail unless the whole line was consumed.
        bool const ok = parse(line.begin(), line.end(), parser >> qi::eoi, obj);
        if (ok) {
            std::cout << std::quoted(line) << " -> " << *obj;
        } else {
            std::cout << std::quoted(line) << " -> FAILED";
        }
        // obj is deleted on both paths; deleting nullptr is a no-op.
        delete obj;
        std::cout << std::endl;
    }
}

仍然泄漏得一塌糊涂,但至少代码减少了 20 行(LoC),编译时间也缩短了一半。

静态多态

隐藏所有原始指针内容(或完全避免它,具体取决于确切的 AST 要求):

using Number = uint64_t;
using Object = boost::make_recursive_variant< //
    Number,                                   //
    std::vector<boost::recursive_variant_>>::type;

using List   = std::vector<Object>;

为了便于提供 operator<<,我在下面把它们移到了 AST 命名空间中。

解析器下降到:

// Qi grammar producing an AST::Object value, with no semantic actions:
//   list  := '(' elem* ')'
//   elem  := list | unsigned integer
//   start := elem, parsed with qi::space as skipper
template <typename It> struct Parser : qi::grammar<It, AST::Object()> {
    Parser() : Parser::base_type(start) {
        list = '(' >> *elem >> ')';
        elem = list | qi::uint_;

        // Entry point: installs the whitespace skipper for the inner rules.
        start = qi::skip(qi::space)[elem];
    }
  private:
    // `list` and `elem` are declared with qi::space_type as their skipper;
    // `start` has none and applies qi::skip(qi::space) itself.
    qi::rule<It, AST::List(), qi::space_type> list;
    qi::rule<It, AST::Object(), qi::space_type> elem;
    qi::rule<It, AST::Object()> start;
};

不再有 lex,不再有 phoenix,不再有泄漏,不再有手动语义操作。只是,富有表现力的代码。

现场演示

Live On Coliru

#include <boost/spirit/include/qi.hpp>
#include <iomanip>
#include <iostream>

namespace AST {
    struct Number {
        uint64_t v;
        Number(uint64_t v = 0) : v(v){};
    };

    using Object = boost::make_recursive_variant< //
        Number,                                   //
        std::vector<boost::recursive_variant_>>::type;

    using List = std::vector<Object>;

    std::ostream& operator<<(std::ostream& os, Number const& n) {
        return os << n.v;
    }
    std::ostream& operator<<(std::ostream& os, List const& l) {
        os << '(';
        for (auto& el : l)
            os << ' ' << el;
        return os << ')';
    }
} // namespace AST

namespace qi = boost::spirit::qi;

// Qi grammar producing an AST::Object value, with no semantic actions:
//   list  := '(' elem* ')'
//   elem  := list | unsigned integer
//   start := elem, parsed with qi::space as skipper
template <typename It> struct Parser : qi::grammar<It, AST::Object()> {
    Parser() : Parser::base_type(start) {
        list = '(' >> *elem >> ')';
        elem = list | qi::uint_;

        // Entry point: installs the whitespace skipper for the inner rules.
        start = qi::skip(qi::space)[elem];
    }
  private:
    // `list` and `elem` are declared with qi::space_type as their skipper;
    // `start` has none and applies qi::skip(qi::space) itself.
    qi::rule<It, AST::List(), qi::space_type> list;
    qi::rule<It, AST::Object(), qi::space_type> elem;
    qi::rule<It, AST::Object()> start;
};

// Runs the variant-based parser over a fixed set of sample inputs and
// prints, for each one, the quoted input followed by the parsed value or
// FAILED.
int main() {
    Parser<std::string::const_iterator> const parser;

    char const* const samples[] = {
        "",
        "42",
        "()",
        "(1 2 3)",
        "(1 (44 55 () 66) 3)",
    };

    for (char const* sample : samples) {
        std::string const line(sample);
        AST::Object       obj;

        // qi::eoi makes the parse fail unless the whole line was consumed.
        bool const ok = parse(line.begin(), line.end(), parser >> qi::eoi, obj);
        if (ok) {
            std::cout << std::quoted(line) << " -> " << obj << "\n";
        } else {
            std::cout << std::quoted(line) << " -> FAILED\n";
        }
    }
}

打印

"" -> FAILED
"42" -> 42
"()" -> ()
"(1 2 3)" -> ( 1 2 3)
"(1 (44 55 () 66) 3)" -> ( 1 ( 44 55 () 66) 3)

但这一次,没有内存泄漏。而且,它现在编译得足够快,连 Compiler Explorer 也能处理。