后续:使用 boost::spirit::qi 解析带分隔符的数字

Followup: Using boost::spirit::qi to parse numbers with separators

这是《Using boost::spirit::qi to parse numbers with separators》一问的后续问题。


根据 sehe 的非常好的建议,我成功地进行了数字解析。然后我尝试更新它以拥有一个辅助解析器,该解析器处理带有可选符号的数字。第二次尝试失败了。我怀疑我在处理子语法方面做错了什么。代码如下:

#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>

namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
namespace phoenix = boost::phoenix;

// Grammar for an unsigned number in binary, octal, decimal or hexadecimal.
// Digits may be interleaved with '_' separators. The base is selected by a
// "0<letter>" prefix or by a trailing letter; for decimal the trailing letter
// is optional.
template <typename Iterator, typename Num>
struct unsigned_parser : qi::grammar<Iterator, Num()> {
    unsigned_parser() : unsigned_parser::base_type(start) {
        using qi::_val;
        using qi::_1;
        using qi::eps;
        using qi::debug;
        using ascii::char_;

        // Each digit rule resets its result to 0 and then folds every matched
        // digit into it (value * base + digit); '_' is consumed and ignored.
        bin = eps[_val=0] >> *(char_("01")[_val = _val * 2 + dval(_1)] | '_');
        oct = eps[_val=0] >> *(char_("0-7")[_val = _val * 8 + dval(_1)] | '_');
        dec = eps[_val=0]
              >> *(char_("0-9")[_val = _val * 10 + dval(_1)] | '_');
        hex = eps[_val=0]
              >> *(char_("0-9a-fA-F")[_val = _val * 16 + dval(_1)] | '_');
        // Prefix forms first ('0' + base letter + digits), then suffix forms
        // (digits + base letter). The result is copied from the digit rule by
        // the explicit [_val=_1] actions.
        start = (char_('0') >>
                 ((char_("xXhH") >> hex[_val=_1])
                  | (char_("bByY") >> bin[_val=_1])
                  | (char_("oOqQ") >> oct[_val=_1])
                  | (char_("dDtT") >> dec[_val=_1])))
                | (hex[_val=_1] >> char_("xXhH"))
                | (bin[_val=_1] >> char_("bByY"))
                | (oct[_val=_1] >> char_("oOqQ"))
                | (dec[_val=_1] >> -char_("dDtT"));
        // Names shown in debug output.
        start.name("unum");
        hex.name("hex");
        oct.name("oct");
        dec.name("dec");
        bin.name("bin");

        // Turn on debug tracing for every rule.
        debug(start);
        debug(hex);
        debug(oct);
        debug(dec);
        debug(bin);
    }
    qi::rule<Iterator, Num()> start;
    qi::rule<Iterator, Num()> hex;
    qi::rule<Iterator, Num()> oct;
    qi::rule<Iterator, Num()> dec;
    qi::rule<Iterator, Num()> bin;
    // Function object intended to map a digit character to its numeric value.
    struct _dval {
        // Declares uint8_t as the result type of a call.
        template <typename> struct result { typedef uint8_t type; };
        template <typename T> uint8_t operator()(T ch) const {
            // NOTE(review): with '||' this condition holds for every ch, so
            // this branch is always taken and the code below is never
            // reached; a range check would need '&&'.
            if (ch >= '0' || ch <= '9') {
                return ch - '0';
            }
            ch = std::tolower(ch);
            // NOTE(review): same '||' problem as above.
            if (ch >= 'a' || ch <= 'f') {
                return ch - 'a' + 10;
            }
            assert(false);
        }
    };
    // Lazy wrapper so that dval can be used inside semantic actions.
    boost::phoenix::function<_dval> dval;
};

// Grammar for a number with an optional leading sign; the magnitude is meant
// to be parsed by unsigned_parser.
template <typename Iterator, typename Num>
struct signed_parser : qi::grammar<Iterator, Num()> {
    signed_parser() : signed_parser::base_type(start) {
        using qi::eps;
        using qi::_val;
        using qi::_1;
        using ascii::char_;
        using phoenix::static_cast_;
        // NOTE(review): the right-hand side is a temporary grammar object that
        // is destroyed at the end of this statement. A rule must not refer to
        // a temporary grammar/rule; the grammar needs to be a real instance
        // (for example a data member).
        unum = unsigned_parser<Iterator, Num>();
        // '-' negates the parsed magnitude; an optional '+' leaves it as is.
        start = (char_('-') >> unum[_val=-_1])
                | (-char_('+') >> unum[_val=_1]);
        unum.name("unum");
        start.name("snum");
        debug(start);
        /* debug(unum); */
    }
    qi::rule<Iterator, Num()> start;
    qi::rule<Iterator, Num()> unum;
};

// Parses the single command-line argument as a signed number and prints the
// result. Returns 0 on success and 1 on a usage error or parse failure.
// NOTE(review): the parameter names are swapped relative to convention: here
// `argv` is the argument count and `argc` is the argument array.
int main(int argv, const char *argc[]) {
    using phoenix::ref;
    using qi::eoi;
    using qi::_1;

    typedef std::string::const_iterator iter;
    signed_parser<iter, int64_t> sp;
    int64_t val;
    // Exactly one argument (besides the program name) is required.
    if (argv != 2) {
        std::cerr << "Usage: " << argc[0] << " <input>" << std::endl;
        return 1;
    }
    std::string test(argc[1]);
    iter i = test.begin();
    iter end = test.end();
    // The semantic action stores the parsed value in val; eoi requires that
    // the whole input is consumed.
    bool rv = phrase_parse(i, end, sp[ref(val)=_1] >> eoi, ascii::space);
    if (rv) {
        assert(i == end);
        std::cout << "Succeeded: " << val << std::endl;
        return 0;
    }
    std::cout << "Failed." << std::endl;
    return 1;
}

对于 signed_parser,每次解析都会失败。此外,如果我把被注释掉的 debug(unum) 取消注释,程序就会出现段错误。

我觉得我已经接近开始理解如何使用它了,所以任何帮助将不胜感激。

使用所有这些单独的规则会扼杀编译器优化解析的机会。

你不能让规则引用一个临时的 grammar/rule 对象。你需要一个真正存在的语法实例(例如作为成员变量):

// Grammar for an optionally signed number. The magnitude is parsed by a
// member instance of unsigned_parser, so the sub-grammar that snum refers to
// lives exactly as long as this grammar does.
template <typename Iterator, typename Num>
struct signed_parser : qi::grammar<Iterator, Num()> {
    signed_parser() : signed_parser::base_type(snum) {
        using qi::lit;

        // Either an explicit '-' or an optional '+', followed by the
        // magnitude. Both branches simply forward unum's attribute; this
        // simplified form applies no negation for '-'.
        snum = (lit('-') >> unum) | (-lit('+') >> unum);

        BOOST_SPIRIT_DEBUG_NODES((snum))
    }

private:
    qi::rule<Iterator, Num()> snum;       // start rule
    unsigned_parser<Iterator, Num> unum;  // owned sub-grammar instance
};

这里有一些清理工作:

  • 请把 argc 和 argv 这两个参数名交换回来,好吗 :)
  • 使用 BOOST_SPIRIT_DEBUG* 宏

    BOOST_SPIRIT_DEBUG_NODES((unum) (hex) (oct) (dec) (bin));
    
  • 尽量直接使用普通字面量,而不是 lit() 或(更糟!)char_()
  • 优先使用自动属性传播(参见 Boost Spirit: "Semantic actions are evil"?)。例如,规则可以简单得多:

        snum = lit('-') >> unum
            | -lit('+') >> unum
            ;
    
  • 使用 %= 可以在存在语义动作的情况下保留自动属性传播:

        snum %= lit('-') >> unum [ _val = -_1 ]
             | -lit('+') >> unum
             ;
    
  • phrase_parse 调用本身也是如此:你可以直接把用于接收属性的变量按引用传入,不需要语义动作

  • 调用 tolower(ch) 可能更慢(因为你已经知道输入是 ASCII),也可能不正确(如果你的编译器中 char 是有符号类型,就会发生符号扩展)

  • 更新:你的 dval 函数对象(actor)中有一个相当严重的错误——范围检查写错了!下面是我修正后的版本:

    // Function object that appends one digit to a running value:
    // accum = accum * base + value(ch).
    struct accum_f {
        template <typename...> struct result { typedef void type; };

        // Numeric value of a hexadecimal digit character, or -1 otherwise.
        static int digit_value(char ch) {
            if ('0' <= ch && ch <= '9') return ch - '0';
            if ('a' <= ch && ch <= 'f') return ch - 'a' + 10;
            if ('A' <= ch && ch <= 'F') return ch - 'A' + 10;
            return -1;
        }

        void operator()(char ch, Num& accum, int base) const {
            // Shift the accumulated value first, then add the new digit.
            accum *= base;

            int const digit = digit_value(ch);
            assert(digit >= 0);  // only digit characters are expected here
            if (digit >= 0) accum += digit;
        }
    };
    // Lazy wrapper so that accum_f can be called from semantic actions.
    boost::phoenix::function<accum_f> _accum;
    

    语义动作因此产生的改动和简化见下文

  • 可以基于 int_parser 构建按前缀分支的解析器;这样很可能会快(得多)

  • 注意:当你不用语义动作来编写 unum 时,切记不要像你原来那样用 qi::char_ 去"捕获"字符 '0'。否则,你会纳闷为什么所有带前缀格式的数字结果总是 48(即字符 '0' 的 ASCII 码)

    unum = ('0' >>
                ( (omit[ char_("xXhH") ] >> hex)
                | (omit[ char_("bByY") ] >> bin)
                | (omit[ char_("oOqQ") ] >> oct)
                | (omit[ char_("dDtT") ] >> dec))
            )
        | (hex >> omit[  char_("xXhH") ])
        | (bin >> omit[  char_("bByY") ])
        | (oct >> omit[  char_("oOqQ") ])
        | (dec >> omit[ -char_("dDtT") ]);
    
  • 只要你使用的解析器表达式本身不使用 skipper(跳过器),调用 phrase_parse 并传入 skipper 就没有意义(请参阅 Boost spirit skipper issues)

Live On Coliru

//#define BOOST_SPIRIT_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>

namespace qi      = boost::spirit::qi;
namespace ascii   = boost::spirit::ascii;

// Grammar for an unsigned number written in binary, octal, decimal or
// hexadecimal. Digits may be interleaved with '_' separators. The base is
// selected either by a "0<letter>" prefix or by a trailing letter; for
// decimal the trailing letter is optional.
template <typename Iterator, typename Num>
struct unsigned_parser : qi::grammar<Iterator, Num()> {
    unsigned_parser() : unsigned_parser::base_type(unum) {
        using qi::_1;
        using qi::_val;
        using qi::char_;
        using qi::eps;
        using qi::omit;

        // One rule per base: reset the result to 0, then fold every digit
        // into it via _accum while consuming '_' separators.
        bin = eps[_val = 0] >> *(char_("01")[_accum(_1, _val, 2)] | '_');
        oct = eps[_val = 0] >> *(char_("0-7")[_accum(_1, _val, 8)] | '_');
        dec = eps[_val = 0] >> *(char_("0-9")[_accum(_1, _val, 10)] | '_');
        hex = eps[_val = 0] >> *(char_("0-9a-fA-F")[_accum(_1, _val, 16)] | '_');

        // Base markers are wrapped in omit[] so that only the digit rule
        // contributes an attribute to unum.
        unum = ('0' >> ( (omit[char_("xXhH")] >> hex)
                       | (omit[char_("bByY")] >> bin)
                       | (omit[char_("oOqQ")] >> oct)
                       | (omit[char_("dDtT")] >> dec) ))
             | (hex >> omit[char_("xXhH")])
             | (bin >> omit[char_("bByY")])
             | (oct >> omit[char_("oOqQ")])
             | (dec >> omit[-char_("dDtT")]);

        BOOST_SPIRIT_DEBUG_NODES((unum)(hex)(oct)(dec)(bin));
    }

  private:
    qi::rule<Iterator, Num()> unum;  // start rule
    qi::rule<Iterator, Num()> hex;
    qi::rule<Iterator, Num()> oct;
    qi::rule<Iterator, Num()> dec;
    qi::rule<Iterator, Num()> bin;

    // Function object that appends one digit to a running value:
    // accum = accum * base + value(ch).
    struct accum_f {
        template <typename...> struct result { typedef void type; };

        // Numeric value of a hexadecimal digit character, or -1 otherwise.
        static int digit_value(char ch) {
            if ('0' <= ch && ch <= '9') return ch - '0';
            if ('a' <= ch && ch <= 'f') return ch - 'a' + 10;
            if ('A' <= ch && ch <= 'F') return ch - 'A' + 10;
            return -1;
        }

        void operator()(char ch, Num& accum, int base) const {
            // Shift the accumulated value first, then add the new digit.
            accum *= base;

            int const digit = digit_value(ch);
            assert(digit >= 0);  // the digit rules only pass digit characters
            if (digit >= 0) accum += digit;
        }
    };
    // Lazy wrapper so that accum_f can be called from semantic actions.
    boost::phoenix::function<accum_f> _accum;
};

    // Grammar for an optionally signed number. The magnitude is parsed by a
    // member instance of unsigned_parser, so the sub-grammar that snum refers
    // to lives exactly as long as this grammar does.
    template <typename Iterator, typename Num>
    struct signed_parser : qi::grammar<Iterator, Num()> {
        signed_parser() : signed_parser::base_type(snum) {
            using qi::_1;
            using qi::_val;
            using qi::lit;

            // %= keeps automatic attribute propagation even though the '-'
            // branch carries a semantic action, which negates the magnitude.
            // The optional-'+' branch propagates the magnitude unchanged.
            snum %= (lit('-') >> unum[_val = -_1])
                  | (-lit('+') >> unum);

            BOOST_SPIRIT_DEBUG_NODES((snum))
        }

    private:
        qi::rule<Iterator, Num()> snum;       // start rule
        unsigned_parser<Iterator, Num> unum;  // owned sub-grammar instance
    };

int main(int argc, const char *argv[]) {
    typedef std::string::const_iterator iter;
    signed_parser<iter, int64_t> const sp;

    for (std::string const& s : boost::make_iterator_range(argv+1, argv+argc))
    {
        std::cout << "\n-----------------------------\nParsing '" << s << "':\n";

        int64_t val;
        iter i = s.begin(), end = s.end();
        bool rv = phrase_parse(i, end, sp >> qi::eoi, ascii::space, val);

        if (rv) {
            std::cout << "Succeeded: " << val << std::endl;
        } else {
            std::cout << "Failed." << std::endl;
        }

        if (i!=end) {
            std::cout << "Remaining unparsed: '" << std::string(i,end) << "'\n";
        }
    }
}

输出:

-----------------------------
Parsing '-124_456d':
Succeeded: -124456

-----------------------------
Parsing '123_456D':
Succeeded: 123456

-----------------------------
Parsing '-123_456T':
Succeeded: -123456

-----------------------------
Parsing '123456t':
Succeeded: 123456

-----------------------------
Parsing '+1_bh':
Succeeded: 27

-----------------------------
Parsing '0_010Q':
Succeeded: 8

-----------------------------
Parsing '+1010_1010_0111_0111_b':
Succeeded: 43639

-----------------------------
Parsing '123_456':
Succeeded: 123456

-----------------------------
Parsing '-123456':
Succeeded: -123456

-----------------------------
Parsing '1_bh':
Succeeded: 27

-----------------------------
Parsing '-0_010Q':
Succeeded: -8

-----------------------------
Parsing '1010_1010_0111_0111_b':
Succeeded: 43639

-----------------------------
Parsing '+0d124_456':
Succeeded: 124456

-----------------------------
Parsing '0D123_456':
Succeeded: 123456

-----------------------------
Parsing '+0T123_456':
Succeeded: 123456

-----------------------------
Parsing '0t123456':
Succeeded: 123456

-----------------------------
Parsing '0h1_b':
Succeeded: 27

-----------------------------
Parsing '0Q0_010':
Succeeded: 8

-----------------------------
Parsing '0b1010_1010_0111_0111_':
Succeeded: 43639

-----------------------------
Parsing '06123_45':
Succeeded: 612345

-----------------------------
Parsing '0612345':
Succeeded: 612345

-----------------------------
Parsing '0h1_b':
Succeeded: 27

-----------------------------
Parsing '-0Q0_010':
Succeeded: -8

-----------------------------
Parsing '0b1010_1010_0111_0111_':
Succeeded: 43639