使用 Boost Spirit 解析任意精度数字

Parse arbitrary precision numbers with Boost spirit

我想编写一个 Boost Spirit Qi 解析器,它可以解析任意 C 整数字面量(例如 1234 或 0x1234ULL)并将它们转换为任意精度的 llvm::APInt 值。


以后者为例,解析器需要识别标记 0xNN...NS,其中 N 是十六进制数字,S 是有效的文字后缀。

构建这样的解析器很容易,但我该如何让它“丢弃”前缀和后缀,并像 qi::uint_ 返回无符号整数那样,返回由剩余数字转换而成的 llvm::APInt 值?

我知道有 qi::uint_parser,但该类看起来非常有限,因为它似乎是由整数而不是字符串逐步构建结果的。这是其他解析器生成器的基本功能,因此我很惊讶文档对此一笔带过。

我认为,解析器生成器的基本功能确实就是解析成任意类型的整数。





如您所见,Spirit 可以做到这一点。让我们来演示一下基础知识。

Loosely after http://www.nongnu.org/hcb/#integer-literal

// Register the accepted suffix spellings (matched case-insensitively below).
_suffix += "u", "l", "ll", "ul", "lu", "ull", "llu";

// The literal's prefix selects the radix of the uint_parser that reads the digits.
_start = qi::no_case[ // case insensitive
    // "0x" -> base 16, "0b" -> base 2
    ("0x"          >> qi::uint_parser<Integer, 16>{} |
     "0b"          >> qi::uint_parser<Integer, 2>{} |
     // and-predicate: a leading '0' selects base 8 without being consumed
     &qi::lit('0') >> qi::uint_parser<Integer, 8>{} |
     // anything else is read as base 10
                      qi::uint_parser<Integer, 10>{})
    // the optional suffix is matched but ignored for now:
    >> -_suffix];


观看演示:Live On Compiler Explorer

// Parses a fixed set of sample literals with IntLiteral<It, Integer> and
// prints the resulting value, or a failure notice, for each of them.
template <typename Integer> void test() {
    std::cout << " ---- " << __PRETTY_FUNCTION__ << "\n";
    using It = std::string::const_iterator;
    IntLiteral<It, Integer> const parser {};

    // Same inputs as the ones shown in the sample output.
    for (std::string const input : {
             "1234",
             "1234u",
             "0x12f34ULL",
             "033ULL",
             "0b101011l",
             "33lu",
         }) {
        Integer value;
        // qi::eoi makes the parse fail unless the whole input was consumed.
        if (parse(input.begin(), input.end(), parser >> qi::eoi, value)) {
            std::cout << "Parsed " << std::quoted(input) << " -> " << value << "\n";
        } else {
            std::cout << "Failed to parse " << std::quoted(input) << "\n";
        }
    }
}

int main() {


 ---- void test() [with Integer = long unsigned int]
Parsed "1234" -> 1234
Parsed "1234u" -> 1234
Parsed "0x12f34ULL" -> 77620
Parsed "033ULL" -> 27
Parsed "0b101011l" -> 43
Parsed "33lu" -> 33
 ---- void test() [with Integer = boost::multiprecision::number<boost::multiprecision::backends::cpp_int_backend<1024, 1024, boost::multiprecision::signed_magnitude, boost::multiprecision::checked, void> >]
Parsed "1234" -> 1234
Parsed "1234u" -> 1234
Parsed "0x12f34ULL" -> 77620
Parsed "033ULL" -> 27
Parsed "0b101011l" -> 43
Parsed "33lu" -> 33



你可以在没有 LLVM 的情况下做到这一点,例如:首先解析为 uintmax_t,然后根据后缀强制转换为适当的类型。


// Sum type over every result type an integer-literal suffix can select.
// The order of the alternatives is what value.which() reports.
using CxxInteger = boost::variant<
    int,
    unsigned int,
    long,
    unsigned long,
    long long,
    unsigned long long>;


// Digits are first read into the widest unsigned integer type; the suffix
// decides afterwards which type the value is converted to.
using Raw = std::uintmax_t;

// _1 is the raw value and _2 the suffix; coerce_type combines them into the
// rule's result.
_start = no_case [ // case insensitive
    // "0x" -> base 16, "0b" -> base 2, leading '0' (not consumed) -> base 8,
    // otherwise base 10
    ("0x"       >> uint_parser<Raw, 16>{} |
     "0b"       >> uint_parser<Raw,  2>{} |
     &lit('0')  >> uint_parser<Raw,  8>{} |
                   uint_parser<Raw, 10>{})
    // the suffix (or its default) is passed on to coerce_type:
    >> _optsuffix
] [ _val = coerce_type(_1, _2) ];

// When no suffix is present, attr() supplies Suffix::signed_ instead.
_optsuffix = no_case[_suffix] | attr(Suffix::signed_);


struct converter_f {
    CxxInteger operator()(uintmax_t raw, Suffix sfx) const {
        switch (sfx) {
          case Suffix::signed_:   return static_cast<signed>(raw);
          case Suffix::unsigned_: return static_cast<unsigned>(raw);
          case Suffix::long_:     return static_cast<long>(raw);
          case Suffix::longlong_: return static_cast<long long>(raw);
          case Suffix::ul_:       return static_cast<unsigned long>(raw);
          case Suffix::ull_:      return static_cast<unsigned long long>(raw);
        throw std::invalid_argument("sfx");
boost::phoenix::function<converter_f> coerce_type;

就是这样。我们现在可以解析相同的测试用例 Live On Compiler Explorer

std::cout << "Parsed " << std::quoted(input) << " -> " << value
          << " (type #" << value.which() << " "
          << boost::core::demangle(value.type().name()) << ")\n";


 ---- void test()
Parsed "1234" -> 1234 (type #0 int)
Parsed "1234u" -> 1234 (type #1 unsigned int)
Parsed "0x12f34ULL" -> 77620 (type #5 unsigned long long)
Parsed "033ULL" -> 27 (type #5 unsigned long long)
Parsed "0b101011l" -> 43 (type #2 long)
Parsed "33lu" -> 33 (type #3 unsigned long)

3. 应用于 LLVM APInt


struct converter_f {
    template <typename T> static auto as(uint64_t raw) {
        return llvm::APInt(raw, CHAR_BIT * sizeof(T), std::is_signed_v<T>);
    llvm::APInt operator()(uintmax_t raw, Suffix sfx) const {
        switch (sfx) {
        case Suffix::signed_:   return as<signed>(raw);
        case Suffix::unsigned_: return as<unsigned>(raw);
        case Suffix::long_:     return as<long>(raw);
        case Suffix::longlong_: return as<long long>(raw);
        case Suffix::ul_:       return as<unsigned long>(raw);
        case Suffix::ull_:      return as<unsigned long long>(raw);
        throw std::invalid_argument("sfx");


"Live" On Compiler Explorer

(编译器资源管理器不支持链接到 LLVM)

#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <iomanip>
#include <llvm/ADT/APInt.h>
namespace qi = boost::spirit::qi;

// Qi grammar for a C-style integer literal. The prefix ("0x", "0b", leading
// '0', or none) selects the radix, and the optional suffix selects the bit
// width and signedness of the resulting llvm::APInt.
template <typename It>
struct IntLiteral : qi::grammar<It, llvm::APInt()> {
    IntLiteral() : IntLiteral::base_type(_start) {
        using namespace qi;
        using Raw = std::uint64_t;

        // _1 is the raw value and _2 the suffix; coerce_type combines them.
        _start = no_case [ // case insensitive
            ("0x"      >> uint_parser<Raw, 16>{} |
             "0b"      >> uint_parser<Raw,  2>{} |
             // and-predicate: the leading '0' is checked but not consumed
             &lit('0') >> uint_parser<Raw,  8>{} |
                          uint_parser<Raw, 10>{})
            // the suffix (or its default) is passed on to coerce_type:
            >> _optsuffix
        ] [ _val = coerce_type(_1, _2) ];

        // When no suffix is present, attr() supplies Suffix::signed_ instead.
        _optsuffix = no_case[_suffix] | attr(Suffix::signed_);
    }

    // Suffix kinds; ul_ and ull_ are the bitwise combination of unsigned_
    // with l_ and ll_ respectively.
    enum class Suffix {
        signed_   = 0,
        unsigned_ = 1,
        long_     = 2,
        longlong_ = 4,

        l_   = long_,
        ll_  = longlong_,
        ul_  = unsigned_ | l_,
        ull_ = unsigned_ | ll_,
    };

    // Symbol table mapping each suffix spelling to its Suffix value; both
    // orders of the "u" and "l"/"ll" parts are accepted.
    struct suffix_sym : qi::symbols<char, Suffix> {
        suffix_sym() {
            this->add
                ("u",   Suffix::unsigned_)
                ("l",   Suffix::l_)
                ("ll",  Suffix::ll_)
                ("ul",  Suffix::ul_)  ("lu",  Suffix::ul_)
                ("ull", Suffix::ull_) ("llu", Suffix::ull_)
                ;
        }
    } _suffix;

    // Function object that builds the APInt for a raw value and a suffix.
    struct converter_f {
        // Builds an APInt as wide as T, flagged signed when T is signed.
        template <typename T> static auto as(uint64_t raw) {
            return llvm::APInt(CHAR_BIT * sizeof(T), raw, std::is_signed_v<T>);
        }

        // Throws std::invalid_argument for a Suffix value without a case below.
        llvm::APInt operator()(uint64_t raw, Suffix sfx) const {
            switch (sfx) {
            case Suffix::signed_:   return as<signed>(raw);
            case Suffix::unsigned_: return as<unsigned>(raw);
            case Suffix::long_:     return as<long>(raw);
            case Suffix::longlong_: return as<long long>(raw);
            case Suffix::ul_:       return as<unsigned long>(raw);
            case Suffix::ull_:      return as<unsigned long long>(raw);
            }
            throw std::invalid_argument("sfx");
        }
    };
    // Lazy wrapper so converter_f can be called from the semantic action.
    boost::phoenix::function<converter_f> coerce_type;

    qi::rule<It, llvm::APInt()> _start;
    qi::rule<It, Suffix()>      _optsuffix;
};

// Parses a fixed set of sample literals with IntLiteral and prints the
// resulting value and its bit width, or a failure notice, for each of them.
void test() {
    std::cout << " ---- " << __PRETTY_FUNCTION__ << "\n";
    using It = std::string::const_iterator;
    IntLiteral<It> const parser {};

    // Same inputs as the ones shown in the sample output.
    for (std::string const input : {
            "1234",
            "1234u",
            "0x12f34ULL",
            "033ULL",
            "0b101011l",
            "33lu",
        }) {
        llvm::APInt value;
        // qi::eoi makes the parse fail unless the whole input was consumed.
        if (parse(input.begin(), input.end(), parser >> qi::eoi, value)) {
            std::cout << "Parsed " << std::quoted(input) << " -> "
                    << value.toString(10, false) // TODO signed?
                    << " bits:" << value.getBitWidth() << "\n";
        } else {
            std::cout << "Failed to parse " << std::quoted(input) << "\n";
        }
    }
}

int main() {


 ---- void test()
Parsed "1234" -> 1234 bits:32
Parsed "1234u" -> 1234 bits:32
Parsed "0x12f34ULL" -> 77620 bits:64
Parsed "033ULL" -> 27 bits:64
Parsed "0b101011l" -> 43 bits:64
Parsed "33lu" -> 33 bits:64


  • 当然,通过语义操作,您实际上可以使用 fromString 工厂方法

  • 我不知道如何准确地查询 APInt 是否为有符号数。我怀疑应该解析成 variant<APInt, APSInt> 以保留该信息

  • 我没有花时间检测溢出。第一个示例应该开箱即用地具备这一功能(感谢 Qi)

  • 我也没有投入精力支持 C++14 数字分隔符(digit separators),因为问题中没有提出这一要求。无论如何,它似乎也算不上什么“基本”功能。