用 Boost.Spirit 完整解码 HTTP header 的值

decode http header value fully with boost spirit

我又一次想用 Boost.Spirit 来解决问题,也又一次被它打败了。

HTTP header 的值一般采用如下形式:

text/html; q=1.0, text/*; q=0.8, image/gif; q=0.6, image/jpeg; q=0.6, image/*; q=0.5, */*; q=0.1

value *OWS [; *OWS name *OWS [= *OWS possibly_quoted_value] *OWS [...]] *OWS [ , <another value> ...]

所以在我看来,这个 header 解码为:

value[0]: 
  text/html
  params:
    name : q
    value : 1.0
value[1]:
  text/*
  params:
    name : q
    value : 0.8
...

等等。

我敢肯定,对于任何知道如何做的人来说,boost::spirit::qi 语法都是微不足道的。

恳请您的帮助。

例如,下面是解码 Content-Type header 的代码概要。该 header 被限定为只有一个 type/subtype 形式的值,后面可以跟任意数量、形如 <sp> ; <sp> token=token|quoted_string 的参数:

/// Parses a single Content-Type header value of the shape
///   type "/" subtype *( ws ";" ws name [ ws "=" ws token-or-quoted ] )
/// from [first, last) into `ct`.
///
/// The type, subtype and parameter names are lower-cased before being stored;
/// parameter values are stored as returned by consume_token_or_quoted().
///
/// NOTE(review): the consume_* / skipwhite helpers are not visible here; this
/// code assumes they advance `first` past what they consume and that
/// consume_char_if() leaves `first` untouched when it returns false — confirm.
///
/// @throws std::runtime_error if a ',' (a second value) or any other
///         unexpected character follows the type/subtype or a parameter.
template<class Iter>
void parse(ContentType& ct, Iter first, Iter last)
{
    ct.mutable_type()->append(to_lower(consume_token(first, last)));
    consume_lit(first, last, '/');
    ct.mutable_subtype()->append(to_lower(consume_token(first, last)));
    while (first != last) {
        skipwhite(first, last);
        if (first == last) {
            // Only trailing whitespace was left: the value is complete.
            break;
        }
        if (consume_char_if(first, last, ';'))
        {
            auto p = ct.add_parameters();
            skipwhite(first, last);
            p->set_name(to_lower(consume_token(first, last)));
            skipwhite(first, last);
            if (consume_char_if(first, last, '='))
            {
                skipwhite(first, last);
                p->set_value(consume_token_or_quoted(first, last));
            }
            else {
                // no value on this parameter
            }
        }
        else if (consume_char_if(first, last, ','))
        {
            // normally we should get the next value-token here but in the case of Content-Type
            // we must barf
            throw std::runtime_error("invalid use of , in Content-Type");
        }
        else
        {
            // Neither ';' nor ',': nothing above would consume this character,
            // so without this branch the loop would never terminate.
            throw std::runtime_error("unexpected character in Content-Type");
        }
    }
}

/// Fills `ct` from the textual Content-Type `header_value` by running parse()
/// over the whole string, then hands back the same `ct` so calls can be chained.
ContentType& populate(ContentType& ct, const std::string& header_value)
{
    auto cursor = header_value.begin();
    const auto stop = header_value.end();
    parse(ct, cursor, stop);
    return ct;
}

好的,经过 24 小时的英勇奋斗(嗯,其实并没有——更像是一遍又一遍地阅读手册……),我找到了一种行得通的方法。

我对 boost::spirit 一点也不在行。如果有人能改进这个答案,请发出来。

这个 Spirit 状态机接收 header 的值(一个带有可选参数的值),并将其转换为 content_type 结构。

根据我对 HTTP 标准的业余阅读,一些 header 具有以下形式(这里的空格表示任意数量的空白字符,值可以带引号,也可以不带引号):

Header-Name: tokena/tokenb [; param1 = "value" [; param2 = value]...]

而其他人有更一般的形式:

Header-Name: token [; param1 = "value"[; param2 = value]...] [ , token ...]

此代码涵盖第一种情况,即 HTTP Content-Type header 的值。我还需要扩展它以支持 Accept header(它可以携带多个带参数的值)——这部分稍后再补充。

这是代码。请务必告诉我如何改进它!!

#define BOOST_SPIRIT_DEBUG
#include <gtest/gtest.h>
#include <boost/spirit/include/qi.hpp>
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/qi_char.hpp>
#include <boost/spirit/include/phoenix_core.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_fusion.hpp>
#include <boost/spirit/include/phoenix_stl.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/fusion/include/std_pair.hpp>
#include <utility>
#include <vector>
#include <string>
#include <boost/variant.hpp>

namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;

// A parameter that consists of a single string (no '=' part).
using unary_parameter = std::string;

// A parameter made of two strings, `name` and `value`.
struct binary_parameter
{
    std::string name;
    std::string value;
};
// Expose binary_parameter as a Fusion sequence (name, value) so a Qi rule
// can fill it member by member.
BOOST_FUSION_ADAPT_STRUCT(binary_parameter,
                          (std::string, name)
                          (std::string, value))

// A header parameter is either the unary or the binary form.
using parameter = boost::variant<unary_parameter, binary_parameter>;

// The two strings of a media type written as `type/subtype`.
struct type_subtype
{
    std::string type;
    std::string subtype;
};
// Expose type_subtype as a Fusion sequence (type, subtype).
BOOST_FUSION_ADAPT_STRUCT(type_subtype,
                          (std::string, type)
                          (std::string, subtype))

// NOTE(review): this alias is not referenced anywhere else in the visible code.
using content_type_pair = std::pair<std::string, std::string>;

// Parsed form of a Content-Type value: the type/subtype pair plus its
// parameters, in the order they were stored.
struct content_type
{
    type_subtype type;
    std::vector<parameter> params;
};

// Expose content_type as a Fusion sequence (type, params).
BOOST_FUSION_ADAPT_STRUCT(content_type,
                          (type_subtype, type)
                          (std::vector<parameter>, params))

/// Qi grammar that parses one Content-Type header value — `type/subtype`
/// followed by zero or more `; name` or `; name = value` parameters — into a
/// `content_type`.
///
/// The grammar is declared without a skipper; optional spaces around ';' and
/// '=' are consumed explicitly with `omit[*SP]`.
template<class Iterator>
struct token_grammar : qi::grammar<Iterator, content_type()>
{

    token_grammar() : token_grammar::base_type(content_type_rule)
    {
        using ascii::char_;
        using qi::omit;
        using qi::eoi;

        // Basic character rules.
        CR = char_('\r');
        LF = char_('\n');
        CRLF = CR >> LF;
        SP = char_(' ');
        HT = char_('\t');
        LWS = -CRLF >> +(SP | HT);

        UPALPHA = char_('A', 'Z');
        LOALPHA = char_('a', 'z');
        ALPHA = UPALPHA | LOALPHA;
        DIGIT = char_('0', '9');
        CTL = char_(0, 31) | char_(127);
        QUOT = char_('"');
        TEXT = (char_ - CTL) | HT;

        // Characters that terminate a token. The backslash has to be written
        // as '\\' — a lone backslash is not a valid character literal.
        separator = char_('(') | ')' | '<' | '>' | '@'
        | ',' | ';' | ':' | '\\' | '"'
        | '/' | '[' | ']' | '?' | '='
        | '{' | '}' | SP | HT;

        // NOTE(review): `space` is never assigned a parser in this constructor
        // and `end_sequence` is not referenced by any other rule here.
        end_sequence = separator | space;
        token = +(char_ - separator);

        // quoted-string: any char except '"' and '\', or a backslash-escaped
        // char (the backslash itself is dropped from the attribute).
        qdtext = char_ - char_('"') - '\\';
        quoted_pair = omit[char_('\\')] >> char_;
        quoted_string = omit[char_('"')] >> *(qdtext | quoted_pair) >> omit[char_('"')];
        value = quoted_string | token ;

        type_subtype_rule = token >> '/' >> token;
        name_only = token;
        nvp = token >> omit[*SP] >> omit['='] >> omit[*SP] >> value;
        // Try name=value first, then fall back to a bare name.
        any_parameter = omit[*SP] >> omit[char_(';')] >> omit[*SP] >> (nvp | name_only);
        content_type_rule = type_subtype_rule >> *any_parameter;

        BOOST_SPIRIT_DEBUG_NODES((qdtext)(quoted_pair)(quoted_string)(value)(token)(separator));
    }

    qi::rule<Iterator, void()> CR, LF, CRLF, SP, HT, LWS, CTL, QUOT;
    qi::rule<Iterator, char()> UPALPHA, LOALPHA, ALPHA, DIGIT, TEXT, qdtext, quoted_pair;
    qi::rule<Iterator, void()> separator, space, end_sequence;
    qi::rule<Iterator, std::string()> quoted_string, token, value;
    qi::rule<Iterator, type_subtype()> type_subtype_rule;
    qi::rule<Iterator, unary_parameter()> name_only;
    qi::rule<Iterator, binary_parameter()> nvp;
    qi::rule<Iterator, parameter()> any_parameter;
    qi::rule<Iterator, content_type()> content_type_rule;

};

// Runs token_grammar over a bare `type/subtype` value and over a value with
// one quoted `name = value` parameter, checking the resulting content_type.
TEST(spirit_test, test1)
{
    token_grammar<std::string::const_iterator> grammar{};

    std::string test = R"__test(application/json )__test";
    content_type ct;
    bool r = qi::parse(test.cbegin(), test.cend(), grammar, ct);
    // The parse result used to be stored and then ignored; a failed parse
    // must fail the test instead of leaving `ct` half-filled.
    ASSERT_TRUE(r);
    EXPECT_EQ("application", ct.type.type);
    EXPECT_EQ("json", ct.type.subtype);
    EXPECT_EQ(0u, ct.params.size());

    ct = {};
    test = R"__test(text/html ; charset = "ISO-8859-5")__test";
    r = qi::parse(test.cbegin(), test.cend(), grammar, ct);
    ASSERT_TRUE(r);
    EXPECT_EQ("text", ct.type.type);
    EXPECT_EQ("html", ct.type.subtype);
    ASSERT_EQ(1u, ct.params.size());
    // Make sure the variant holds the name=value alternative before get<>.
    ASSERT_EQ(typeid(binary_parameter), ct.params[0].type());
    auto& x = boost::get<binary_parameter>(ct.params[0]);
    EXPECT_EQ("charset", x.name);
    EXPECT_EQ("ISO-8859-5", x.value);

}

我拿 OP 贴出的代码做了一次审查(review)。

  1. 不需要指定void()。事实上,在这种情况下最好使用 qi::unused_type,如果没有声明属性类型,这就是规则的默认值。

  2. 如果您不想公开该属性,则不需要 char_。请改用 lit

  3. 不需要将每个字符解析器包装在 rule 中。这会损害性能。最好保留 proto 表达式树 un-evaluated,这样 Qi 可以更多地优化解析器表达式,并且编译器可以内联更多。

    此外,Qi 在属性上没有移动语义,因此避免冗余规则消除了在包含规则中串联的 sub-attributes 的冗余副本。

    Sample alternative spelling (caution, see Assigning parsers to auto variables)

    // Attribute-less literal parsers held in auto variables instead of rules.
    auto CR   = qi::lit('\r');
    auto LF   = qi::lit('\n');
    auto CRLF = qi::lit("\r\n");
    auto HT   = qi::lit('\t');
    auto SP   = qi::lit(' ');
    // A composed expression stored in an auto variable goes through qi::copy.
    auto LWS  = qi::copy(-CRLF >> +(SP | HT)); // deepcopy
    
    // Character classes that still expose their char attribute.
    UPALPHA = char_('A', 'Z');
    LOALPHA = char_('a', 'z');
    ALPHA   = UPALPHA | LOALPHA;
    DIGIT   = char_('0', '9');
    //CTL     = char_(0, 31) | char_(127);
    // TEXT as a single character set: tab, 0x20-0x7e and 0x80-0xff.
    TEXT    = char_("\t\x20-\x7e\x80-\xff");
    
  4. 既然不必使用 char_,也就不必再用 qi::omit[] 来丢弃属性。

  5. (承上一条)换句话说,不需要专门"杀死"属性。
  6. 在 Qi 域的表达式模板中,原始的 string/char 字面量会被隐式包装成 qi::lit,因此可以把下面这样的写法

    quoted_pair   = omit[char_('\')] >> char_;
    quoted_string = omit[char_('"')] >> *(qdtext | quoted_pair) >> omit[char_('"')];
    

    只是

    quoted_pair   = '\' >> char_;
    quoted_string = '"' >> *(qdtext | quoted_pair) >> '"';
    
  7. 与其一直用 omit[*SP] 拼写跳过空格,不如用 skipper 声明规则。现在,您可以简化

    // Verbose form: spaces and punctuation are skipped by hand with omit[...].
    nvp               = token >> omit[*SP] >> omit['='] >> omit[*SP] >> value;
    any_parameter     = omit[*SP] >> omit[char_(';')] >> omit[*SP] >> (nvp | name_only);
    content_type_rule = type_subtype_rule >> *any_parameter;
    

    只是

    // Same three rules with the whitespace handling moved into a skipper
    // that is applied to the parameter list via qi::skip.
    nvp               = token >> '=' >> value;
    any_parameter     = ';' >> (nvp | name_only);
    content_type_rule = type_subtype_rule >> qi::skip(spaces)[*any_parameter];
    

    Note that any subrule invocations of rules that are declared without a skipper are implicitly lexeme: Boost spirit skipper issues

  8. 有很多多余或未使用的头文件(redundant/unused headers)

  9. 较新的编译器和 Boost 版本借助 decltype,使 BOOST_FUSION_ADAPT_STRUCT 的写法简单得多

简化的结果噪音小得多:

//#define BOOST_SPIRIT_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <boost/fusion/include/adapted.hpp>

// A header parameter with an optional name and a value.
// NOTE(review): this struct is not Fusion-adapted in this snippet — confirm
// how it is meant to be filled by the grammar.
struct parameter {
    boost::optional<std::string> name;
    std::string value;
};

// The two strings of a media type written as `type/subtype`.
struct type_subtype {
    std::string type;
    std::string subtype;
};

// Parsed Content-Type value: the type/subtype pair plus its parameters.
struct content_type {
    type_subtype type;
    std::vector<parameter> params;
};

// Short ADAPT_STRUCT form: only the member names are listed.
BOOST_FUSION_ADAPT_STRUCT(type_subtype, type, subtype)
BOOST_FUSION_ADAPT_STRUCT(content_type, type, params)

/// Simplified Qi grammar for one Content-Type header value: `type/subtype`
/// followed by zero or more `; name` or `; name = value` parameters.
///
/// Whitespace between parameter tokens is handled by a skipper applied with
/// qi::skip; rules declared without a skipper (token, quoted_string, value,
/// type_subtype_rule, name_only) therefore match without skipping.
///
/// NOTE(review): `qi`, `binary_parameter` and `unary_parameter` are not
/// declared in this snippet and are expected from the surrounding code.
template<class Iterator>
struct token_grammar : qi::grammar<Iterator, content_type()>
{
    token_grammar() : token_grammar::base_type(content_type_rule)
    {
        using qi::ascii::char_;

        // NOTE(review): `spaces` is declared below as qi::space_type, so the
        // skipper in effect is qi::space — confirm this assignment does anything.
        spaces        = char_(' ');
        // A token is one or more characters that are not separators. Inside
        // the string literal the backslash and the double quote are escaped.
        token         = +~char_( "()<>@,;:\\\"/[]?={} \t");
        // Quoted string: backslash-escaped chars or anything but '"'.
        quoted_string = '"' >> *('\\' >> char_ | ~char_('"')) >> '"';
        value         = quoted_string | token;

        type_subtype_rule = token >> '/' >> token;
        name_only         = token;
        nvp               = token >> '=' >> value;
        // Try name=value first, then fall back to a bare name.
        any_parameter     = ';' >> (nvp | name_only);
        content_type_rule = type_subtype_rule >> qi::skip(spaces) [*any_parameter];

        BOOST_SPIRIT_DEBUG_NODES((nvp)(any_parameter)(content_type_rule)(quoted_string)(token)(value)(type_subtype_rule))
    }

  private:
    using Skipper = qi::space_type;
    Skipper spaces;

    qi::rule<Iterator, binary_parameter(), Skipper> nvp;
    qi::rule<Iterator, parameter(), Skipper>        any_parameter;
    qi::rule<Iterator, content_type()>              content_type_rule;

    // lexemes
    qi::rule<Iterator, std::string()>               quoted_string, token, value;
    qi::rule<Iterator, type_subtype()>              type_subtype_rule;
    qi::rule<Iterator, unary_parameter()>           name_only;
};

看看Live On Coliru(测试用例相同)

奖金

在这种情况下,我更喜欢更简单的 AST。通过使用 qi::attr 注入一些属性值,您可以 avoid using boost::variant and/or even avoid boost::optional:

// A header parameter as a flat struct. The grammar that fills it sets
// have_name to true for `name = value`; for a bare token it sets have_name
// to false, leaves name empty and stores the token in value.
struct parameter {
    bool have_name;
    std::string name;
    std::string value;
};

// The two strings of a media type written as `type/subtype`.
struct type_subtype {
    std::string type;
    std::string subtype;
};

// Parsed Content-Type value: the type/subtype pair plus its parameters.
struct content_type {
    type_subtype type;
    std::vector<parameter> params;
};

// Short ADAPT_STRUCT form: only the member names are listed, in the order
// the grammar produces them.
BOOST_FUSION_ADAPT_STRUCT(parameter, have_name, name, value)
BOOST_FUSION_ADAPT_STRUCT(type_subtype, type, subtype)
BOOST_FUSION_ADAPT_STRUCT(content_type, type, params)

namespace qi = boost::spirit::qi;

/// Qi grammar for one Content-Type header value that fills the flat
/// `parameter` struct (have_name, name, value) instead of a variant.
///
/// qi::attr injects the constant fields: a `name = value` parameter gets
/// have_name == true; a bare token gets have_name == false, an empty name,
/// and the token stored in value.
template<class Iterator>
struct token_grammar : qi::grammar<Iterator, content_type()>
{
    token_grammar() : token_grammar::base_type(content_type_rule)
    {
        using qi::ascii::char_;

        // NOTE(review): `spaces` is declared below as qi::space_type, so the
        // skipper in effect is qi::space — confirm this assignment does anything.
        spaces        = char_(' ');
        // A token is one or more characters that are not separators. Inside
        // the string literal the backslash and the double quote are escaped.
        token         = +~char_( "()<>@,;:\\\"/[]?={} \t");
        // Quoted string: backslash-escaped chars or anything but '"'.
        quoted_string = '"' >> *('\\' >> char_ | ~char_('"')) >> '"';
        value         = quoted_string | token;

        type_subtype_rule = token >> '/' >> token;
        name_only         = qi::attr(false) >> qi::attr("") >> token;
        nvp               = qi::attr(true)  >> token >> '=' >> value;
        // Try name=value first, then fall back to a bare name.
        any_parameter     = ';' >> (nvp | name_only);
        content_type_rule = type_subtype_rule >> qi::skip(spaces) [*any_parameter];

        BOOST_SPIRIT_DEBUG_NODES((nvp)(any_parameter)(content_type_rule)(quoted_string)(token)(value)(type_subtype_rule))
    }

  private:
    using Skipper = qi::space_type;
    Skipper spaces;

    qi::rule<Iterator, parameter(), Skipper> nvp, name_only, any_parameter;
    qi::rule<Iterator, content_type()>       content_type_rule;

    // lexemes
    qi::rule<Iterator, std::string()>        quoted_string, token, value;
    qi::rule<Iterator, type_subtype()>       type_subtype_rule;
};