使用 Boost Spirit Qi 解析特定字符串
Parse a particular string using Boost Spirit Qi
我是 Boost Spirit 的新手,正在努力创建一个正确的表达式来解析以下输入(实际上是某些命令的标准输出的结果):
^+ line-17532.dyn.kponet.fi 2 7 377 1 +1503us[+9103us] +/- 55ms
我需要将其解析为一组字符串和整数并记录在变量中。该行的大部分内容应该被解析为适当类型(字符串或整数)的变量。所以最后,我得到:
string: "^+", "line-17532.dyn.kponet.fi", "+1503us", "+9103us", "55ms"
int : 2, 7, 377, 1
下面这一对值
+1503us[+9103us]
中间也可能带有空格:
+503us[ +103us]
我需要将方括号之前和方括号中的内容放在单独的字符串中。
此外,时间指定可以表示为
ns, ms, us, s
如果能给出处理这种输入的示例,我将不胜感激,因为现有的文档非常零散且不连贯。
大块日志,以及描述各个字段的标题:
MS Name/IP address Stratum Poll Reach LastRx Last sample
===============================================================================
^+ ns2.sdi.fi 2 9 377 381 -1476us[-1688us] +/- 72ms
^+ line-17532.dyn.kponet.fi 2 10 377 309 +302us[ +302us] +/- 59ms
^* heh.fi 2 10 377 319 -1171us[-1387us] +/- 50ms
^+ stara.mulimuli.fi 3 10 377 705 -1253us[-1446us] +/- 73ms
和往常一样,我先勾勒出一个有用的 AST:
namespace AST {
    // Every parsed time span is stored in this clock's native duration type.
    using clock = std::chrono::high_resolution_clock;

    // A signed time offset such as "+1503us": the sign plus the magnitude.
    struct TimeSample {
        enum Direction { up, down };  // + or -
        Direction direction = up;     // defaulted so an unparsed sample is never indeterminate
        clock::duration value{};      // zero until a value has been parsed
    };

    // One parsed line; members are declared in the order the fields appear in the input.
    struct Record {
        std::string prefix;           // "^+"
        std::string fqdn;             // "line-17532.dyn.kponet.fi"
        int a = 0;                    // 2
        int b = 0;                    // 7
        int c = 0;                    // 377
        int d = 0;                    // 1
        TimeSample primary;           // sample in front of the brackets
        TimeSample braced;            // sample inside the brackets
        clock::duration tolerance{};  // value following "+/-"
    };
}
既然已经知道要解析出什么,接下来基本上就是用规则来“照搬” AST 的结构:
// Pull the Qi parsers and placeholders used below into scope.
using namespace qi;
// Entry point: parse one record_, skipping blanks between its tokens.
start = skip(blank) [record_];
// One record: prefix, host name, four integers, a sample, a bracketed sample, then the tolerance.
record_ = prefix_ >> fqdn_ >> int_ >> int_ >> int_ >> int_ >> sample_ >> '[' >> sample_ >> ']' >> tolerance_;
prefix_ = string("^+"); // or whatever you need to match here
fqdn_ = +graph; // or whatever additional constraints you have
// A sample is a sign looked up in direction_ followed by a duration.
sample_ = direction_ >> duration_;
// A duration is an integer multiplied by the unit value looked up in units_.
duration_ = (long_ >> units_) [ _val = _1 * _2 ];
// The tolerance is a duration introduced by the literal "+/-".
tolerance_= "+/-" >> duration_;
当然,有趣的是单位和方向:
// Symbol table that turns the sign character of a sample into a Direction.
struct directions : qi::symbols<char, AST::TimeSample::Direction> {
    directions() {
        add("+", AST::TimeSample::up);
        add("-", AST::TimeSample::down);
    }
} direction_;
// Symbol table that turns a unit suffix into the duration of one such unit.
struct units : qi::symbols<char, AST::clock::duration> {
    units() {
        namespace chr = std::chrono;
        add("s",  chr::seconds(1));
        add("ms", chr::milliseconds(1));
        add("us", chr::microseconds(1));
        add("µs", chr::microseconds(1));
        add("ns", chr::nanoseconds(1));
    }
} units_;
空白字符的处理由跳过器(skipper)负责;我选择 qi::blank_type
作为非词素(non-lexeme)规则的跳过器:
// Skipper type used by the rules that allow blanks between their tokens.
using Skipper = qi::blank_type;
// start declares no skipper of its own.
qi::rule<It, AST::Record()> start;
qi::rule<It, AST::Record(), Skipper> record_;
qi::rule<It, AST::TimeSample(), Skipper> sample_;
qi::rule<It, AST::clock::duration(), Skipper> duration_, tolerance_;
// lexemes: declared without a skipper, so no blanks are skipped inside them
qi::rule<It, std::string()> prefix_;
qi::rule<It, std::string()> fqdn_;
演示
综合使用:
int main() {
std::istringstream iss(R"(^+ line-17532.dyn.kponet.fi 2 7 377 1 +1503us[+9103us] +/- 55ms
)");
std::string line;
while (getline(iss, line)) {
auto f = line.cbegin(), l = line.cend();
AST::Record record;
if (parse(f, l, parser<>{}, record))
std::cout << "parsed: " << boost::fusion::as_vector(record) << "\n";
else
std::cout << "parse error\n";
if (f!=l)
std::cout << "remaining unparsed input: '" << std::string(f,l) << "'\n";
}
}
parsed: (^+ line-17532.dyn.kponet.fi 2 7 377 1 +0.001503s +0.009103s 0.055s)
(下面的调试输出)
完整代码:
#define BOOST_SPIRIT_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/fusion/adapted.hpp>
#include <sstream>
#include <chrono>
namespace std { namespace chrono {
    // Debug-only printer: writes a duration as fractional seconds followed by "s".
    // NOTE(review): the C++ standard does not permit programs to add overloads to
    // namespace std; presumably this lives here so that argument-dependent lookup
    // finds it for duration arguments — revisit before reusing outside a demo.
    std::ostream& operator<<(std::ostream& os, duration<double> d) {
        os << d.count();
        os << "s";
        return os;
    }
} }
namespace AST {
    // Every parsed time span is stored in this clock's native duration type.
    using clock = std::chrono::high_resolution_clock;

    // A signed time offset such as "+1503us": the sign plus the magnitude.
    struct TimeSample {
        enum Direction { up, down };  // + or -
        Direction direction = up;     // defaulted so an unparsed sample is never indeterminate
        clock::duration value{};      // zero until a value has been parsed

        // for debug: prints the sign character that belongs to a Direction
        friend std::ostream& operator<<(std::ostream& os, Direction d) {
            return os << (d == down ? "-" : "+");
        }
        // for debug: prints sign, value in fractional seconds, then "s"
        friend std::ostream& operator<<(std::ostream& os, TimeSample const& sample) {
            return os << sample.direction << std::chrono::duration<double>(sample.value).count() << "s";
        }
    };

    // One parsed line; members are declared in the order the fields appear in the input.
    struct Record {
        std::string prefix;           // "^+"
        std::string fqdn;             // "line-17532.dyn.kponet.fi"
        int a = 0;                    // 2
        int b = 0;                    // 7
        int c = 0;                    // 377
        int d = 0;                    // 1
        TimeSample primary;           // sample in front of the brackets
        TimeSample braced;            // sample inside the brackets
        clock::duration tolerance{};  // value following "+/-"
    };
}
// Adapt both structs as Boost.Fusion sequences so the grammar can fill them;
// the members are listed in the same order in which the rules produce them.
BOOST_FUSION_ADAPT_STRUCT(AST::Record, prefix, fqdn, a, b, c, d, primary, braced, tolerance)
BOOST_FUSION_ADAPT_STRUCT(AST::TimeSample, direction, value)
namespace qi = boost::spirit::qi;
// Grammar for one line of the listing; synthesizes an AST::Record.
// The grammar declares no skipper of its own: start applies skip(blank)
// internally, so it is used with qi::parse rather than qi::phrase_parse.
template <typename It = std::string::const_iterator>
struct parser : qi::grammar<It, AST::Record()> {
    parser() : parser::base_type(start) {
        using namespace qi;

        start      = skip(blank) [record_];
        record_    = prefix_ >> fqdn_ >> int_ >> int_ >> int_ >> int_ >> sample_ >> '[' >> sample_ >> ']' >> tolerance_;
        // Two-character "MS" column: a mode character ('^', '=' or '#') followed
        // by a state character ('*', '+', '-', '?', 'x' or '~'). Matching only
        // the literal "^+" rejected valid rows such as "^* heh.fi ...".
        prefix_    = char_("^=#") >> char_("-*+?x~");
        fqdn_      = +graph; // or whatever additional constraints you have
        // Sign looked up in direction_, followed by the magnitude.
        sample_    = direction_ >> duration_;
        // Integer count multiplied by the unit value looked up in units_.
        duration_  = (long_ >> units_) [ _val = _1 * _2 ];
        tolerance_ = "+/-" >> duration_;

        BOOST_SPIRIT_DEBUG_NODES(
                (start)(record_)
                (prefix_)(fqdn_)(sample_)(duration_)(tolerance_)
            )
    }

  private:
    // Maps the sign character of a sample to a Direction.
    struct directions : qi::symbols<char, AST::TimeSample::Direction> {
        directions() { add("+", AST::TimeSample::up)("-", AST::TimeSample::down); }
    } direction_;

    // Maps a unit suffix to the duration of one such unit.
    struct units : qi::symbols<char, AST::clock::duration> {
        units() {
            using namespace std::literals::chrono_literals;
            add("s", 1s)("ms", 1ms)("us", 1us)("µs", 1us)("ns", 1ns);
        }
    } units_;

    using Skipper = qi::blank_type;
    qi::rule<It, AST::Record()> start;
    qi::rule<It, AST::Record(), Skipper> record_;
    qi::rule<It, AST::TimeSample(), Skipper> sample_;
    qi::rule<It, AST::clock::duration(), Skipper> duration_, tolerance_;
    // lexemes: declared without a skipper, so no blanks are skipped inside them
    qi::rule<It, std::string()> prefix_;
    qi::rule<It, std::string()> fqdn_;
};
int main() {
std::istringstream iss(R"(^+ line-17532.dyn.kponet.fi 2 7 377 1 +1503us[+9103us] +/- 55ms
)");
std::string line;
while (getline(iss, line)) {
auto f = line.cbegin(), l = line.cend();
AST::Record record;
if (parse(f, l, parser<>{}, record))
std::cout << "parsed: " << boost::fusion::as_vector(record) << "\n";
else
std::cout << "parse error\n";
if (f!=l)
std::cout << "remaining unparsed input: '" << std::string(f,l) << "'\n";
}
}
调试输出
<start>
<try>^+ line-17532.dyn.kp</try>
<record_>
<try>^+ line-17532.dyn.kp</try>
<prefix_>
<try>^+ line-17532.dyn.kp</try>
<success> line-17532.dyn.kpon</success>
<attributes>[[^, +]]</attributes>
</prefix_>
<fqdn_>
<try>line-17532.dyn.kpone</try>
<success> 2 7 377 </success>
<attributes>[[l, i, n, e, -, 1, 7, 5, 3, 2, ., d, y, n, ., k, p, o, n, e, t, ., f, i]]</attributes>
</fqdn_>
<sample_>
<try> +1503us[+9103us] </try>
<duration_>
<try>1503us[+9103us] +/- </try>
<success>[+9103us] +/- 55ms</success>
<attributes>[0.001503s]</attributes>
</duration_>
<success>[+9103us] +/- 55ms</success>
<attributes>[[+, 0.001503s]]</attributes>
</sample_>
<sample_>
<try>+9103us] +/- 55ms</try>
<duration_>
<try>9103us] +/- 55ms</try>
<success>] +/- 55ms</success>
<attributes>[0.009103s]</attributes>
</duration_>
<success>] +/- 55ms</success>
<attributes>[[+, 0.009103s]]</attributes>
</sample_>
<tolerance_>
<try> +/- 55ms</try>
<duration_>
<try> 55ms</try>
<success></success>
<attributes>[0.055s]</attributes>
</duration_>
<success></success>
<attributes>[0.055s]</attributes>
</tolerance_>
<success></success>
<attributes>[[[^, +], [l, i, n, e, -, 1, 7, 5, 3, 2, ., d, y, n, ., k, p, o, n, e, t, ., f, i], 2, 7, 377, 1, [+, 0.001503s], [+, 0.009103s], 0.055s]]</attributes>
</record_>
<success></success>
<attributes>[[[^, +], [l, i, n, e, -, 1, 7, 5, 3, 2, ., d, y, n, ., k, p, o, n, e, t, ., f, i], 2, 7, 377, 1, [+, 0.001503s], [+, 0.009103s], 0.055s]]</attributes>
</start>
注意:此答案显示了一种更简单的方法,为 sehe 显示的其他技术奠定了基础。
前言
让我们启用 Spirit 调试输出,这样我们就可以在开发它们时跟踪解析的进度。
#define BOOST_SPIRIT_DEBUG 1
#include <boost/spirit/include/qi.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
namespace qi = boost::spirit::qi;
日志条目数据结构
第一步是定义一个结构来保存已解析的日志条目。
// Holds the fields of one parsed log line, in the order they appear in the input.
// The numeric members are zero-initialized so that a default-constructed entry
// never carries indeterminate values.
struct log_entry_t
{
    std::string element_0;      // marker, e.g. "^+"
    std::string element_1;      // host name, e.g. "line-17532.dyn.kponet.fi"
    uint32_t    element_2 = 0;  // first integer column (2 in the sample line)
    uint32_t    element_3 = 0;  // second integer column (7 in the sample line)
    uint32_t    element_4 = 0;  // third integer column (377 in the sample line)
    uint32_t    element_5 = 0;  // fourth integer column (1 in the sample line)
    std::string element_6;      // sample in front of the brackets, e.g. "+1503us"
    std::string element_7;      // sample inside the brackets, e.g. "+9103us"
    std::string element_8;      // value following "+/-", e.g. "55ms"
};
调整数据结构
为了能够将该结构用作 Spirit 语法的属性,我们需要把它适配(adapt)为 Fusion 元组(更多信息见 Spirit 的教程之一)。这可以通过 BOOST_FUSION_ADAPT_STRUCT 实现。
// Adapt log_entry_t as a Boost.Fusion sequence so the grammar can fill it;
// each entry names a member's type and name, in declaration order.
BOOST_FUSION_ADAPT_STRUCT(
log_entry_t
, (std::string, element_0)
, (std::string, element_1)
, (uint32_t, element_2)
, (uint32_t, element_3)
, (uint32_t, element_4)
, (uint32_t, element_5)
, (std::string, element_6)
, (std::string, element_7)
, (std::string, element_8)
)
日志行语法
接下来,我们定义日志条目的语法。由于各个条目之间可能由空格分隔,我们想使用短语解析(phrase parsing),因此需要指定一个跳过解析器(skipper)。qi::blank_type
是一个合适的跳过器,因为它只匹配空格和制表符。
不过,所有元素都应被视为词素(lexeme),因此我们没有为它们的规则指定任何跳过器。
// Phrase grammar for one log line; fills a log_entry_t.
// Blanks are skipped between elements (skipper: qi::blank_type), while the
// element rules themselves are declared without a skipper and therefore act
// as lexemes.
template <typename Iterator>
struct log_line_parser
    : qi::grammar<Iterator, log_entry_t(), qi::blank_type>
{
    using skipper_t = qi::blank_type;

    log_line_parser()
        : log_line_parser::base_type(log_line)
    {
        element_0 %= qi::string("^+");
        // Dot-separated labels made of letters, digits and '-', captured verbatim.
        element_1 %= qi::raw[(+qi::char_("-a-zA-Z0-9") % qi::char_('.'))];
        element_2 %= qi::uint_;
        element_3 %= qi::uint_;
        element_4 %= qi::uint_;
        element_5 %= qi::uint_;
        // Samples carry an explicit sign. Accept '-' as well as '+': the log
        // contains rows such as "-1476us[-1688us]" which a '+'-only rule rejects.
        element_6 %= qi::raw[qi::char_("+-") >> qi::uint_ >> time_unit];
        element_7 %= qi::raw[qi::char_("+-") >> qi::uint_ >> time_unit];
        element_8 %= qi::raw[qi::uint_ >> time_unit];

        // "s", optionally preceded by one of 'n', 'm' or 'u'.
        time_unit %= -qi::char_("nmu") >> qi::char_('s');

        log_line
            %= element_0
            >> element_1
            >> element_2
            >> element_3
            >> element_4
            >> element_5
            >> element_6
            >> qi::lit('[') >> element_7 >> qi::lit(']')
            >> qi::lit("+/-")
            >> element_8
            ;

        init_debug();
    }

    // Registers every rule for BOOST_SPIRIT_DEBUG tracing.
    void init_debug()
    {
        BOOST_SPIRIT_DEBUG_NODE(element_0);
        BOOST_SPIRIT_DEBUG_NODE(element_1);
        BOOST_SPIRIT_DEBUG_NODE(element_2);
        BOOST_SPIRIT_DEBUG_NODE(element_3);
        BOOST_SPIRIT_DEBUG_NODE(element_4);
        BOOST_SPIRIT_DEBUG_NODE(element_5);
        BOOST_SPIRIT_DEBUG_NODE(element_6);
        BOOST_SPIRIT_DEBUG_NODE(element_7);
        BOOST_SPIRIT_DEBUG_NODE(element_8);
        BOOST_SPIRIT_DEBUG_NODE(time_unit);
        BOOST_SPIRIT_DEBUG_NODE(log_line);
    }

private:
    qi::rule<Iterator, std::string()> element_0;
    qi::rule<Iterator, std::string()> element_1;
    qi::rule<Iterator, uint32_t()> element_2;
    qi::rule<Iterator, uint32_t()> element_3;
    qi::rule<Iterator, uint32_t()> element_4;
    qi::rule<Iterator, uint32_t()> element_5;
    qi::rule<Iterator, std::string()> element_6;
    qi::rule<Iterator, std::string()> element_7;
    qi::rule<Iterator, std::string()> element_8;
    qi::rule<Iterator, std::string()> time_unit;
    qi::rule<Iterator, log_entry_t(), skipper_t> log_line;
};
让我们来看看一些规则:
元素 0 - 这是我们需要匹配的简单字符串。由于我们也希望捕获它,因此需要使用 string 解析器。
元素 1 - 我们可以使用 char_ 解析器来匹配单个字符或一个字符集。+ 解析器运算符表示重复,而 %(列表)解析器运算符让我们解析由分隔符(在本例中是一个点)分隔的若干重复项。
元素 2 - 要解析数字,可以使用现成的数值解析器(numeric parsers)。
元素 6 - 因为我们想把整个序列捕获为一个字符串,所以使用 raw 解析器指令(parser directive)。
要确定使用解析器运算符后得到的属性类型,请参阅复合属性规则(compound attribute rules)的参考文档。
测试函数
// Parses one log line with log_line_parser and prints either the extracted
// fields or the position at which parsing stopped.
// Returns true only when the grammar matched AND consumed the whole line, so
// the return value always agrees with the message that was printed.
bool test(std::string const& log)
{
    std::cout << "Parsing: " << log << "\n\n";

    std::string::const_iterator iter(log.begin());
    std::string::const_iterator const end(log.end());

    log_line_parser<std::string::const_iterator> g;
    log_entry_t entry;
    bool const matched(qi::phrase_parse(iter, end, g, qi::blank, entry));
    // A partial match leaves trailing input behind and counts as a failure.
    bool const ok(matched && (iter == end));

    std::cout << "-------------------------\n";
    if (ok) {
        std::cout << "Parsing succeeded\n";
        std::cout << entry.element_0 << "\n"
            << entry.element_1 << "\n"
            << entry.element_2 << "\n"
            << entry.element_3 << "\n"
            << entry.element_4 << "\n"
            << entry.element_5 << "\n"
            << entry.element_6 << "\n"
            << entry.element_7 << "\n"
            << entry.element_8 << "\n";
    } else {
        // Show at most 30 characters of context. The distance is checked first
        // because advancing an iterator beyond end() is undefined behaviour.
        std::string::const_iterator const some = (end - iter > 30) ? iter + 30 : end;
        std::string const context(iter, some);
        std::cout << "Parsing failed\n";
        std::cout << "stopped at: \": " << context << "...\"\n";
    }
    return ok;
}
主要功能
最后,让我们对解析器运行一些正向和反向测试。
// Runs the positive and negative sample lines through test() and prints
// whether every line behaved as expected.
int main()
{
    struct Case {
        char const* line;   // input handed to test()
        bool should_parse;  // expected return value of test()
    };
    Case const cases[] = {
        {"^+ line-17532.dyn.kponet.fi 2 7 377 1 +1503us[+9103us] +/- 55ms", true},
        {"^+ line-17532.dyn.kponet.fi 2 7 377 1 +1503us[ +9103us] +/- 55ms", true},
        {"^+ line-17532.dyn.kponet.fi 2 7 377 1 +1503ms[+9103ns] +/- 55s", true},
        {"^- line-17532.dyn.kponet.fi 2 7 377 1 +1503us[+9103us] +/- 55ms", false},
        {"^+ line-17532.dyn.kponet.fi 2 7 377 1 +1503us[+9103us] +/- 55 ms", false},
        {"^+ line-17532.dyn.kponet.fi 2 7 377 1 + 1503us[+9103us] +/- 55ms", false},
        {"^+ line-17532.dyn.kponet.fi 2 7 +377 1 +1503us[+9103us] +/- 55ms", false},
        {"^+ line-17532.dyn.kponet.fi 2 7 3 77 1 +1503us[+9103us] +/- 55ms", false},
        {"^+ line-17532.dyn.kponet.fi 2 7 -377 1 +1503us[+9103us] +/- 55ms", false},
    };

    bool result(true);
    for (Case const& c : cases) {
        // Every case is run even after an earlier mismatch.
        bool const parsed = test(c.line);
        result &= (parsed == c.should_parse);
    }

    std::cout << "Test result = " << result << "\n";
    return 0;
}
在大量调试输出之后(以第一个测试为例):
Parsing: ^+ line-17532.dyn.kponet.fi 2 7 377 1 +1503us[+9103us] +/- 55ms
<log_line>
<try>^+ line-17532.dyn.kp</try>
<element_0>
<try>^+ line-17532.dyn.kp</try>
<success> line-17532.dyn.kpon</success>
<attributes>[[^, +]]</attributes>
</element_0>
<element_1>
<try>line-17532.dyn.kpone</try>
<success> 2 7 377 </success>
<attributes>[[l, i, n, e, -, 1, 7, 5, 3, 2, ., d, y, n, ., k, p, o, n, e, t, ., f, i]]</attributes>
</element_1>
<element_2>
<try>2 7 377 1 </try>
<success> 7 377 1 +</success>
<attributes>[2]</attributes>
</element_2>
<element_3>
<try>7 377 1 +150</try>
<success> 377 1 +1503</success>
<attributes>[7]</attributes>
</element_3>
<element_4>
<try>377 1 +1503us[</try>
<success> 1 +1503us[+91</success>
<attributes>[377]</attributes>
</element_4>
<element_5>
<try>1 +1503us[+9103us]</try>
<success> +1503us[+9103us] </success>
<attributes>[1]</attributes>
</element_5>
<element_6>
<try>+1503us[+9103us] +/-</try>
<time_unit>
<try>us[+9103us] +/- 55</try>
<success>[+9103us] +/- 55ms</success>
<attributes>[[u, s]]</attributes>
</time_unit>
<success>[+9103us] +/- 55ms</success>
<attributes>[[+, 1, 5, 0, 3, u, s]]</attributes>
</element_6>
<element_7>
<try>+9103us] +/- 55ms</try>
<time_unit>
<try>us] +/- 55ms</try>
<success>] +/- 55ms</success>
<attributes>[[u, s]]</attributes>
</time_unit>
<success>] +/- 55ms</success>
<attributes>[[+, 9, 1, 0, 3, u, s]]</attributes>
</element_7>
<element_8>
<try>55ms</try>
<time_unit>
<try>ms</try>
<success></success>
<attributes>[[m, s]]</attributes>
</time_unit>
<success></success>
<attributes>[[5, 5, m, s]]</attributes>
</element_8>
<success></success>
<attributes>[[[^, +], [l, i, n, e, -, 1, 7, 5, 3, 2, ., d, y, n, ., k, p, o, n, e, t, ., f, i], 2, 7, 377, 1, [+, 1, 5, 0, 3, u, s], [+, 9, 1, 0, 3, u, s], [5, 5, m, s]]]</attributes>
</log_line>
-------------------------
Parsing succeeded
^+
line-17532.dyn.kponet.fi
2
7
377
1
+1503us
+9103us
55ms
程序打印以下行:
Test result = 1
这是少数几个让我几乎要同情那些声称“C++ 只是徒增复杂性、C 其实更好”的人的时刻之一。这样做确实会失去一些东西,比如类型安全,但不妨看看用 C 的 scanf 来读取会是什么样子:
// Plain-C counterpart of a parsed line. Each text buffer is 256 bytes, which
// matches the 255-character field widths used by the sscanf call plus the
// terminating NUL.
struct record {
    char prefix[256];
    char url[256];
    int a;
    int b;
    int c;
    int d;
    char time1[256];
    char time2[256];
    char time3[256];
};
sscanf(input,
"%255s %255s %d %d %d %d %255[^[][ %255[^]]] +/- %255s",
r.prefix, r.url, &r.a, &r.b, &r.c, &r.d, r.time1, r.time2, r.time3);
当然,这确实有一些潜在的缺点:
- 它读入的是 char 数组,而不是 std::string。
- scanf 及其同类函数不是类型安全的。
- 它不会尝试验证时间后缀。
- 基于 Spirit 的解析器很可能至少会快一些。
如果其中任何一点对您的用途来说确实是严重问题,您可能真的需要换一种方法。不过,考虑到这段代码看起来要做的事情,这些问题中是否有哪一个会造成实际麻烦,并不是很明显。
我是 Boost Spirit 的新手,正在努力创建一个正确的表达式来解析以下输入(实际上是某些命令的标准输出的结果):
^+ line-17532.dyn.kponet.fi 2 7 377 1 +1503us[+9103us] +/- 55ms
我需要将其解析为一组字符串和整数并记录在变量中。该行的大部分内容应该被解析为适当类型(字符串或整数)的变量。所以最后,我得到:
string: "^+", "line-17532.dyn.kponet.fi", "+1503us", "+9103us", "55ms"
int : 2, 7, 377, 1
下面这一对值
+1503us[+9103us]
中间也可能带有空格:
+503us[ +103us]
我需要将方括号之前和方括号中的内容放在单独的字符串中。
此外,时间指定可以表示为
ns, ms, us, s
如果能给出处理这种输入的示例,我将不胜感激,因为现有的文档非常零散且不连贯。
大块日志,以及描述各个字段的标题:
MS Name/IP address Stratum Poll Reach LastRx Last sample
===============================================================================
^+ ns2.sdi.fi 2 9 377 381 -1476us[-1688us] +/- 72ms
^+ line-17532.dyn.kponet.fi 2 10 377 309 +302us[ +302us] +/- 59ms
^* heh.fi 2 10 377 319 -1171us[-1387us] +/- 50ms
^+ stara.mulimuli.fi 3 10 377 705 -1253us[-1446us] +/- 73ms
和往常一样,我先勾勒出一个有用的 AST:
namespace AST {
    // Every parsed time span is stored in this clock's native duration type.
    using clock = std::chrono::high_resolution_clock;

    // A signed time offset such as "+1503us": the sign plus the magnitude.
    struct TimeSample {
        enum Direction { up, down };  // + or -
        Direction direction = up;     // defaulted so an unparsed sample is never indeterminate
        clock::duration value{};      // zero until a value has been parsed
    };

    // One parsed line; members are declared in the order the fields appear in the input.
    struct Record {
        std::string prefix;           // "^+"
        std::string fqdn;             // "line-17532.dyn.kponet.fi"
        int a = 0;                    // 2
        int b = 0;                    // 7
        int c = 0;                    // 377
        int d = 0;                    // 1
        TimeSample primary;           // sample in front of the brackets
        TimeSample braced;            // sample inside the brackets
        clock::duration tolerance{};  // value following "+/-"
    };
}
既然已经知道要解析出什么,接下来基本上就是用规则来“照搬” AST 的结构:
// Pull the Qi parsers and placeholders used below into scope.
using namespace qi;
// Entry point: parse one record_, skipping blanks between its tokens.
start = skip(blank) [record_];
// One record: prefix, host name, four integers, a sample, a bracketed sample, then the tolerance.
record_ = prefix_ >> fqdn_ >> int_ >> int_ >> int_ >> int_ >> sample_ >> '[' >> sample_ >> ']' >> tolerance_;
prefix_ = string("^+"); // or whatever you need to match here
fqdn_ = +graph; // or whatever additional constraints you have
// A sample is a sign looked up in direction_ followed by a duration.
sample_ = direction_ >> duration_;
// A duration is an integer multiplied by the unit value looked up in units_.
duration_ = (long_ >> units_) [ _val = _1 * _2 ];
// The tolerance is a duration introduced by the literal "+/-".
tolerance_= "+/-" >> duration_;
当然,有趣的是单位和方向:
// Symbol table that turns the sign character of a sample into a Direction.
struct directions : qi::symbols<char, AST::TimeSample::Direction> {
    directions() {
        add("+", AST::TimeSample::up);
        add("-", AST::TimeSample::down);
    }
} direction_;
// Symbol table that turns a unit suffix into the duration of one such unit.
struct units : qi::symbols<char, AST::clock::duration> {
    units() {
        namespace chr = std::chrono;
        add("s",  chr::seconds(1));
        add("ms", chr::milliseconds(1));
        add("us", chr::microseconds(1));
        add("µs", chr::microseconds(1));
        add("ns", chr::nanoseconds(1));
    }
} units_;
空白字符的处理由跳过器(skipper)负责;我选择 qi::blank_type
作为非词素(non-lexeme)规则的跳过器:
// Skipper type used by the rules that allow blanks between their tokens.
using Skipper = qi::blank_type;
// start declares no skipper of its own.
qi::rule<It, AST::Record()> start;
qi::rule<It, AST::Record(), Skipper> record_;
qi::rule<It, AST::TimeSample(), Skipper> sample_;
qi::rule<It, AST::clock::duration(), Skipper> duration_, tolerance_;
// lexemes: declared without a skipper, so no blanks are skipped inside them
qi::rule<It, std::string()> prefix_;
qi::rule<It, std::string()> fqdn_;
演示
综合使用:
int main() {
std::istringstream iss(R"(^+ line-17532.dyn.kponet.fi 2 7 377 1 +1503us[+9103us] +/- 55ms
)");
std::string line;
while (getline(iss, line)) {
auto f = line.cbegin(), l = line.cend();
AST::Record record;
if (parse(f, l, parser<>{}, record))
std::cout << "parsed: " << boost::fusion::as_vector(record) << "\n";
else
std::cout << "parse error\n";
if (f!=l)
std::cout << "remaining unparsed input: '" << std::string(f,l) << "'\n";
}
}
parsed: (^+ line-17532.dyn.kponet.fi 2 7 377 1 +0.001503s +0.009103s 0.055s)
(下面的调试输出)
完整代码:
#define BOOST_SPIRIT_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/fusion/adapted.hpp>
#include <sstream>
#include <chrono>
namespace std { namespace chrono {
    // Debug-only printer: writes a duration as fractional seconds followed by "s".
    // NOTE(review): the C++ standard does not permit programs to add overloads to
    // namespace std; presumably this lives here so that argument-dependent lookup
    // finds it for duration arguments — revisit before reusing outside a demo.
    std::ostream& operator<<(std::ostream& os, duration<double> d) {
        os << d.count();
        os << "s";
        return os;
    }
} }
namespace AST {
    // Every parsed time span is stored in this clock's native duration type.
    using clock = std::chrono::high_resolution_clock;

    // A signed time offset such as "+1503us": the sign plus the magnitude.
    struct TimeSample {
        enum Direction { up, down };  // + or -
        Direction direction = up;     // defaulted so an unparsed sample is never indeterminate
        clock::duration value{};      // zero until a value has been parsed

        // for debug: prints the sign character that belongs to a Direction
        friend std::ostream& operator<<(std::ostream& os, Direction d) {
            return os << (d == down ? "-" : "+");
        }
        // for debug: prints sign, value in fractional seconds, then "s"
        friend std::ostream& operator<<(std::ostream& os, TimeSample const& sample) {
            return os << sample.direction << std::chrono::duration<double>(sample.value).count() << "s";
        }
    };

    // One parsed line; members are declared in the order the fields appear in the input.
    struct Record {
        std::string prefix;           // "^+"
        std::string fqdn;             // "line-17532.dyn.kponet.fi"
        int a = 0;                    // 2
        int b = 0;                    // 7
        int c = 0;                    // 377
        int d = 0;                    // 1
        TimeSample primary;           // sample in front of the brackets
        TimeSample braced;            // sample inside the brackets
        clock::duration tolerance{};  // value following "+/-"
    };
}
// Adapt both structs as Boost.Fusion sequences so the grammar can fill them;
// the members are listed in the same order in which the rules produce them.
BOOST_FUSION_ADAPT_STRUCT(AST::Record, prefix, fqdn, a, b, c, d, primary, braced, tolerance)
BOOST_FUSION_ADAPT_STRUCT(AST::TimeSample, direction, value)
namespace qi = boost::spirit::qi;
// Grammar for one line of the listing; synthesizes an AST::Record.
// The grammar declares no skipper of its own: start applies skip(blank)
// internally, so it is used with qi::parse rather than qi::phrase_parse.
template <typename It = std::string::const_iterator>
struct parser : qi::grammar<It, AST::Record()> {
    parser() : parser::base_type(start) {
        using namespace qi;

        start      = skip(blank) [record_];
        record_    = prefix_ >> fqdn_ >> int_ >> int_ >> int_ >> int_ >> sample_ >> '[' >> sample_ >> ']' >> tolerance_;
        // Two-character "MS" column: a mode character ('^', '=' or '#') followed
        // by a state character ('*', '+', '-', '?', 'x' or '~'). Matching only
        // the literal "^+" rejected valid rows such as "^* heh.fi ...".
        prefix_    = char_("^=#") >> char_("-*+?x~");
        fqdn_      = +graph; // or whatever additional constraints you have
        // Sign looked up in direction_, followed by the magnitude.
        sample_    = direction_ >> duration_;
        // Integer count multiplied by the unit value looked up in units_.
        duration_  = (long_ >> units_) [ _val = _1 * _2 ];
        tolerance_ = "+/-" >> duration_;

        BOOST_SPIRIT_DEBUG_NODES(
                (start)(record_)
                (prefix_)(fqdn_)(sample_)(duration_)(tolerance_)
            )
    }

  private:
    // Maps the sign character of a sample to a Direction.
    struct directions : qi::symbols<char, AST::TimeSample::Direction> {
        directions() { add("+", AST::TimeSample::up)("-", AST::TimeSample::down); }
    } direction_;

    // Maps a unit suffix to the duration of one such unit.
    struct units : qi::symbols<char, AST::clock::duration> {
        units() {
            using namespace std::literals::chrono_literals;
            add("s", 1s)("ms", 1ms)("us", 1us)("µs", 1us)("ns", 1ns);
        }
    } units_;

    using Skipper = qi::blank_type;
    qi::rule<It, AST::Record()> start;
    qi::rule<It, AST::Record(), Skipper> record_;
    qi::rule<It, AST::TimeSample(), Skipper> sample_;
    qi::rule<It, AST::clock::duration(), Skipper> duration_, tolerance_;
    // lexemes: declared without a skipper, so no blanks are skipped inside them
    qi::rule<It, std::string()> prefix_;
    qi::rule<It, std::string()> fqdn_;
};
int main() {
std::istringstream iss(R"(^+ line-17532.dyn.kponet.fi 2 7 377 1 +1503us[+9103us] +/- 55ms
)");
std::string line;
while (getline(iss, line)) {
auto f = line.cbegin(), l = line.cend();
AST::Record record;
if (parse(f, l, parser<>{}, record))
std::cout << "parsed: " << boost::fusion::as_vector(record) << "\n";
else
std::cout << "parse error\n";
if (f!=l)
std::cout << "remaining unparsed input: '" << std::string(f,l) << "'\n";
}
}
调试输出
<start>
<try>^+ line-17532.dyn.kp</try>
<record_>
<try>^+ line-17532.dyn.kp</try>
<prefix_>
<try>^+ line-17532.dyn.kp</try>
<success> line-17532.dyn.kpon</success>
<attributes>[[^, +]]</attributes>
</prefix_>
<fqdn_>
<try>line-17532.dyn.kpone</try>
<success> 2 7 377 </success>
<attributes>[[l, i, n, e, -, 1, 7, 5, 3, 2, ., d, y, n, ., k, p, o, n, e, t, ., f, i]]</attributes>
</fqdn_>
<sample_>
<try> +1503us[+9103us] </try>
<duration_>
<try>1503us[+9103us] +/- </try>
<success>[+9103us] +/- 55ms</success>
<attributes>[0.001503s]</attributes>
</duration_>
<success>[+9103us] +/- 55ms</success>
<attributes>[[+, 0.001503s]]</attributes>
</sample_>
<sample_>
<try>+9103us] +/- 55ms</try>
<duration_>
<try>9103us] +/- 55ms</try>
<success>] +/- 55ms</success>
<attributes>[0.009103s]</attributes>
</duration_>
<success>] +/- 55ms</success>
<attributes>[[+, 0.009103s]]</attributes>
</sample_>
<tolerance_>
<try> +/- 55ms</try>
<duration_>
<try> 55ms</try>
<success></success>
<attributes>[0.055s]</attributes>
</duration_>
<success></success>
<attributes>[0.055s]</attributes>
</tolerance_>
<success></success>
<attributes>[[[^, +], [l, i, n, e, -, 1, 7, 5, 3, 2, ., d, y, n, ., k, p, o, n, e, t, ., f, i], 2, 7, 377, 1, [+, 0.001503s], [+, 0.009103s], 0.055s]]</attributes>
</record_>
<success></success>
<attributes>[[[^, +], [l, i, n, e, -, 1, 7, 5, 3, 2, ., d, y, n, ., k, p, o, n, e, t, ., f, i], 2, 7, 377, 1, [+, 0.001503s], [+, 0.009103s], 0.055s]]</attributes>
</start>
注意:此答案显示了一种更简单的方法,为 sehe 显示的其他技术奠定了基础。
前言
让我们启用 Spirit 调试输出,这样我们就可以在开发它们时跟踪解析的进度。
#define BOOST_SPIRIT_DEBUG 1
#include <boost/spirit/include/qi.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
namespace qi = boost::spirit::qi;
日志条目数据结构
第一步是定义一个结构来保存已解析的日志条目。
// Holds the fields of one parsed log line, in the order they appear in the input.
// The numeric members are zero-initialized so that a default-constructed entry
// never carries indeterminate values.
struct log_entry_t
{
    std::string element_0;      // marker, e.g. "^+"
    std::string element_1;      // host name, e.g. "line-17532.dyn.kponet.fi"
    uint32_t    element_2 = 0;  // first integer column (2 in the sample line)
    uint32_t    element_3 = 0;  // second integer column (7 in the sample line)
    uint32_t    element_4 = 0;  // third integer column (377 in the sample line)
    uint32_t    element_5 = 0;  // fourth integer column (1 in the sample line)
    std::string element_6;      // sample in front of the brackets, e.g. "+1503us"
    std::string element_7;      // sample inside the brackets, e.g. "+9103us"
    std::string element_8;      // value following "+/-", e.g. "55ms"
};
调整数据结构
为了能够将该结构用作 Spirit 语法的属性,我们需要把它适配(adapt)为 Fusion 元组(更多信息见 Spirit 的教程之一)。这可以通过 BOOST_FUSION_ADAPT_STRUCT 实现。
// Adapt log_entry_t as a Boost.Fusion sequence so the grammar can fill it;
// each entry names a member's type and name, in declaration order.
BOOST_FUSION_ADAPT_STRUCT(
log_entry_t
, (std::string, element_0)
, (std::string, element_1)
, (uint32_t, element_2)
, (uint32_t, element_3)
, (uint32_t, element_4)
, (uint32_t, element_5)
, (std::string, element_6)
, (std::string, element_7)
, (std::string, element_8)
)
日志行语法
接下来,我们定义日志条目的语法。由于各个条目之间可能由空格分隔,我们想使用短语解析(phrase parsing),因此需要指定一个跳过解析器(skipper)。qi::blank_type
是一个合适的跳过器,因为它只匹配空格和制表符。
不过,所有元素都应被视为词素(lexeme),因此我们没有为它们的规则指定任何跳过器。
// Phrase grammar for one log line; fills a log_entry_t.
// Blanks are skipped between elements (skipper: qi::blank_type), while the
// element rules themselves are declared without a skipper and therefore act
// as lexemes.
template <typename Iterator>
struct log_line_parser
    : qi::grammar<Iterator, log_entry_t(), qi::blank_type>
{
    using skipper_t = qi::blank_type;

    log_line_parser()
        : log_line_parser::base_type(log_line)
    {
        element_0 %= qi::string("^+");
        // Dot-separated labels made of letters, digits and '-', captured verbatim.
        element_1 %= qi::raw[(+qi::char_("-a-zA-Z0-9") % qi::char_('.'))];
        element_2 %= qi::uint_;
        element_3 %= qi::uint_;
        element_4 %= qi::uint_;
        element_5 %= qi::uint_;
        // Samples carry an explicit sign. Accept '-' as well as '+': the log
        // contains rows such as "-1476us[-1688us]" which a '+'-only rule rejects.
        element_6 %= qi::raw[qi::char_("+-") >> qi::uint_ >> time_unit];
        element_7 %= qi::raw[qi::char_("+-") >> qi::uint_ >> time_unit];
        element_8 %= qi::raw[qi::uint_ >> time_unit];

        // "s", optionally preceded by one of 'n', 'm' or 'u'.
        time_unit %= -qi::char_("nmu") >> qi::char_('s');

        log_line
            %= element_0
            >> element_1
            >> element_2
            >> element_3
            >> element_4
            >> element_5
            >> element_6
            >> qi::lit('[') >> element_7 >> qi::lit(']')
            >> qi::lit("+/-")
            >> element_8
            ;

        init_debug();
    }

    // Registers every rule for BOOST_SPIRIT_DEBUG tracing.
    void init_debug()
    {
        BOOST_SPIRIT_DEBUG_NODE(element_0);
        BOOST_SPIRIT_DEBUG_NODE(element_1);
        BOOST_SPIRIT_DEBUG_NODE(element_2);
        BOOST_SPIRIT_DEBUG_NODE(element_3);
        BOOST_SPIRIT_DEBUG_NODE(element_4);
        BOOST_SPIRIT_DEBUG_NODE(element_5);
        BOOST_SPIRIT_DEBUG_NODE(element_6);
        BOOST_SPIRIT_DEBUG_NODE(element_7);
        BOOST_SPIRIT_DEBUG_NODE(element_8);
        BOOST_SPIRIT_DEBUG_NODE(time_unit);
        BOOST_SPIRIT_DEBUG_NODE(log_line);
    }

private:
    qi::rule<Iterator, std::string()> element_0;
    qi::rule<Iterator, std::string()> element_1;
    qi::rule<Iterator, uint32_t()> element_2;
    qi::rule<Iterator, uint32_t()> element_3;
    qi::rule<Iterator, uint32_t()> element_4;
    qi::rule<Iterator, uint32_t()> element_5;
    qi::rule<Iterator, std::string()> element_6;
    qi::rule<Iterator, std::string()> element_7;
    qi::rule<Iterator, std::string()> element_8;
    qi::rule<Iterator, std::string()> time_unit;
    qi::rule<Iterator, log_entry_t(), skipper_t> log_line;
};
让我们来看看一些规则:
元素 0 - 这是我们需要匹配的简单字符串。由于我们也希望捕获它,因此需要使用 string 解析器。
元素 1 - 我们可以使用 char_ 解析器来匹配单个字符或一个字符集。+ 解析器运算符表示重复,而 %(列表)解析器运算符让我们解析由分隔符(在本例中是一个点)分隔的若干重复项。
元素 2 - 要解析数字,可以使用现成的数值解析器(numeric parsers)。
元素 6 - 因为我们想把整个序列捕获为一个字符串,所以使用 raw 解析器指令(parser directive)。
要确定使用解析器运算符后得到的属性类型,请参阅复合属性规则(compound attribute rules)的参考文档。
测试函数
// Parses one log line with log_line_parser and prints either the extracted
// fields or the position at which parsing stopped.
// Returns true only when the grammar matched AND consumed the whole line, so
// the return value always agrees with the message that was printed.
bool test(std::string const& log)
{
    std::cout << "Parsing: " << log << "\n\n";

    std::string::const_iterator iter(log.begin());
    std::string::const_iterator const end(log.end());

    log_line_parser<std::string::const_iterator> g;
    log_entry_t entry;
    bool const matched(qi::phrase_parse(iter, end, g, qi::blank, entry));
    // A partial match leaves trailing input behind and counts as a failure.
    bool const ok(matched && (iter == end));

    std::cout << "-------------------------\n";
    if (ok) {
        std::cout << "Parsing succeeded\n";
        std::cout << entry.element_0 << "\n"
            << entry.element_1 << "\n"
            << entry.element_2 << "\n"
            << entry.element_3 << "\n"
            << entry.element_4 << "\n"
            << entry.element_5 << "\n"
            << entry.element_6 << "\n"
            << entry.element_7 << "\n"
            << entry.element_8 << "\n";
    } else {
        // Show at most 30 characters of context. The distance is checked first
        // because advancing an iterator beyond end() is undefined behaviour.
        std::string::const_iterator const some = (end - iter > 30) ? iter + 30 : end;
        std::string const context(iter, some);
        std::cout << "Parsing failed\n";
        std::cout << "stopped at: \": " << context << "...\"\n";
    }
    return ok;
}
主要功能
最后,让我们对解析器运行一些正向和反向测试。
// Runs the positive and negative sample lines through test() and prints
// whether every line behaved as expected.
int main()
{
    struct Case {
        char const* line;   // input handed to test()
        bool should_parse;  // expected return value of test()
    };
    Case const cases[] = {
        {"^+ line-17532.dyn.kponet.fi 2 7 377 1 +1503us[+9103us] +/- 55ms", true},
        {"^+ line-17532.dyn.kponet.fi 2 7 377 1 +1503us[ +9103us] +/- 55ms", true},
        {"^+ line-17532.dyn.kponet.fi 2 7 377 1 +1503ms[+9103ns] +/- 55s", true},
        {"^- line-17532.dyn.kponet.fi 2 7 377 1 +1503us[+9103us] +/- 55ms", false},
        {"^+ line-17532.dyn.kponet.fi 2 7 377 1 +1503us[+9103us] +/- 55 ms", false},
        {"^+ line-17532.dyn.kponet.fi 2 7 377 1 + 1503us[+9103us] +/- 55ms", false},
        {"^+ line-17532.dyn.kponet.fi 2 7 +377 1 +1503us[+9103us] +/- 55ms", false},
        {"^+ line-17532.dyn.kponet.fi 2 7 3 77 1 +1503us[+9103us] +/- 55ms", false},
        {"^+ line-17532.dyn.kponet.fi 2 7 -377 1 +1503us[+9103us] +/- 55ms", false},
    };

    bool result(true);
    for (Case const& c : cases) {
        // Every case is run even after an earlier mismatch.
        bool const parsed = test(c.line);
        result &= (parsed == c.should_parse);
    }

    std::cout << "Test result = " << result << "\n";
    return 0;
}
在大量调试输出之后(以第一个测试为例):
Parsing: ^+ line-17532.dyn.kponet.fi 2 7 377 1 +1503us[+9103us] +/- 55ms
<log_line>
<try>^+ line-17532.dyn.kp</try>
<element_0>
<try>^+ line-17532.dyn.kp</try>
<success> line-17532.dyn.kpon</success>
<attributes>[[^, +]]</attributes>
</element_0>
<element_1>
<try>line-17532.dyn.kpone</try>
<success> 2 7 377 </success>
<attributes>[[l, i, n, e, -, 1, 7, 5, 3, 2, ., d, y, n, ., k, p, o, n, e, t, ., f, i]]</attributes>
</element_1>
<element_2>
<try>2 7 377 1 </try>
<success> 7 377 1 +</success>
<attributes>[2]</attributes>
</element_2>
<element_3>
<try>7 377 1 +150</try>
<success> 377 1 +1503</success>
<attributes>[7]</attributes>
</element_3>
<element_4>
<try>377 1 +1503us[</try>
<success> 1 +1503us[+91</success>
<attributes>[377]</attributes>
</element_4>
<element_5>
<try>1 +1503us[+9103us]</try>
<success> +1503us[+9103us] </success>
<attributes>[1]</attributes>
</element_5>
<element_6>
<try>+1503us[+9103us] +/-</try>
<time_unit>
<try>us[+9103us] +/- 55</try>
<success>[+9103us] +/- 55ms</success>
<attributes>[[u, s]]</attributes>
</time_unit>
<success>[+9103us] +/- 55ms</success>
<attributes>[[+, 1, 5, 0, 3, u, s]]</attributes>
</element_6>
<element_7>
<try>+9103us] +/- 55ms</try>
<time_unit>
<try>us] +/- 55ms</try>
<success>] +/- 55ms</success>
<attributes>[[u, s]]</attributes>
</time_unit>
<success>] +/- 55ms</success>
<attributes>[[+, 9, 1, 0, 3, u, s]]</attributes>
</element_7>
<element_8>
<try>55ms</try>
<time_unit>
<try>ms</try>
<success></success>
<attributes>[[m, s]]</attributes>
</time_unit>
<success></success>
<attributes>[[5, 5, m, s]]</attributes>
</element_8>
<success></success>
<attributes>[[[^, +], [l, i, n, e, -, 1, 7, 5, 3, 2, ., d, y, n, ., k, p, o, n, e, t, ., f, i], 2, 7, 377, 1, [+, 1, 5, 0, 3, u, s], [+, 9, 1, 0, 3, u, s], [5, 5, m, s]]]</attributes>
</log_line>
-------------------------
Parsing succeeded
^+
line-17532.dyn.kponet.fi
2
7
377
1
+1503us
+9103us
55ms
程序打印以下行:
Test result = 1
这是少数几个让我几乎要同情那些声称“C++ 只是徒增复杂性、C 其实更好”的人的时刻之一。这样做确实会失去一些东西,比如类型安全,但不妨看看用 C 的 scanf 来读取会是什么样子:
// Plain-C counterpart of a parsed line. Each text buffer is 256 bytes, which
// matches the 255-character field widths used by the sscanf call plus the
// terminating NUL.
struct record {
    char prefix[256];
    char url[256];
    int a;
    int b;
    int c;
    int d;
    char time1[256];
    char time2[256];
    char time3[256];
};
sscanf(input,
"%255s %255s %d %d %d %d %255[^[][ %255[^]]] +/- %255s",
r.prefix, r.url, &r.a, &r.b, &r.c, &r.d, r.time1, r.time2, r.time3);
当然,这确实有一些潜在的缺点:
- 它读入的是 char 数组,而不是 std::string。
- scanf 及其同类函数不是类型安全的。
- 它不会尝试验证时间后缀。
- 基于 Spirit 的解析器很可能至少会快一些。
如果其中任何一点对您的用途来说确实是严重问题,您可能真的需要换一种方法。不过,考虑到这段代码看起来要做的事情,这些问题中是否有哪一个会造成实际麻烦,并不是很明显。