Boost Spirit 在解析带括号的表达式时发生核心转储(core dump)
Boost spirit core dump on parsing bracketed expression
有一些应该解析终端文字序列的简化语法:id、'<'、'>' 和“:action”。
我需要允许使用括号 '(' ')',其作用仅是提高可读性。(完整示例见 http://coliru.stacked-crooked.com/a/dca93f5c8f37a889 )
我的语法片段:
start = expression % eol;
expression = (simple_def >> -expression)
| (qi::lit('(') > expression > ')');
simple_def = qi::lit('<') [qi::_val = Command::left]
| qi::lit('>') [qi::_val = Command::right]
| key [qi::_val = Command::id]
| qi::lit(":action") [qi::_val = Command::action]
;
key = +qi::char_("a-zA-Z_0-9");
当我尝试解析时:const std::string s = "(a1 > :action)";
一切都很顺利。
但是当我用括号 "(a1 (>) :action)"
带来更多复杂性时,我得到了 coredump。仅供参考 - coredump 发生在 coliru,而 msvc 编译的示例仅演示失败解析。
所以我的问题是:(1) 括号有什么问题,(2) 如何将括号引入表达式。
p.s。它是简化的语法,实际上我有更复杂的情况,但这是一个最小的可重现代码。
您只需要处理期望失败(expectation failure)异常即可:
terminate called after throwing an instance of 'boost::wrapexcept<boost::spir
it::qi::expectation_failure<__gnu_cxx::__normal_iterator<char const*, std::__
cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >
>'
what(): boost::spirit::qi::expectation_failure
Aborted (core dumped)
如果您处理预期失败,程序将不必终止。
修正语法
您的 'nested expression' 规则只接受一个表达式。我认为
expression = (simple_def >> -expression)
旨在匹配“1 个或多个 `simple_def`”。但是,另一个备选分支:
| ('(' > expression > ')');
并不接受同样的内容:它在解析完 `)` 之后就停止了。这意味着按照该语法,您的输入根本就是无效的。
我建议通过明确表达意图来简化。您在定义语义类型方面已经走在了正确的道路上。让我们避免含糊不清的“Line Of Lines”(那到底是什么?):
using Id = std::string;
using Line = std::vector<Command>;
using Script = std::vector<Line>;
并始终如一地使用这些 typedef。现在,我们可以表达我们“思考”的语法:
start = skip(blank)[script];
script = line % eol;
line = +simple;
simple = group | command;
group = '(' > line > ')';
看,通过简化我们的心智模型并坚持下去,我们避免了您很难发现的整个问题。
这是一个快速演示,其中包括错误处理、可选的调试输出、测试用例,并且把跳过器(skipper)封装在语法内部,因为它本来就是语法的一部分:Live On Compiler Explorer
#include <fmt/ranges.h>
#include <fmt/ostream.h>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
namespace qi = boost::spirit::qi;
namespace phx = boost::phoenix;
enum class Command { id, left, right, action };

/// Writes a bracketed, upper-case tag naming `cmd` to `os`.
///
/// A value that is not one of the four enumerators is written as "[???]".
/// @return `os`, so that insertions can be chained.
static inline std::ostream& operator<<(std::ostream& os, Command cmd) {
    const char* tag = "[???]"; // fallback for values outside the enumeration
    if (cmd == Command::id) {
        tag = "[ID]";
    } else if (cmd == Command::left) {
        tag = "[LEFT]";
    } else if (cmd == Command::right) {
        tag = "[RIGHT]";
    } else if (cmd == Command::action) {
        tag = "[ACTION]";
    }
    os << tag;
    return os;
}
using Id = std::string;
using Line = std::vector<Command>;
using Script = std::vector<Line>;
// Spirit Qi grammar that parses a whole Script (a vector of Line, each Line a
// vector of Command) from an iterator range of type It.
// The grammar is declared without a skipper: `start` installs the blank
// skipper itself, so callers can use plain qi::parse.
template <typename It>
struct ExprGrammar : qi::grammar<It, Script()> {
ExprGrammar() : ExprGrammar::base_type(start) {
using namespace qi;
// Entry point: run `script` with qi::blank as the skipper.
start = skip(blank)[script];
// A script is a list of lines separated by end-of-line.
script = line % eol;
// A line is one or more `simple` elements.
line = +simple;
// Each element is either a parenthesized group or a single command.
simple = group | command;
// A group is a line wrapped in parentheses.
// `>` is the expectation operator: once '(' has matched, a failure of
// `line` or ')' throws qi::expectation_failure instead of backtracking.
group = '(' > line > ')';
// One token per alternative; each semantic action stores the matching
// Command value in the rule's attribute (_val).
command =
lit('<') [ _val = Command::left ] |
lit('>') [ _val = Command::right ] |
key [ _val = Command::id ] |
lit(":action") [ _val = Command::action ] ;
// Identifier: one or more ASCII letters, digits or underscores.
key = +char_("a-zA-Z_0-9");
// Registers the listed rules for Spirit's debug tracing.
// NOTE(review): presumably only active when BOOST_SPIRIT_DEBUG is defined - confirm.
BOOST_SPIRIT_DEBUG_NODES((command)(line)(simple)(group)(script)(key));
}
private:
// Skipper-less entry rule (it applies the skipper to `script` itself).
qi::rule<It, Script()> start;
// Rules below are declared with qi::blank_type as their skipper.
qi::rule<It, Line(), qi::blank_type> line, simple, group;
qi::rule<It, Script(), qi::blank_type> script;
qi::rule<It, Command(), qi::blank_type> command;
// lexemes: declared without a skipper, so nothing is skipped inside a key
qi::rule<It, Id()> key;
};
int main() {
using It = std::string::const_iterator;
ExprGrammar<It> const p;
for (const std::string s : {
"a1 > :action\na1 (>) :action",
"(a1 > :action)\n(a1 (>) :action)",
"a1 (> :action)",
}) {
It f(begin(s)), l(end(s));
try {
Script parsed;
bool ok = qi::parse(f, l, p, parsed);
if (ok) {
fmt::print("Parsed {}\n", parsed);
} else {
fmt::print("Parsed failed\n");
}
if (f != l) {
fmt::print("Remaining unparsed: '{}'\n", std::string(f, l));
}
} catch (qi::expectation_failure<It> const& ef) {
fmt::print("{}\n", ef.what()); // TODO add more details :)
}
}
}
输出
Parsed {{[ID], [RIGHT], [ACTION]}, {[ID], [RIGHT], [ACTION]}}
Parsed {{[ID], [RIGHT], [ACTION]}, {[ID], [RIGHT], [ACTION]}}
Parsed {{[ID], [RIGHT], [ACTION]}}
附加内容
不过,我认为对命令使用 qi::symbols 可以大大简化这一切。事实上,看起来您只是在做词法标记化(您说括号并不重要,这也印证了这一点)。
line = +simple;
simple = group | command | (omit[key] >> attr(Command::id));
group = '(' > line > ')';
key = +char_("a-zA-Z_0-9");
现在你根本不需要 Phoenix:Live On Compiler Explorer,打印
ok? true {{[ID], [RIGHT], [ACTION]}, {[ID], [RIGHT], [ACTION]}}
ok? true {{[ID], [RIGHT], [ACTION]}, {[ID], [RIGHT], [ACTION]}}
ok? true {{[ID], [RIGHT], [ACTION]}}
更简单?
因为我观察到您基本上是按行标记化,为什么不简单地跳过括号,并一直简化为:
script = line % eol;
line = *(command | omit[key] >> attr(Command::id));
就这些。再次参见 Live On Compiler Explorer:
#include <boost/spirit/include/qi.hpp>
#include <fmt/ostream.h>
#include <fmt/ranges.h>
namespace qi = boost::spirit::qi;
enum class Command { id, left, right, action };
using Id = std::string;
using Line = std::vector<Command>;
using Script = std::vector<Line>;

/// Writes the upper-case name of `cmd` ("ID", "LEFT", "RIGHT" or "ACTION") to `os`.
///
/// The name is looked up with a bounds-checked std::array::at, so a value
/// outside the four enumerators throws std::out_of_range before anything is
/// written.
/// @return `os`, so that insertions can be chained.
static inline std::ostream& operator<<(std::ostream& os, Command cmd) {
    static constexpr std::array<const char*, 4> names{{"ID", "LEFT", "RIGHT", "ACTION"}};
    const auto index = static_cast<int>(cmd);
    os << names.at(index);
    return os;
}
// Simplified Spirit Qi grammar that tokenizes a Script line by line.
// Parentheses are not parsed as structure here: they are part of the skipper's
// character set and are therefore skipped just like blanks.
template <typename It>
struct ExprGrammar : qi::grammar<It, Script()> {
ExprGrammar() : ExprGrammar::base_type(start) {
using namespace qi;
// Entry point: lines separated by end-of-line, parsed under the custom skipper.
// NOTE(review): alias() presumably makes the directive refer to the member
// rule rather than copy it - confirm against the Spirit docs.
start = skip(skipper.alias())[line % eol];
// Zero or more tokens per line (so an empty line is accepted). A token is
// either a command symbol, or a key whose text is discarded (omit[]) and
// which contributes Command::id via attr(). `>>` binds tighter than `|`.
line = *(command | omit[key] >> attr(Command::id));
// Identifier: one or more ASCII letters, digits or underscores.
key = +char_("a-zA-Z_0-9");
// Registers the listed rules for Spirit's debug tracing.
// NOTE(review): presumably only active when BOOST_SPIRIT_DEBUG is defined - confirm.
BOOST_SPIRIT_DEBUG_NODES((line)(key));
}
private:
// The skipper is itself a rule with no attribute and no skipper.
using Skipper = qi::rule<It>;
// Skipper-less entry rule (it applies the skipper itself).
qi::rule<It, Script()> start;
qi::rule<It, Line(), Skipper> line;
// Skipped characters: space, tab, backspace, form feed and both parentheses.
Skipper skipper = qi::char_(" \t\b\f()");
qi::rule<It /*, Id()*/> key; // omit attribute for efficiency
// Symbol table mapping each command token to its Command value.
struct cmdsym : qi::symbols<char, Command> {
cmdsym() { this->add("<", Command::left)
(">", Command::right)
(":action", Command::action);
}
} command;
};
int main() {
using It = std::string::const_iterator;
ExprGrammar<It> const p;
for (const std::string s : {
"a1 > :action\na1 (>) :action",
"(a1 > :action)\n(a1 (>) :action)",
"a1 (> :action)",
})
try {
It f(begin(s)), l(end(s));
Script parsed;
bool ok = qi::parse(f, l, p, parsed);
fmt::print("ok? {} {}\n", ok, parsed);
if (f != l)
fmt::print(" -- Remaining '{}'\n", std::string(f, l));
} catch (qi::expectation_failure<It> const& ef) {
fmt::print("{}\n", ef.what()); // TODO add more details :)
}
}
输出
ok? true {{ID, RIGHT, ACTION}, {ID, RIGHT, ACTION}}
ok? true {{ID, RIGHT, ACTION}, {ID, RIGHT, ACTION}}
ok? true {{ID, RIGHT, ACTION}}
Note I very subtly changed +() to *() so it would accept empty lines as well. This may or may not be what you want
有一个简化的语法,用于解析由终结符字面量组成的序列:id、'<'、'>' 和 “:action”。我需要允许使用括号 '(' ')',其作用仅是提高可读性。(完整示例见 http://coliru.stacked-crooked.com/a/dca93f5c8f37a889 )我的语法片段:
start = expression % eol;
expression = (simple_def >> -expression)
| (qi::lit('(') > expression > ')');
simple_def = qi::lit('<') [qi::_val = Command::left]
| qi::lit('>') [qi::_val = Command::right]
| key [qi::_val = Command::id]
| qi::lit(":action") [qi::_val = Command::action]
;
key = +qi::char_("a-zA-Z_0-9");
当我尝试解析时:const std::string s = "(a1 > :action)";
一切都很顺利。
但是当我用括号 "(a1 (>) :action)"
带来更多复杂性时,我得到了 coredump。仅供参考 - coredump 发生在 coliru,而 msvc 编译的示例仅演示失败解析。
所以我的问题是:(1) 括号有什么问题,(2) 如何将括号引入表达式。
p.s。它是简化的语法,实际上我有更复杂的情况,但这是一个最小的可重现代码。
您只需要处理期望失败(expectation failure)异常即可:
terminate called after throwing an instance of 'boost::wrapexcept<boost::spir
it::qi::expectation_failure<__gnu_cxx::__normal_iterator<char const*, std::__
cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >
>'
what(): boost::spirit::qi::expectation_failure
Aborted (core dumped)
如果您处理预期失败,程序将不必终止。
修正语法
您的 'nested expression' 规则只接受一个表达式。我认为
expression = (simple_def >> -expression)
旨在匹配“1 个或多个 `simple_def`”。但是,另一个备选分支:
| ('(' > expression > ')');
并不接受同样的内容:它在解析完 `)` 之后就停止了。这意味着按照该语法,您的输入根本就是无效的。
我建议通过明确表达意图来简化。您在定义语义类型方面已经走在了正确的道路上。让我们避免含糊不清的“Line Of Lines”(那到底是什么?):
using Id = std::string;
using Line = std::vector<Command>;
using Script = std::vector<Line>;
并始终如一地使用这些 typedef。现在,我们可以表达我们“思考”的语法:
start = skip(blank)[script];
script = line % eol;
line = +simple;
simple = group | command;
group = '(' > line > ')';
看,通过简化我们的心智模型并坚持下去,我们避免了您很难发现的整个问题。
这是一个快速演示,其中包括错误处理、可选的调试输出、测试用例,并且把跳过器(skipper)封装在语法内部,因为它本来就是语法的一部分:Live On Compiler Explorer
#include <fmt/ranges.h>
#include <fmt/ostream.h>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
namespace qi = boost::spirit::qi;
namespace phx = boost::phoenix;
enum class Command { id, left, right, action };

/// Writes a bracketed, upper-case tag naming `cmd` to `os`.
///
/// A value that is not one of the four enumerators is written as "[???]".
/// @return `os`, so that insertions can be chained.
static inline std::ostream& operator<<(std::ostream& os, Command cmd) {
    const char* tag = "[???]"; // fallback for values outside the enumeration
    if (cmd == Command::id) {
        tag = "[ID]";
    } else if (cmd == Command::left) {
        tag = "[LEFT]";
    } else if (cmd == Command::right) {
        tag = "[RIGHT]";
    } else if (cmd == Command::action) {
        tag = "[ACTION]";
    }
    os << tag;
    return os;
}
using Id = std::string;
using Line = std::vector<Command>;
using Script = std::vector<Line>;
// Spirit Qi grammar that parses a whole Script (a vector of Line, each Line a
// vector of Command) from an iterator range of type It.
// The grammar is declared without a skipper: `start` installs the blank
// skipper itself, so callers can use plain qi::parse.
template <typename It>
struct ExprGrammar : qi::grammar<It, Script()> {
ExprGrammar() : ExprGrammar::base_type(start) {
using namespace qi;
// Entry point: run `script` with qi::blank as the skipper.
start = skip(blank)[script];
// A script is a list of lines separated by end-of-line.
script = line % eol;
// A line is one or more `simple` elements.
line = +simple;
// Each element is either a parenthesized group or a single command.
simple = group | command;
// A group is a line wrapped in parentheses.
// `>` is the expectation operator: once '(' has matched, a failure of
// `line` or ')' throws qi::expectation_failure instead of backtracking.
group = '(' > line > ')';
// One token per alternative; each semantic action stores the matching
// Command value in the rule's attribute (_val).
command =
lit('<') [ _val = Command::left ] |
lit('>') [ _val = Command::right ] |
key [ _val = Command::id ] |
lit(":action") [ _val = Command::action ] ;
// Identifier: one or more ASCII letters, digits or underscores.
key = +char_("a-zA-Z_0-9");
// Registers the listed rules for Spirit's debug tracing.
// NOTE(review): presumably only active when BOOST_SPIRIT_DEBUG is defined - confirm.
BOOST_SPIRIT_DEBUG_NODES((command)(line)(simple)(group)(script)(key));
}
private:
// Skipper-less entry rule (it applies the skipper to `script` itself).
qi::rule<It, Script()> start;
// Rules below are declared with qi::blank_type as their skipper.
qi::rule<It, Line(), qi::blank_type> line, simple, group;
qi::rule<It, Script(), qi::blank_type> script;
qi::rule<It, Command(), qi::blank_type> command;
// lexemes: declared without a skipper, so nothing is skipped inside a key
qi::rule<It, Id()> key;
};
int main() {
using It = std::string::const_iterator;
ExprGrammar<It> const p;
for (const std::string s : {
"a1 > :action\na1 (>) :action",
"(a1 > :action)\n(a1 (>) :action)",
"a1 (> :action)",
}) {
It f(begin(s)), l(end(s));
try {
Script parsed;
bool ok = qi::parse(f, l, p, parsed);
if (ok) {
fmt::print("Parsed {}\n", parsed);
} else {
fmt::print("Parsed failed\n");
}
if (f != l) {
fmt::print("Remaining unparsed: '{}'\n", std::string(f, l));
}
} catch (qi::expectation_failure<It> const& ef) {
fmt::print("{}\n", ef.what()); // TODO add more details :)
}
}
}
输出
Parsed {{[ID], [RIGHT], [ACTION]}, {[ID], [RIGHT], [ACTION]}}
Parsed {{[ID], [RIGHT], [ACTION]}, {[ID], [RIGHT], [ACTION]}}
Parsed {{[ID], [RIGHT], [ACTION]}}
附加内容
不过,我认为对命令使用 qi::symbols 可以大大简化这一切。事实上,看起来您只是在做词法标记化(您说括号并不重要,这也印证了这一点)。
line = +simple;
simple = group | command | (omit[key] >> attr(Command::id));
group = '(' > line > ')';
key = +char_("a-zA-Z_0-9");
现在你根本不需要 Phoenix:Live On Compiler Explorer,打印
ok? true {{[ID], [RIGHT], [ACTION]}, {[ID], [RIGHT], [ACTION]}}
ok? true {{[ID], [RIGHT], [ACTION]}, {[ID], [RIGHT], [ACTION]}}
ok? true {{[ID], [RIGHT], [ACTION]}}
更简单?
因为我观察到您基本上是按行标记化,为什么不简单地跳过括号,并一直简化为:
script = line % eol;
line = *(command | omit[key] >> attr(Command::id));
就这些。再次参见 Live On Compiler Explorer:
#include <boost/spirit/include/qi.hpp>
#include <fmt/ostream.h>
#include <fmt/ranges.h>
namespace qi = boost::spirit::qi;
enum class Command { id, left, right, action };
using Id = std::string;
using Line = std::vector<Command>;
using Script = std::vector<Line>;

/// Writes the upper-case name of `cmd` ("ID", "LEFT", "RIGHT" or "ACTION") to `os`.
///
/// The name is looked up with a bounds-checked std::array::at, so a value
/// outside the four enumerators throws std::out_of_range before anything is
/// written.
/// @return `os`, so that insertions can be chained.
static inline std::ostream& operator<<(std::ostream& os, Command cmd) {
    static constexpr std::array<const char*, 4> names{{"ID", "LEFT", "RIGHT", "ACTION"}};
    const auto index = static_cast<int>(cmd);
    os << names.at(index);
    return os;
}
// Simplified Spirit Qi grammar that tokenizes a Script line by line.
// Parentheses are not parsed as structure here: they are part of the skipper's
// character set and are therefore skipped just like blanks.
template <typename It>
struct ExprGrammar : qi::grammar<It, Script()> {
ExprGrammar() : ExprGrammar::base_type(start) {
using namespace qi;
// Entry point: lines separated by end-of-line, parsed under the custom skipper.
// NOTE(review): alias() presumably makes the directive refer to the member
// rule rather than copy it - confirm against the Spirit docs.
start = skip(skipper.alias())[line % eol];
// Zero or more tokens per line (so an empty line is accepted). A token is
// either a command symbol, or a key whose text is discarded (omit[]) and
// which contributes Command::id via attr(). `>>` binds tighter than `|`.
line = *(command | omit[key] >> attr(Command::id));
// Identifier: one or more ASCII letters, digits or underscores.
key = +char_("a-zA-Z_0-9");
// Registers the listed rules for Spirit's debug tracing.
// NOTE(review): presumably only active when BOOST_SPIRIT_DEBUG is defined - confirm.
BOOST_SPIRIT_DEBUG_NODES((line)(key));
}
private:
// The skipper is itself a rule with no attribute and no skipper.
using Skipper = qi::rule<It>;
// Skipper-less entry rule (it applies the skipper itself).
qi::rule<It, Script()> start;
qi::rule<It, Line(), Skipper> line;
// Skipped characters: space, tab, backspace, form feed and both parentheses.
Skipper skipper = qi::char_(" \t\b\f()");
qi::rule<It /*, Id()*/> key; // omit attribute for efficiency
// Symbol table mapping each command token to its Command value.
struct cmdsym : qi::symbols<char, Command> {
cmdsym() { this->add("<", Command::left)
(">", Command::right)
(":action", Command::action);
}
} command;
};
int main() {
using It = std::string::const_iterator;
ExprGrammar<It> const p;
for (const std::string s : {
"a1 > :action\na1 (>) :action",
"(a1 > :action)\n(a1 (>) :action)",
"a1 (> :action)",
})
try {
It f(begin(s)), l(end(s));
Script parsed;
bool ok = qi::parse(f, l, p, parsed);
fmt::print("ok? {} {}\n", ok, parsed);
if (f != l)
fmt::print(" -- Remaining '{}'\n", std::string(f, l));
} catch (qi::expectation_failure<It> const& ef) {
fmt::print("{}\n", ef.what()); // TODO add more details :)
}
}
输出
ok? true {{ID, RIGHT, ACTION}, {ID, RIGHT, ACTION}}
ok? true {{ID, RIGHT, ACTION}, {ID, RIGHT, ACTION}}
ok? true {{ID, RIGHT, ACTION}}
Note I very subtly changed +() to *() so it would accept empty lines as well. This may or may not be what you want