Boost Spirit Qi语法添加到船长内部列表
Boost Spirit Qi grammar adding to list inside skipper
正在解析这些字符串:
int main(){
for (const std::string input: std::vector<std::string> {
"module simple_in_n_out();endmodule;",
"module simple_in_n_out(in_1);endmodule;",
"module simple_in_n_out(in_1,in_2,in_3);endmodule;",
})
{
parse_verilog_file(input);
}
return 0;
}
在前两个输入和第一个字符串 push_back 上成功,但在向向量添加更多字符串时失败:
std::string module_name;
stringvec module_inputs;
module_input_list %= tok.identifier[push_back(phoenix::ref(module_inputs), _1)] % qi::lit(',');
module_input_list.name("module_input_list");
BOOST_SPIRIT_DEBUG_NODE(module_input_list);
module_stmt
%= tok.module_ >> tok.identifier[phoenix::ref(module_name) = _1]
>> '(' >> -(module_input_list) >> ')'
>> ';';
module_stmt.name("module");
BOOST_SPIRIT_DEBUG_NODE(module_stmt);
输出如下:
<module_stmt>
<try>[module]</try>
<module_input_list>
<try>[)][;][endmodule][;]</try>
<fail/>
</module_input_list>
<success>[endmodule][;]</success>
<attributes>[]</attributes>
</module_stmt>
<module_stmt>
<try>[endmodule][;]</try>
<fail/>
</module_stmt>
TODO: put the module together now
<module_stmt>
<try></try>
<fail/>
</module_stmt>
-------------------------
Parsing succeeded
-------------------------
module name: simple_in_n_out
<module_stmt>
<try>[module]</try>
<module_input_list>
<try>[in_1][)][;][endmodule][;]</try>
<success>[)][;][endmodule][;]</success>
<attributes>[]</attributes>
</module_input_list>
<success>[endmodule][;]</success>
<attributes>[]</attributes>
</module_stmt>
<module_stmt>
<try>[endmodule][;]</try>
<fail/>
</module_stmt>
TODO: put the module together now
<module_stmt>
<try></try>
<fail/>
</module_stmt>
-------------------------
Parsing succeeded
-------------------------
module name: simple_in_n_out
module input: in_1
<module_stmt>
<try>[module]</try>
<module_input_list>
<try>[in_1]</try>
<success></success>
<attributes>[]</attributes>
</module_input_list>
<fail/>
</module_stmt>
-------------------------
Parsing failed
-------------------------
完整代码:
#define BOOST_SPIRIT_DEBUG
#include "netlist/netlistlexer.h"
namespace verilog {
using namespace boost::spirit;
using boost::phoenix::val;
using boost::spirit::ascii::char_;
using boost::spirit::ascii::string;
///////////////////////////////////////////////////////////////////////////////
// Grammar definition
///////////////////////////////////////////////////////////////////////////////
template <typename Iterator, typename Lexer>
struct verilog_grammar
: qi::grammar<Iterator, qi::in_state_skipper<Lexer> >
{
template <typename TokenDef>
verilog_grammar(TokenDef const& tok)
: verilog_grammar::base_type(program)
{
using boost::spirit::_val;
using phoenix::push_back;
using qi::on_error;
using qi::fail;
using phoenix::construct;
program
= +statement
;
statement
= module_stmt
| end_module_stmt
;
module_input_list %= tok.identifier[push_back(phoenix::ref(module_inputs), _1)] % qi::lit(',');
module_input_list.name("module_input_list");
BOOST_SPIRIT_DEBUG_NODE(module_input_list);
module_stmt
%= tok.module_ >> tok.identifier[phoenix::ref(module_name) = _1]
>> '(' >> -(module_input_list) >> ')'
>> ';';
module_stmt.name("module");
BOOST_SPIRIT_DEBUG_NODE(module_stmt);
end_module_stmt
= (tok.endmodule_ >> ';' | tok.endmodule_)[
std::cout << val("TODO: put the module together now") << "\n"
];
end_module_stmt.name("end_module_stmt");
on_error<fail>
(
program
, std::cout
<< val("Error! Expecting ")
<< _4 // what failed?
<< val(" here: \"")
<< construct<std::string>(_3, _2) // iterators to error-pos, end
<< val("\"")
<< std::endl
);
}
std::string module_name;
stringvec module_inputs;
typedef boost::variant<unsigned int, std::string> expression_type;
typedef boost::fusion::vector<std::string,std::vector<std::string>> fustring;
qi::rule<Iterator, qi::in_state_skipper<Lexer> > program, statement;
qi::rule<Iterator, qi::in_state_skipper<Lexer> > module_stmt;
qi::rule<Iterator, qi::in_state_skipper<Lexer> > module_input_list;
qi::rule<Iterator, qi::in_state_skipper<Lexer> > end_module_stmt;
};
} // end verilog namespace
void parse_verilog_file(std::string str){
typedef std::string::iterator base_iterator_type;
using namespace boost::spirit;
typedef lex::lexertl::token<
base_iterator_type, boost::mpl::vector<unsigned int, std::string>
> token_type;
typedef lex::lexertl::lexer<token_type> lexer_type;
typedef verilog::verilog_tokens<lexer_type> verilog_tokens;
typedef verilog_tokens::iterator_type iterator_type;
typedef verilog::verilog_grammar<iterator_type, verilog_tokens::lexer_def> verilog_grammar;
verilog_tokens tokens; // Our lexer
verilog_grammar calc(tokens); // Our parser
std::string::iterator it = str.begin();
iterator_type iter = tokens.begin(it, str.end());
iterator_type end = tokens.end();
bool r = qi::phrase_parse(iter, end, calc, qi::in_state("WS")[tokens.self]);
if (r && iter == end)
{
std::cout << "-------------------------\n";
std::cout << "Parsing succeeded\n";
std::cout << "-------------------------\n";
std::cout << "module name: " << calc.module_name << "\n";
for (const std::string i: calc.module_inputs){
std::cout << " module input: " << i << "\n";
}
}
else
{
std::cout << "-------------------------\n";
std::cout << "Parsing failed\n";
std::cout << "-------------------------\n";
}
}
int main(){
for (const std::string input: std::vector<std::string> {
"module simple_in_n_out();endmodule;",
"module simple_in_n_out(in_1);endmodule;",
"module simple_in_n_out(in_1,in_2,in_3);endmodule;",
})
{
parse_verilog_file(input);
}
return 0;
}
netlist/netlistlexer.h:
#ifndef NETLISTLEXER_H
#define NETLISTLEXER_H
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_core.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_fusion.hpp>
#include <boost/spirit/include/phoenix_stl.hpp>
#include <boost/spirit/include/phoenix_object.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/variant/recursive_variant.hpp>
#include <boost/foreach.hpp>
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
namespace fusion = boost::fusion;
namespace phoenix = boost::phoenix;
namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
typedef std::vector<std::string> stringvec;
namespace verilog {
using namespace boost::spirit;
using boost::phoenix::val;
using boost::spirit::ascii::char_;
using boost::spirit::ascii::string;
///////////////////////////////////////////////////////////////////////////////
// Token definition
///////////////////////////////////////////////////////////////////////////////
template <typename Lexer>
struct verilog_tokens : lex::lexer<Lexer>
{
verilog_tokens()
{
// define the tokens to match
identifier = "[a-zA-Z_][a-zA-Z0-9_]*";
logic_op = "[\&\|]";
constant = "[0-9]+";
module_ = "module";
assign_ = "assign";
endmodule_ = "endmodule";
wire_ = "wire";
input_ = "input";
output_ = "output";
inout_ = "inout";
reg_ = "reg";
begin_ = "begin";
end_ = "end";
always_ = "always";
if_ = "if";
else_ = "else";
parameter_ = "parameter";
// associate the tokens and the token set with the lexer
this->self = lex::token_def<>('(') | ')' | '{' | '}' | '=' | '[' | ']' | ';' | constant | logic_op;
this->self += if_ | else_ | begin_ | end_ | always_ | reg_;
this->self += module_ | endmodule_ | assign_ | wire_ | input_ | output_ | inout_;
this->self += parameter_;
this->self += identifier;
// define the whitespace to ignore (spaces, tabs, newlines and C-style
// comments)
this->self("WS")
= lex::token_def<>("[ \t\n]+")
| "\/\*[^*]*\*+([^/*][^*]*\*+)*\/"
| "\/\/[^\r\n\f]*"
| "\(\*[^*]*\*\)"
;
}
// these tokens have no attribute
lex::token_def<lex::omit> if_, else_, begin_, end_, endmodule_;
// these tokens expose the iterator_range of the matched input sequence
lex::token_def<> always_, reg_;
lex::token_def<> module_, assign_, wire_, input_, output_, inout_;
lex::token_def<> parameter_;
// The following two tokens have an associated attribute type, 'identifier'
// carries a string (the identifier name) and 'constant' carries the
// matched integer value.
//
// Note: any token attribute type explicitly specified in a token_def<>
// declaration needs to be listed during token type definition as
// well (see the typedef for the token_type below).
//
// The conversion of the matched input to an instance of this type occurs
// once (on first access), which makes token attributes as efficient as
// possible. Moreover, token instances are constructed once by the lexer
// library. From this point on tokens are passed by reference only,
// avoiding them being copied around.
lex::token_def<std::string> identifier;
lex::token_def<unsigned int> constant;
lex::token_def<std::string> logic_op;
};
} // end verilog namespace
#endif // NETLISTLEXER_H
好吧,我不得不拨开 Spirit Lex¹ 的迷雾,以及一些表明您可能没有使用符合标准的编译器² 的怪癖。
When I did,我注意到实际的语法不使用属性传播,而是使用特殊的语义操作来提取一些信息³。
我已经公开表示,当您找到最佳点 时,我认为 Spirit 可以快速制作原型。基于语义动作的手动 AST 构建不是 IMO 所在的位置。
作为最后一条微妙的线索,我注意到您 "uselessly" 包括 recursive_variant.hpp
- 这让我觉得您实际上 希望 使用自动属性传播递归 AST?
第一个想法
我们以module_stmt
为例。代替 "abitrarily side-effecting" 进入 module_name
和 module_inputs
解析器成员变量,让我们使用 AST 类型:
namespace AST {
using identifiers = stringvec;
struct module {
std::string name;
identifiers inputs;
};
}
使其适应自动传播:
BOOST_FUSION_ADAPT_STRUCT(AST::module, name, inputs)
并依赖于它:
module_input_list = tok.identifier % ',';
module_stmt
= tok.module_ >> tok.identifier
>> '(' >> -module_input_list >> ')' >> ';'
>> tok.endmodule_ >> (';' | qi::eoi)
;
Note: I had to fix the module_
token definition to lex::omit
Note how I included endmodule_
into the rule because that's the natural thing to do. Any nested (recursive) rules (like nested statements
can just naturally go there and either synthesize into members of AST::module
规则声明可以是:
qi::rule<Iterator, AST::module(), Skipper> module_stmt;
qi::rule<Iterator, AST::identifiers(), Skipper> module_input_list;
将其捆绑在一起
当然,现在顶级规则没有属性声明,所以神奇合成的AST::module
实例就消失了。这很不幸,但很容易修复。扩展我们的 AST 类型:
namespace AST {
using identifiers = stringvec;
struct module {
std::string name;
identifiers inputs;
};
using statement = boost::make_recursive_variant<
module // module_stmt
>::type;
using statements = std::vector<statement>;
struct program {
statements body;
};
}
这个相当简单的 Verilog 程序就可以了。我们扩展规则:
qi::rule<Iterator, AST::program(), Skipper> program;
qi::rule<Iterator, AST::statements(), Skipper> statements;
qi::rule<Iterator, AST::statement(), Skipper> statement;
qi::rule<Iterator, AST::module(), Skipper> module_stmt;
qi::rule<Iterator, AST::identifiers(), Skipper> module_input_list;
您会注意到将规则与其对应的 AST 节点匹配的模式。规则本身不会改变:
program = statements;
statements = +statement;
statement = module_stmt;
module_input_list = tok.identifier % ',';
module_stmt
= tok.module_ >> tok.identifier
>> '(' >> -module_input_list >> ')' >> ';'
>> tok.endmodule_ >> (';' | qi::eoi)
;
Note: I introduced the statements
for consistency, and it also sidesteps a pitfall with propagating into single-element adapted fusion sequences⁴
现在我们可以将 AST::program
属性传递给解析器调用:
AST::program program;
if (qi::parse(iter, end, calc, program)) {
for (auto& stmt : program.body) {
if (auto* module = boost::get<AST::module>(&stmt)) {
std::cout << "module name: " << module->name << "\n";
for (std::string const& i : module->inputs) {
std::cout << " module input: " << i << "\n";
}
}
}
}
这将打印与预期相同的输出:
-------------------------
module simple_in_n_out();endmodule;
Parsing succeeded
module name: simple_in_n_out
-------------------------
-------------------------
module simple_in_n_out(in_1);endmodule;
Parsing succeeded
module name: simple_in_n_out
module input: in_1
-------------------------
-------------------------
module simple_in_n_out(in_1,in_2,in_3);endmodule;
Parsing failed
-------------------------
调试失败
取消注释 #define BOOST_SPIRIT_DEBUG
显示问题所在:
<program>
<try>[module]</try>
<statements>
<try>[module]</try>
<statement>
<try>[module]</try>
<module_stmt>
<try>[module]</try>
<module_input_list>
<try>[in_1]</try>
<success></success>
<attributes>[[[i, n, _, 1]]]</attributes>
</module_input_list>
<fail/>
</module_stmt>
<fail/>
</statement>
<fail/>
</statements>
<fail/>
</program>
问题不在于任何规则!它与 ','
不匹配。快速浏览令牌告诉我们原因:没有与逗号匹配的令牌... 匆忙添加它:
-------------------------
module simple_in_n_out();endmodule;
Parsing succeeded
module name: simple_in_n_out
-------------------------
-------------------------
module simple_in_n_out(in_1);endmodule;
Parsing succeeded
module name: simple_in_n_out
module input: in_1
-------------------------
-------------------------
module simple_in_n_out(in_1,in_2,in_3);endmodule;
Parsing succeeded
module name: simple_in_n_out
module input: in_1
module input: in_2
module input: in_3
-------------------------
奖金
然而,"problem" 有点突出了词法分析器的另一个成本因素(注意我之前提到的 module_
令牌的另一个问题)。所以这是没有 Lex 开销、没有 Phoenix 开销的全部内容,在一小部分代码中,具有完整的 AST 传播:
// #define BOOST_SPIRIT_DEBUG
#include <boost/fusion/adapted.hpp>
#include <boost/spirit/include/qi.hpp>
#include <iomanip> // std::quoted
namespace qi = boost::spirit::qi;
namespace AST {
using identifier = std::string;
using identifiers = std::vector<identifier>;
struct module {
identifier name;
identifiers inputs;
};
using statement = boost::variant<module>;
using statements = std::vector<statement>;
struct program {
statements body;
};
}
BOOST_FUSION_ADAPT_STRUCT(AST::module, name, inputs)
BOOST_FUSION_ADAPT_STRUCT(AST::program, body)
namespace verilog {
template <typename Iterator> struct verilog_grammar : qi::grammar<Iterator, AST::program()> {
verilog_grammar() : verilog_grammar::base_type(start) {
auto kw = [](auto p) { return qi::copy(qi::lexeme[qi::no_case[p] >> !(qi::alnum|'_') ]); };
start = qi::skip(skipper.alias()) [ program ];
program = statements > qi::eoi;
statements = -statement % ';';
statement = module_stmt.alias();
module_input_list = identifier % ',';
module_stmt
= kw("module") >> identifier
>> '(' >> -module_input_list >> ')' >> ';'
>> kw("endmodule")
;
// lexemes
identifier = qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z0-9_");
skipper = qi::char_(" \t\r\n") // added \r for consistency
| "//" >> *~qi::char_("\r\n\f")
| "/*" >> *(qi::char_ - "*/") >> "*/"
| "(*" >> *(qi::char_ - "*)") >> "*)"
;
BOOST_SPIRIT_DEBUG_NODES((program)(statements)(statement)(module_stmt)(module_input_list)(identifier));
}
private:
using Skipper = qi::rule<Iterator>;
qi::rule<Iterator, AST::program()> start;
Skipper skipper;
qi::rule<Iterator, AST::program(), Skipper> program;
qi::rule<Iterator, AST::statements(), Skipper> statements;
qi::rule<Iterator, AST::statement(), Skipper> statement;
qi::rule<Iterator, AST::module(), Skipper> module_stmt;
qi::rule<Iterator, AST::identifiers(), Skipper> module_input_list;
// lexemes (formerly "tokens")
qi::rule<Iterator, AST::identifier()> identifier;
};
} // end verilog namespace
AST::program parse_verilog_file(std::string const& str) {
typedef std::string::const_iterator iterator;
static const verilog::verilog_grammar<iterator> grammar; // Our parser, now stateless
try {
AST::program program;
parse(str.begin(), str.end(), grammar, program);
return program;
} catch(qi::expectation_failure<iterator> const& ef) {
std::ostringstream msg;
msg << "Parsing failed: expected " << ef.what_ << " at " << std::quoted(std::string(ef.first, ef.last));
throw std::runtime_error(msg.str());
}
}
int main() {
for (const std::string input : std::vector<std::string>{
"module simple_in_n_out();endmodule;",
"module simple_in_n_out(in_1);endmodule;",
"module simple_in_n_out(in_1,in_2,in_3);endmodule;",
"module a();endmodule",
"module a();endmodule;oops",
})
try {
std::cout << "-------------------------\n";
std::cout << std::quoted(input) << "\n";
for (auto const& stmt : parse_verilog_file(input).body) {
if (auto* module = boost::get<AST::module>(&stmt)) {
std::cout << "module name: " << module->name << "\n";
for (std::string const& i : module->inputs) {
std::cout << " module input: " << i << "\n";
}
}
}
} catch(std::exception const& e) {
std::cout << e.what() << '\n';
}
}
正在打印
-------------------------
"module simple_in_n_out();endmodule;"
module name: simple_in_n_out
-------------------------
"module simple_in_n_out(in_1);endmodule;"
module name: simple_in_n_out
module input: in_1
-------------------------
"module simple_in_n_out(in_1,in_2,in_3);endmodule;"
module name: simple_in_n_out
module input: in_1
module input: in_2
module input: in_3
-------------------------
"module a();endmodule"
module name: a
-------------------------
"module a();endmodule;oops"
Parsing failed: expected <eoi> at "oops"
显着改进:
- "keyword boundaries" 的正确解析(这就是
kw()
助手的作用)。这意味着如果您有一个 以 开头的标识符,它可能是一个关键字,它不会被错误地标记为该关键字(原始的基于 Lex 的方法会发生这种情况)
- 关键字通常不区分大小写 (
qi::no_case[]
) - 仅用于演示
- 船长更容易指定,同时可读
我已经在这个答案的基于 Lex 的版本中做了一些事情:船长现在被封装在语法中。我认为,如果用户可能确实需要更改船长,则船长仅应由用户提供。在 99% 的情况下,船长与解析器紧密耦合,使用错误的船长无论如何都会破坏语法。
作为奖励,调用变得更加清晰:
AST::program program;
parse(str.begin(), str.end(), grammar, program);
return program;
- 注意到我是如何将
parse_verilog_file
函数简化为...成为一个函数(返回结果),将生成和处理结果分开
AST::program parse_verilog_file(std::string const& str) {
typedef std::string::const_iterator iterator;
static const verilog::verilog_grammar<iterator> grammar; // Our parser, now stateless
try {
AST::program program;
parse(str.begin(), str.end(), grammar, program);
return program;
} catch(qi::expectation_failure<iterator> const& ef) {
std::ostringstream msg;
msg << "Parsing failed: expected " << ef.what_ << " at " << std::quoted(std::string(ef.first, ef.last));
throw std::runtime_error(msg.str());
}
}
这反过来又通过捕获异常显示了简化的错误处理
反过来我用另一个期望点来替换 iter!=end
检查:
program = statements > qi::eoi;
这个,结合
statements = -statement % ';';
使 ';`` is required between statements, but not at the end of the program (which I _guess_ is what you wanted to convey with the old
endmodule` 规则)
Notice as well that -statement % ';'
makes it so that empty statements are acceptable. If that's not what you wanted, drop the '-
Note that the added test cases test and demonstrate the error detection/reporting for this logic ("module a();endmodule;oops"
results in Parsing failed: expected <eoi> at "oops"
)
任何 "tokens" 像 "identifier" 现在都是 "lexeme" 规则,因为它们不服从船长⁵期望:Live On Wandbox
<module_input_list>
<try>in_1,in_2,in_3);endm</try>
<identifier>
<try>in_1,in_2,in_3);endm</try>
<success>,in_2,in_3);endmodul</success>
<attributes>[[i, n, _, 1]]</attributes>
</identifier>
<identifier>
<try>in_2,in_3);endmodule</try>
<success>,in_3);endmodule;oop</success>
<attributes>[[i, n, _, 2]]</attributes>
</identifier>
<identifier>
<try>in_3);endmodule;oops</try>
<success>);endmodule;oops</success>
<attributes>[[i, n, _, 3]]</attributes>
</identifier>
<success>);endmodule;oops</success>
<attributes>[[[i, n, _, 1], [i, n, _, 2], [i, n, _, 3]]]</attributes>
</module_input_list>
哦,代码明显更短,同时做得更多:代码从 211 行减少到 112 行 (-47%)
- 它的编译速度明显加快(在我的系统上从 19.7 秒下降到 12.1 秒)
- 哦,鉴于目前的特征,可以进一步简化:This clocks in at 90 LoC。但是,我反而鼓励改进语法的功能,例如这里
- Boost.Qi rule with skipper does not match '.' character 还显示了一个 Verilog 解析器,并且有更多关于
kw()
工具的信息
- cannot get boost::spirit parser&lexer working for token types other than std::string or int or double 以类似的 Verilog 语法解析模块、学科、性质,并使用 Lex
¹ 轶事:"nobody uses that anymore"。我不这么说是因为我不知道 (see), and I'm not alone. From this 2017 answer:
using Lex makes most of the sweet-spot disappear since all "highlevel" parsers (like real_parser, [u]int_parser) are out the window. The Spirit devs are on record they prefer not to use Lex. Moreover, Spirit X3 doesn't have Lex support anymore.
² 我猜 MSVC,不是最新的?罪魁祸首是名字不明确,因为你使用 using namespace
.
³ Boost Spirit: "Semantic actions are evil"?
⁴ 在 SO 上看到很多答案:https://whosebug.com/search?q=user%3A85371+spirit+single-element
⁵ 看看 Boost spirit skipper issues 我对船长、规则声明和词素如何相互作用的描述
正在解析这些字符串:
int main(){
for (const std::string input: std::vector<std::string> {
"module simple_in_n_out();endmodule;",
"module simple_in_n_out(in_1);endmodule;",
"module simple_in_n_out(in_1,in_2,in_3);endmodule;",
})
{
parse_verilog_file(input);
}
return 0;
}
在前两个输入和第一个字符串 push_back 上成功,但在向向量添加更多字符串时失败:
std::string module_name;
stringvec module_inputs;
module_input_list %= tok.identifier[push_back(phoenix::ref(module_inputs), _1)] % qi::lit(',');
module_input_list.name("module_input_list");
BOOST_SPIRIT_DEBUG_NODE(module_input_list);
module_stmt
%= tok.module_ >> tok.identifier[phoenix::ref(module_name) = _1]
>> '(' >> -(module_input_list) >> ')'
>> ';';
module_stmt.name("module");
BOOST_SPIRIT_DEBUG_NODE(module_stmt);
输出如下:
<module_stmt>
<try>[module]</try>
<module_input_list>
<try>[)][;][endmodule][;]</try>
<fail/>
</module_input_list>
<success>[endmodule][;]</success>
<attributes>[]</attributes>
</module_stmt>
<module_stmt>
<try>[endmodule][;]</try>
<fail/>
</module_stmt>
TODO: put the module together now
<module_stmt>
<try></try>
<fail/>
</module_stmt>
-------------------------
Parsing succeeded
-------------------------
module name: simple_in_n_out
<module_stmt>
<try>[module]</try>
<module_input_list>
<try>[in_1][)][;][endmodule][;]</try>
<success>[)][;][endmodule][;]</success>
<attributes>[]</attributes>
</module_input_list>
<success>[endmodule][;]</success>
<attributes>[]</attributes>
</module_stmt>
<module_stmt>
<try>[endmodule][;]</try>
<fail/>
</module_stmt>
TODO: put the module together now
<module_stmt>
<try></try>
<fail/>
</module_stmt>
-------------------------
Parsing succeeded
-------------------------
module name: simple_in_n_out
module input: in_1
<module_stmt>
<try>[module]</try>
<module_input_list>
<try>[in_1]</try>
<success></success>
<attributes>[]</attributes>
</module_input_list>
<fail/>
</module_stmt>
-------------------------
Parsing failed
-------------------------
完整代码:
#define BOOST_SPIRIT_DEBUG
#include "netlist/netlistlexer.h"
namespace verilog {
using namespace boost::spirit;
using boost::phoenix::val;
using boost::spirit::ascii::char_;
using boost::spirit::ascii::string;
///////////////////////////////////////////////////////////////////////////////
// Grammar definition
///////////////////////////////////////////////////////////////////////////////
template <typename Iterator, typename Lexer>
struct verilog_grammar
: qi::grammar<Iterator, qi::in_state_skipper<Lexer> >
{
template <typename TokenDef>
verilog_grammar(TokenDef const& tok)
: verilog_grammar::base_type(program)
{
using boost::spirit::_val;
using phoenix::push_back;
using qi::on_error;
using qi::fail;
using phoenix::construct;
program
= +statement
;
statement
= module_stmt
| end_module_stmt
;
module_input_list %= tok.identifier[push_back(phoenix::ref(module_inputs), _1)] % qi::lit(',');
module_input_list.name("module_input_list");
BOOST_SPIRIT_DEBUG_NODE(module_input_list);
module_stmt
%= tok.module_ >> tok.identifier[phoenix::ref(module_name) = _1]
>> '(' >> -(module_input_list) >> ')'
>> ';';
module_stmt.name("module");
BOOST_SPIRIT_DEBUG_NODE(module_stmt);
end_module_stmt
= (tok.endmodule_ >> ';' | tok.endmodule_)[
std::cout << val("TODO: put the module together now") << "\n"
];
end_module_stmt.name("end_module_stmt");
on_error<fail>
(
program
, std::cout
<< val("Error! Expecting ")
<< _4 // what failed?
<< val(" here: \"")
<< construct<std::string>(_3, _2) // iterators to error-pos, end
<< val("\"")
<< std::endl
);
}
std::string module_name;
stringvec module_inputs;
typedef boost::variant<unsigned int, std::string> expression_type;
typedef boost::fusion::vector<std::string,std::vector<std::string>> fustring;
qi::rule<Iterator, qi::in_state_skipper<Lexer> > program, statement;
qi::rule<Iterator, qi::in_state_skipper<Lexer> > module_stmt;
qi::rule<Iterator, qi::in_state_skipper<Lexer> > module_input_list;
qi::rule<Iterator, qi::in_state_skipper<Lexer> > end_module_stmt;
};
} // end verilog namespace
void parse_verilog_file(std::string str){
typedef std::string::iterator base_iterator_type;
using namespace boost::spirit;
typedef lex::lexertl::token<
base_iterator_type, boost::mpl::vector<unsigned int, std::string>
> token_type;
typedef lex::lexertl::lexer<token_type> lexer_type;
typedef verilog::verilog_tokens<lexer_type> verilog_tokens;
typedef verilog_tokens::iterator_type iterator_type;
typedef verilog::verilog_grammar<iterator_type, verilog_tokens::lexer_def> verilog_grammar;
verilog_tokens tokens; // Our lexer
verilog_grammar calc(tokens); // Our parser
std::string::iterator it = str.begin();
iterator_type iter = tokens.begin(it, str.end());
iterator_type end = tokens.end();
bool r = qi::phrase_parse(iter, end, calc, qi::in_state("WS")[tokens.self]);
if (r && iter == end)
{
std::cout << "-------------------------\n";
std::cout << "Parsing succeeded\n";
std::cout << "-------------------------\n";
std::cout << "module name: " << calc.module_name << "\n";
for (const std::string i: calc.module_inputs){
std::cout << " module input: " << i << "\n";
}
}
else
{
std::cout << "-------------------------\n";
std::cout << "Parsing failed\n";
std::cout << "-------------------------\n";
}
}
int main(){
for (const std::string input: std::vector<std::string> {
"module simple_in_n_out();endmodule;",
"module simple_in_n_out(in_1);endmodule;",
"module simple_in_n_out(in_1,in_2,in_3);endmodule;",
})
{
parse_verilog_file(input);
}
return 0;
}
netlist/netlistlexer.h:
#ifndef NETLISTLEXER_H
#define NETLISTLEXER_H
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_core.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_fusion.hpp>
#include <boost/spirit/include/phoenix_stl.hpp>
#include <boost/spirit/include/phoenix_object.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/variant/recursive_variant.hpp>
#include <boost/foreach.hpp>
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
namespace fusion = boost::fusion;
namespace phoenix = boost::phoenix;
namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
typedef std::vector<std::string> stringvec;
namespace verilog {
using namespace boost::spirit;
using boost::phoenix::val;
using boost::spirit::ascii::char_;
using boost::spirit::ascii::string;
///////////////////////////////////////////////////////////////////////////////
// Token definition
///////////////////////////////////////////////////////////////////////////////
template <typename Lexer>
struct verilog_tokens : lex::lexer<Lexer>
{
verilog_tokens()
{
// define the tokens to match
identifier = "[a-zA-Z_][a-zA-Z0-9_]*";
logic_op = "[\&\|]";
constant = "[0-9]+";
module_ = "module";
assign_ = "assign";
endmodule_ = "endmodule";
wire_ = "wire";
input_ = "input";
output_ = "output";
inout_ = "inout";
reg_ = "reg";
begin_ = "begin";
end_ = "end";
always_ = "always";
if_ = "if";
else_ = "else";
parameter_ = "parameter";
// associate the tokens and the token set with the lexer
this->self = lex::token_def<>('(') | ')' | '{' | '}' | '=' | '[' | ']' | ';' | constant | logic_op;
this->self += if_ | else_ | begin_ | end_ | always_ | reg_;
this->self += module_ | endmodule_ | assign_ | wire_ | input_ | output_ | inout_;
this->self += parameter_;
this->self += identifier;
// define the whitespace to ignore (spaces, tabs, newlines and C-style
// comments)
this->self("WS")
= lex::token_def<>("[ \t\n]+")
| "\/\*[^*]*\*+([^/*][^*]*\*+)*\/"
| "\/\/[^\r\n\f]*"
| "\(\*[^*]*\*\)"
;
}
// these tokens have no attribute
lex::token_def<lex::omit> if_, else_, begin_, end_, endmodule_;
// these tokens expose the iterator_range of the matched input sequence
lex::token_def<> always_, reg_;
lex::token_def<> module_, assign_, wire_, input_, output_, inout_;
lex::token_def<> parameter_;
// The following two tokens have an associated attribute type, 'identifier'
// carries a string (the identifier name) and 'constant' carries the
// matched integer value.
//
// Note: any token attribute type explicitly specified in a token_def<>
// declaration needs to be listed during token type definition as
// well (see the typedef for the token_type below).
//
// The conversion of the matched input to an instance of this type occurs
// once (on first access), which makes token attributes as efficient as
// possible. Moreover, token instances are constructed once by the lexer
// library. From this point on tokens are passed by reference only,
// avoiding them being copied around.
lex::token_def<std::string> identifier;
lex::token_def<unsigned int> constant;
lex::token_def<std::string> logic_op;
};
} // end verilog namespace
#endif // NETLISTLEXER_H
好吧,我不得不拨开 Spirit Lex¹ 的迷雾,以及一些表明您可能没有使用符合标准的编译器² 的怪癖。
When I did,我注意到实际的语法不使用属性传播,而是使用特殊的语义操作来提取一些信息³。
我已经公开表示,当您找到最佳点 时,我认为 Spirit 可以快速制作原型。基于语义动作的手动 AST 构建不是 IMO 所在的位置。
作为最后一条微妙的线索,我注意到您 "uselessly" 包括 recursive_variant.hpp
- 这让我觉得您实际上 希望 使用自动属性传播递归 AST?
第一个想法
我们以module_stmt
为例。代替 "abitrarily side-effecting" 进入 module_name
和 module_inputs
解析器成员变量,让我们使用 AST 类型:
namespace AST {
using identifiers = stringvec;
struct module {
std::string name;
identifiers inputs;
};
}
使其适应自动传播:
BOOST_FUSION_ADAPT_STRUCT(AST::module, name, inputs)
并依赖于它:
module_input_list = tok.identifier % ',';
module_stmt
= tok.module_ >> tok.identifier
>> '(' >> -module_input_list >> ')' >> ';'
>> tok.endmodule_ >> (';' | qi::eoi)
;
Note: I had to fix the
module_
token definition tolex::omit
Note how I included
endmodule_
into the rule because that's the natural thing to do. Any nested (recursive) rules (like nestedstatements
can just naturally go there and either synthesize into members ofAST::module
规则声明可以是:
qi::rule<Iterator, AST::module(), Skipper> module_stmt;
qi::rule<Iterator, AST::identifiers(), Skipper> module_input_list;
将其捆绑在一起
当然,现在顶级规则没有属性声明,所以神奇合成的AST::module
实例就消失了。这很不幸,但很容易修复。扩展我们的 AST 类型:
namespace AST {
using identifiers = stringvec;
struct module {
std::string name;
identifiers inputs;
};
using statement = boost::make_recursive_variant<
module // module_stmt
>::type;
using statements = std::vector<statement>;
struct program {
statements body;
};
}
这个相当简单的 Verilog 程序就可以了。我们扩展规则:
qi::rule<Iterator, AST::program(), Skipper> program;
qi::rule<Iterator, AST::statements(), Skipper> statements;
qi::rule<Iterator, AST::statement(), Skipper> statement;
qi::rule<Iterator, AST::module(), Skipper> module_stmt;
qi::rule<Iterator, AST::identifiers(), Skipper> module_input_list;
您会注意到将规则与其对应的 AST 节点匹配的模式。规则本身不会改变:
program = statements;
statements = +statement;
statement = module_stmt;
module_input_list = tok.identifier % ',';
module_stmt
= tok.module_ >> tok.identifier
>> '(' >> -module_input_list >> ')' >> ';'
>> tok.endmodule_ >> (';' | qi::eoi)
;
Note: I introduced the
statements
for consistency, and it also sidesteps a pitfall with propagating into single-element adapted fusion sequences⁴
现在我们可以将 AST::program
属性传递给解析器调用:
AST::program program;
if (qi::parse(iter, end, calc, program)) {
for (auto& stmt : program.body) {
if (auto* module = boost::get<AST::module>(&stmt)) {
std::cout << "module name: " << module->name << "\n";
for (std::string const& i : module->inputs) {
std::cout << " module input: " << i << "\n";
}
}
}
}
这将打印与预期相同的输出:
-------------------------
module simple_in_n_out();endmodule;
Parsing succeeded
module name: simple_in_n_out
-------------------------
-------------------------
module simple_in_n_out(in_1);endmodule;
Parsing succeeded
module name: simple_in_n_out
module input: in_1
-------------------------
-------------------------
module simple_in_n_out(in_1,in_2,in_3);endmodule;
Parsing failed
-------------------------
调试失败
取消注释 #define BOOST_SPIRIT_DEBUG
显示问题所在:
<program>
<try>[module]</try>
<statements>
<try>[module]</try>
<statement>
<try>[module]</try>
<module_stmt>
<try>[module]</try>
<module_input_list>
<try>[in_1]</try>
<success></success>
<attributes>[[[i, n, _, 1]]]</attributes>
</module_input_list>
<fail/>
</module_stmt>
<fail/>
</statement>
<fail/>
</statements>
<fail/>
</program>
问题不在于任何规则!它与 ','
不匹配。快速浏览令牌告诉我们原因:没有与逗号匹配的令牌... 匆忙添加它:
-------------------------
module simple_in_n_out();endmodule;
Parsing succeeded
module name: simple_in_n_out
-------------------------
-------------------------
module simple_in_n_out(in_1);endmodule;
Parsing succeeded
module name: simple_in_n_out
module input: in_1
-------------------------
-------------------------
module simple_in_n_out(in_1,in_2,in_3);endmodule;
Parsing succeeded
module name: simple_in_n_out
module input: in_1
module input: in_2
module input: in_3
-------------------------
奖金
然而,"problem" 有点突出了词法分析器的另一个成本因素(注意我之前提到的 module_
令牌的另一个问题)。所以这是没有 Lex 开销、没有 Phoenix 开销的全部内容,在一小部分代码中,具有完整的 AST 传播:
// #define BOOST_SPIRIT_DEBUG
#include <boost/fusion/adapted.hpp>
#include <boost/spirit/include/qi.hpp>
#include <iomanip> // std::quoted
namespace qi = boost::spirit::qi;
namespace AST {
using identifier = std::string;
using identifiers = std::vector<identifier>;
struct module {
identifier name;
identifiers inputs;
};
using statement = boost::variant<module>;
using statements = std::vector<statement>;
struct program {
statements body;
};
}
BOOST_FUSION_ADAPT_STRUCT(AST::module, name, inputs)
BOOST_FUSION_ADAPT_STRUCT(AST::program, body)
namespace verilog {
template <typename Iterator> struct verilog_grammar : qi::grammar<Iterator, AST::program()> {
verilog_grammar() : verilog_grammar::base_type(start) {
auto kw = [](auto p) { return qi::copy(qi::lexeme[qi::no_case[p] >> !(qi::alnum|'_') ]); };
start = qi::skip(skipper.alias()) [ program ];
program = statements > qi::eoi;
statements = -statement % ';';
statement = module_stmt.alias();
module_input_list = identifier % ',';
module_stmt
= kw("module") >> identifier
>> '(' >> -module_input_list >> ')' >> ';'
>> kw("endmodule")
;
// lexemes
identifier = qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z0-9_");
skipper = qi::char_(" \t\r\n") // added \r for consistency
| "//" >> *~qi::char_("\r\n\f")
| "/*" >> *(qi::char_ - "*/") >> "*/"
| "(*" >> *(qi::char_ - "*)") >> "*)"
;
BOOST_SPIRIT_DEBUG_NODES((program)(statements)(statement)(module_stmt)(module_input_list)(identifier));
}
private:
using Skipper = qi::rule<Iterator>;
qi::rule<Iterator, AST::program()> start;
Skipper skipper;
qi::rule<Iterator, AST::program(), Skipper> program;
qi::rule<Iterator, AST::statements(), Skipper> statements;
qi::rule<Iterator, AST::statement(), Skipper> statement;
qi::rule<Iterator, AST::module(), Skipper> module_stmt;
qi::rule<Iterator, AST::identifiers(), Skipper> module_input_list;
// lexemes (formerly "tokens")
qi::rule<Iterator, AST::identifier()> identifier;
};
} // end verilog namespace
AST::program parse_verilog_file(std::string const& str) {
typedef std::string::const_iterator iterator;
static const verilog::verilog_grammar<iterator> grammar; // Our parser, now stateless
try {
AST::program program;
parse(str.begin(), str.end(), grammar, program);
return program;
} catch(qi::expectation_failure<iterator> const& ef) {
std::ostringstream msg;
msg << "Parsing failed: expected " << ef.what_ << " at " << std::quoted(std::string(ef.first, ef.last));
throw std::runtime_error(msg.str());
}
}
int main() {
for (const std::string input : std::vector<std::string>{
"module simple_in_n_out();endmodule;",
"module simple_in_n_out(in_1);endmodule;",
"module simple_in_n_out(in_1,in_2,in_3);endmodule;",
"module a();endmodule",
"module a();endmodule;oops",
})
try {
std::cout << "-------------------------\n";
std::cout << std::quoted(input) << "\n";
for (auto const& stmt : parse_verilog_file(input).body) {
if (auto* module = boost::get<AST::module>(&stmt)) {
std::cout << "module name: " << module->name << "\n";
for (std::string const& i : module->inputs) {
std::cout << " module input: " << i << "\n";
}
}
}
} catch(std::exception const& e) {
std::cout << e.what() << '\n';
}
}
正在打印
-------------------------
"module simple_in_n_out();endmodule;"
module name: simple_in_n_out
-------------------------
"module simple_in_n_out(in_1);endmodule;"
module name: simple_in_n_out
module input: in_1
-------------------------
"module simple_in_n_out(in_1,in_2,in_3);endmodule;"
module name: simple_in_n_out
module input: in_1
module input: in_2
module input: in_3
-------------------------
"module a();endmodule"
module name: a
-------------------------
"module a();endmodule;oops"
Parsing failed: expected <eoi> at "oops"
显着改进:
- "keyword boundaries" 的正确解析(这就是
kw()
助手的作用)。这意味着如果您有一个 以 开头的标识符,它可能是一个关键字,它不会被错误地标记为该关键字(原始的基于 Lex 的方法会发生这种情况) - 关键字通常不区分大小写 (
qi::no_case[]
) - 仅用于演示 - 船长更容易指定,同时可读
我已经在这个答案的基于 Lex 的版本中做了一些事情:船长现在被封装在语法中。我认为,如果用户可能确实需要更改船长,则船长仅应由用户提供。在 99% 的情况下,船长与解析器紧密耦合,使用错误的船长无论如何都会破坏语法。
作为奖励,调用变得更加清晰:
AST::program program;
parse(str.begin(), str.end(), grammar, program);
return program;
- 注意到我是如何将
parse_verilog_file
函数简化为...成为一个函数(返回结果),将生成和处理结果分开
AST::program parse_verilog_file(std::string const& str) {
typedef std::string::const_iterator iterator;
static const verilog::verilog_grammar<iterator> grammar; // Our parser, now stateless
try {
AST::program program;
parse(str.begin(), str.end(), grammar, program);
return program;
} catch(qi::expectation_failure<iterator> const& ef) {
std::ostringstream msg;
msg << "Parsing failed: expected " << ef.what_ << " at " << std::quoted(std::string(ef.first, ef.last));
throw std::runtime_error(msg.str());
}
}
这反过来又通过捕获异常显示了简化的错误处理
反过来我用另一个期望点来替换
iter!=end
检查:
program = statements > qi::eoi;
这个,结合
statements = -statement % ';';
使 ';`` is required between statements, but not at the end of the program (which I _guess_ is what you wanted to convey with the old
endmodule` 规则)
Notice as well that
-statement % ';'
makes it so that empty statements are acceptable. If that's not what you wanted, drop the'-
Note that the added test cases test and demonstrate the error detection/reporting for this logic (
"module a();endmodule;oops"
results inParsing failed: expected <eoi> at "oops"
)
任何 "tokens" 像 "identifier" 现在都是 "lexeme" 规则,因为它们不服从船长⁵期望:Live On Wandbox
<module_input_list> <try>in_1,in_2,in_3);endm</try> <identifier> <try>in_1,in_2,in_3);endm</try> <success>,in_2,in_3);endmodul</success> <attributes>[[i, n, _, 1]]</attributes> </identifier> <identifier> <try>in_2,in_3);endmodule</try> <success>,in_3);endmodule;oop</success> <attributes>[[i, n, _, 2]]</attributes> </identifier> <identifier> <try>in_3);endmodule;oops</try> <success>);endmodule;oops</success> <attributes>[[i, n, _, 3]]</attributes> </identifier> <success>);endmodule;oops</success> <attributes>[[[i, n, _, 1], [i, n, _, 2], [i, n, _, 3]]]</attributes> </module_input_list>
哦,代码明显更短,同时做得更多:代码从 211 行减少到 112 行 (-47%)
- 它的编译速度明显加快(在我的系统上从 19.7 秒下降到 12.1 秒)
- 哦,鉴于目前的特征,可以进一步简化:This clocks in at 90 LoC。但是,我反而鼓励改进语法的功能,例如这里
- Boost.Qi rule with skipper does not match '.' character 还显示了一个 Verilog 解析器,并且有更多关于
kw()
工具的信息 - cannot get boost::spirit parser&lexer working for token types other than std::string or int or double 以类似的 Verilog 语法解析模块、学科、性质,并使用 Lex
- Boost.Qi rule with skipper does not match '.' character 还显示了一个 Verilog 解析器,并且有更多关于
¹ 轶事:"nobody uses that anymore"。我不这么说是因为我不知道 (see), and I'm not alone. From this 2017 answer:
using Lex makes most of the sweet-spot disappear since all "highlevel" parsers (like real_parser, [u]int_parser) are out the window. The Spirit devs are on record they prefer not to use Lex. Moreover, Spirit X3 doesn't have Lex support anymore.
² 我猜 MSVC,不是最新的?罪魁祸首是名字不明确,因为你使用 using namespace
.
³ Boost Spirit: "Semantic actions are evil"?
⁴ 在 SO 上看到很多答案:https://whosebug.com/search?q=user%3A85371+spirit+single-element
⁵ 看看 Boost spirit skipper issues 我对船长、规则声明和词素如何相互作用的描述