使用 parsimmon 库解析基于缩进的语言
Parsing an indentation based language using parsimmon library
我的问题受到 this one 的启发,但对于 javascript,使用的是 parsimmon 解析器组合器库。我想解析缩进敏感的语言,例如 python 或 yaml.
我已经很容易地将那个答案中的 scala 示例转换为 javascript - 关键是 parsimmon 中的 chain
函数,它相当于 >>
运算符在 Scala 的解析器组合器中——它们都接受一个解析器和一个 returns 解析器的函数,第一个解析器的结果被传递给函数以选择下一个解析器。
但是,我不太清楚如何使这个递归。该示例适用于单个块 - 我看不到如何创建嵌套块,根据需要跟踪去凹痕级别以解析类似 python.
的内容
好吧,这是一种方法。它可能不是最好的,而且绝对不直观(我不确定我是否理解它为什么起作用,我 写了 它 :) 但它似乎非常强大。
基本上它是这样说的:tree
是一个 line
,后面跟着一个 block
。 block
又是 tree
的缩进列表。
indent
是一个采用当前缩进级别的函数,而 returns 是一个解析器,它期望一行缩进超过该级别。返回的解析器 returns 一堆以前的缩进级别。
我之前说过它很健壮 - 事实上,它 太 健壮了。它接受实际上应该抛出错误的输入:如果您取消缩进到与先前级别不匹配的缩进级别,它基本上 "rounds up" 到下一个缩进级别。我不确定修复的逻辑应该去哪里 - 相互递归与解析器混合 "chain"-ing 很难理解!
var {regex, seq} = Parsimmon;
function tree(state) {
return seq(
line,
block(state).atMost(1).map(x => x[0]? x[0] : [])
);
}
function block(state) {
return indent(state)
.chain(tree).atLeast(1);
}
function indent(state) {
return regex(/\s/).atLeast(state + 1)
.map(x => x.length)
.desc('indent');
}
let item = regex(/[^\s].*/).desc('item');
let line = item.skip(regex(/\n?/));
let start = block(-1);
let result = start.parse('top item 1\n sub item 1\n sub item 2\n' +
' even deeper\n sub item 3\ntop item 2');
console.log(JSON.stringify(result['value'], null, 2));
<script src="https://cdn.rawgit.com/jneen/parsimmon/master/src/parsimmon.js"></script>
为了解析嵌套块,您基本上必须预处理 输入脚本并在缩进增加或减少的地方插入一些预定义的特殊字符 INDENT / DEDENT,然后才解析规则应该应用。这些特殊字符等同于传统语言中的大括号 { 和 },因此预处理有效地将您的语言从基于缩进的语言转换为以大括号分隔的语言,用于解析过程的内部目的。
作为如何完成此操作的示例,您可以查看 Hypertag, which is an indentation-based language implemented in Python. In particular, you may want to see the preprocess() method and the grammar 规范 - 后者假定已经执行了预处理并且所有缩进都已替换为 INDENT_* 和 DEDENT_* 字符。
我是 Parsimmon 的维护者。我意识到这个问题真的很老,但我偶然发现它并想回答。
GitHub 上的 parsimmon 存储库中的 python-ish.js
示例应该可以帮助您了解如何解析基于缩进的语言。
这与 Josh 的回答非常相似,但我认为更容易理解一些。
https://github.com/jneen/parsimmon/blob/master/examples/python-ish.js
"use strict";
// Run me with Node to see my output!
let util = require("util");
let P = require("..");
///////////////////////////////////////////////////////////////////////
// Because parsing indentation-sensitive languages such as Python requires
// tracking state, all of our parsers are created inside a function that takes
// the current parsing state. In this case it's just the current indentation
// level, but a real Python parser would also *at least* need to keep track of
// whether the current parsing is inside of () or [] or {} so that you can know
// to ignore all whitespace, instead of further tracking indentation.
//
// Implementing all of Python's various whitespace requirements, including
// comments and line continuations (backslash at the end of the line) is left as
// an exercise for the reader. I've tried and frankly it's pretty tricky.
function PyX(indent) {
return P.createLanguage({
// This is where the magic happens. Basically we need to parse a deeper
// indentation level on the first statement of the block and keep track of
// new indentation level. Then we make a whole new set of parsers that use
// that new indentation level for all their parsing. Each line past the
// first is required to be indented to the same level as that new deeper
// indentation level.
Block: r =>
P.seqObj(
P.string("block:"),
r.NL,
["n", r.IndentMore],
["first", r.Statement]
).chain(args => {
const { n, first } = args;
return PyX(n)
.RestStatement.many()
.map(rest => ["BLOCK", first, ...rest]);
}),
// This is just a statement in our language. To simplify, this is either a
// block of code or just an identifier
Statement: r => P.alt(r.Block, r.Ident),
// This is a statement which is indented to the level of the current parse
// state. It's called RestStatement because the first statement in a block
// is indented more than the previous state, but the *rest* of the
// statements match up with the new state.
RestStatement: r => r.IndentSame.then(r.Statement),
// Just a variable and then the end of the line.
Ident: r => P.regexp(/[a-z]+/i).skip(r.End),
// Consume zero or more spaces and then return the number consumed. For a
// more Python-like language, this parser would also accept tabs and then
// expand them to the correct number of spaces
//
// https://docs.python.org/3/reference/lexical_analysis.html#indentation
CountSpaces: () => P.regexp(/[ ]*/).map(s => s.length),
// Count the current indentation level and assert it's more than the current
// parse state's desired indentation
IndentSame: r =>
r.CountSpaces.chain(n => {
if (n === indent) {
return P.of(n);
}
return P.fail(`${n} spaces`);
}),
// Count the current indentation level and assert it's equal to the current
// parse state's desired indentation
IndentMore: r =>
r.CountSpaces.chain(n => {
if (n > indent) {
return P.of(n);
}
return P.fail(`more than ${n} spaces`);
}),
// Support all three standard text file line endings
NL: () => P.alt(P.string("\r\n"), P.oneOf("\r\n")),
// Lines should always end in a newline sequence, but many files are missing
// the final newline
End: r => P.alt(r.NL, P.eof)
});
}
// Start parsing at zero indentation
let Pythonish = PyX(0);
///////////////////////////////////////////////////////////////////////
let text = `\
block:
alpha
bravo
block:
charlie
delta
echo
block:
foxtrot
golf
`;
function prettyPrint(x) {
let opts = { depth: null, colors: "auto" };
let s = util.inspect(x, opts);
console.log(s);
}
let ast = Pythonish.Statement.tryParse(text);
prettyPrint(ast);
我的问题受到 this one 的启发,但对于 javascript,使用的是 parsimmon 解析器组合器库。我想解析缩进敏感的语言,例如 python 或 yaml.
我已经很容易地将那个答案中的 scala 示例转换为 javascript - 关键是 parsimmon 中的 chain
函数,它相当于 >>
运算符在 Scala 的解析器组合器中——它们都接受一个解析器和一个 returns 解析器的函数,第一个解析器的结果被传递给函数以选择下一个解析器。
但是,我不太清楚如何使这个递归。该示例适用于单个块 - 我看不到如何创建嵌套块,根据需要跟踪去凹痕级别以解析类似 python.
的内容好吧,这是一种方法。它可能不是最好的,而且绝对不直观(我不确定我是否理解它为什么起作用,我 写了 它 :) 但它似乎非常强大。
基本上它是这样说的:tree
是一个 line
,后面跟着一个 block
。 block
又是 tree
的缩进列表。
indent
是一个采用当前缩进级别的函数,而 returns 是一个解析器,它期望一行缩进超过该级别。返回的解析器 returns 一堆以前的缩进级别。
我之前说过它很健壮 - 事实上,它 太 健壮了。它接受实际上应该抛出错误的输入:如果您取消缩进到与先前级别不匹配的缩进级别,它基本上 "rounds up" 到下一个缩进级别。我不确定修复的逻辑应该去哪里 - 相互递归与解析器混合 "chain"-ing 很难理解!
var {regex, seq} = Parsimmon;
function tree(state) {
return seq(
line,
block(state).atMost(1).map(x => x[0]? x[0] : [])
);
}
function block(state) {
return indent(state)
.chain(tree).atLeast(1);
}
function indent(state) {
return regex(/\s/).atLeast(state + 1)
.map(x => x.length)
.desc('indent');
}
let item = regex(/[^\s].*/).desc('item');
let line = item.skip(regex(/\n?/));
let start = block(-1);
let result = start.parse('top item 1\n sub item 1\n sub item 2\n' +
' even deeper\n sub item 3\ntop item 2');
console.log(JSON.stringify(result['value'], null, 2));
<script src="https://cdn.rawgit.com/jneen/parsimmon/master/src/parsimmon.js"></script>
为了解析嵌套块,您基本上必须预处理 输入脚本并在缩进增加或减少的地方插入一些预定义的特殊字符 INDENT / DEDENT,然后才解析规则应该应用。这些特殊字符等同于传统语言中的大括号 { 和 },因此预处理有效地将您的语言从基于缩进的语言转换为以大括号分隔的语言,用于解析过程的内部目的。
作为如何完成此操作的示例,您可以查看 Hypertag, which is an indentation-based language implemented in Python. In particular, you may want to see the preprocess() method and the grammar 规范 - 后者假定已经执行了预处理并且所有缩进都已替换为 INDENT_* 和 DEDENT_* 字符。
我是 Parsimmon 的维护者。我意识到这个问题真的很老,但我偶然发现它并想回答。
GitHub 上的 parsimmon 存储库中的 python-ish.js
示例应该可以帮助您了解如何解析基于缩进的语言。
这与 Josh 的回答非常相似,但我认为更容易理解一些。
https://github.com/jneen/parsimmon/blob/master/examples/python-ish.js
"use strict";
// Run me with Node to see my output!
let util = require("util");
let P = require("..");
///////////////////////////////////////////////////////////////////////
// Because parsing indentation-sensitive languages such as Python requires
// tracking state, all of our parsers are created inside a function that takes
// the current parsing state. In this case it's just the current indentation
// level, but a real Python parser would also *at least* need to keep track of
// whether the current parsing is inside of () or [] or {} so that you can know
// to ignore all whitespace, instead of further tracking indentation.
//
// Implementing all of Python's various whitespace requirements, including
// comments and line continuations (backslash at the end of the line) is left as
// an exercise for the reader. I've tried and frankly it's pretty tricky.
function PyX(indent) {
return P.createLanguage({
// This is where the magic happens. Basically we need to parse a deeper
// indentation level on the first statement of the block and keep track of
// new indentation level. Then we make a whole new set of parsers that use
// that new indentation level for all their parsing. Each line past the
// first is required to be indented to the same level as that new deeper
// indentation level.
Block: r =>
P.seqObj(
P.string("block:"),
r.NL,
["n", r.IndentMore],
["first", r.Statement]
).chain(args => {
const { n, first } = args;
return PyX(n)
.RestStatement.many()
.map(rest => ["BLOCK", first, ...rest]);
}),
// This is just a statement in our language. To simplify, this is either a
// block of code or just an identifier
Statement: r => P.alt(r.Block, r.Ident),
// This is a statement which is indented to the level of the current parse
// state. It's called RestStatement because the first statement in a block
// is indented more than the previous state, but the *rest* of the
// statements match up with the new state.
RestStatement: r => r.IndentSame.then(r.Statement),
// Just a variable and then the end of the line.
Ident: r => P.regexp(/[a-z]+/i).skip(r.End),
// Consume zero or more spaces and then return the number consumed. For a
// more Python-like language, this parser would also accept tabs and then
// expand them to the correct number of spaces
//
// https://docs.python.org/3/reference/lexical_analysis.html#indentation
CountSpaces: () => P.regexp(/[ ]*/).map(s => s.length),
// Count the current indentation level and assert it's more than the current
// parse state's desired indentation
IndentSame: r =>
r.CountSpaces.chain(n => {
if (n === indent) {
return P.of(n);
}
return P.fail(`${n} spaces`);
}),
// Count the current indentation level and assert it's equal to the current
// parse state's desired indentation
IndentMore: r =>
r.CountSpaces.chain(n => {
if (n > indent) {
return P.of(n);
}
return P.fail(`more than ${n} spaces`);
}),
// Support all three standard text file line endings
NL: () => P.alt(P.string("\r\n"), P.oneOf("\r\n")),
// Lines should always end in a newline sequence, but many files are missing
// the final newline
End: r => P.alt(r.NL, P.eof)
});
}
// Start parsing at zero indentation
let Pythonish = PyX(0);
///////////////////////////////////////////////////////////////////////
let text = `\
block:
alpha
bravo
block:
charlie
delta
echo
block:
foxtrot
golf
`;
function prettyPrint(x) {
let opts = { depth: null, colors: "auto" };
let s = util.inspect(x, opts);
console.log(s);
}
let ast = Pythonish.Statement.tryParse(text);
prettyPrint(ast);