使用 htmlparser2 解析 xml 以提取特定标签的文本
parsing xml to extract text of a specific tag using htmlparser2
我正在试用 node-htmlparser2,但一开始就卡住了。我有成千上万的 xml 文件,如下所示:
<document … loads of attribs …>
<foo … loads of attribs …>
<loads…> … </loads>
<of…> … </of>
<other…> … </other>
<tags…> … </tags>
</foo>
</document>
我希望把 <foo></foo> 中的所有内容提取为一个字符串。我下面的代码可以运行,但在我看来这不是正确的做法:
// Collects the text found inside <foo>…</foo> for every file in `files`,
// using htmlparser2's streaming (SAX-style) callbacks.
//
// `htmlparser` (the htmlparser2 module) and `files` (assumed to be an array
// of file paths — TODO confirm) must be provided by the surrounding code.
const { readFileSync } = require('fs');

// Parser state shared by the callbacks below.
let isFoo = false; // true while the parser is between <foo> and </foo>
let txt = '';      // text accumulated for the document being parsed

const p = new htmlparser.Parser({
  onopentag(name) {
    if (name === 'foo') {
      isFoo = true;
    }
  },
  ontext(text) {
    if (isFoo) {
      txt += text;
    }
  },
  onclosetag(tagname) {
    // Return values of parser callbacks are ignored, so the collected text
    // is read from `txt` after the document has been parsed.
    if (tagname === 'foo') {
      isFoo = false;
    }
  }
}, {decodeEntities: true, xmlMode: true});

const data = [];
for (const file of files) {
  // Start every document from a clean state: clear the accumulator and
  // reset the parser, which otherwise rejects write() after end().
  isFoo = false;
  txt = '';
  p.reset();

  // The parser must be fed the file's contents, not its name. write() itself
  // returns nothing; the callbacks run synchronously during write()/end().
  p.write(readFileSync(file, 'utf8'));
  p.end();

  data.push({
    filename: file,
    filetext: txt
  });
}
有没有更好的方法来使用 htmlparser2,而不需要那个愚蠢的 isFoo 标志?
这是一种可能的方法,灵感来自 DomHandler 的 NPM 页面上给出的示例,以及对 h.DomUtils 做的一次丑陋的 console.log:
// Builds a DOM for each XML file with htmlparser2's DomHandler and stores the
// text of its first <foo> element in `data`.
//
// NOTE: fs.readFile is asynchronous, so `data` is filled in as each read
// completes; every record ends up with either `filetext` or `err`.
const h = require('htmlparser2');
const fs = require('fs');
const data = []; // your output
files.forEach((file) => { // files is assumed to be populated
  const record = {
    filename: file
  };
  data.push(record);
  const dh = new h.DomHandler((err, dom) => {
    if (err) {
      record.err = err;
      return;
    }
    // DomUtils has many useful methods, most of them you know already, pick your preferred
    // (recurse = true, limit = 1: stop searching after the first match)
    const [e] = h.DomUtils.getElementsByTagName('foo', dom, true, 1);
    if (!e) {
      // getText() would throw on undefined; report the missing element instead.
      record.err = new Error(`no <foo> element found in ${file}`);
      return;
    }
    // getText: only text nodes, getInnerHTML: everything, inner tags included
    record.filetext = h.DomUtils.getText(e);
  });
  const parser = new h.Parser(dh, {decodeEntities: true, xmlMode: true});
  fs.readFile(file, (err, content) => {
    if (err) {
      record.err = err;
      return;
    }
    parser.write(content);
    parser.end();
  });
});
我正在试用 node-htmlparser2,但一开始就卡住了。我有成千上万的 xml 文件,如下所示:
<document … loads of attribs …>
<foo … loads of attribs …>
<loads…> … </loads>
<of…> … </of>
<other…> … </other>
<tags…> … </tags>
</foo>
</document>
我希望把 <foo></foo> 中的所有内容提取为一个字符串。我下面的代码可以运行,但在我看来这不是正确的做法:
// Collects the text found inside <foo>…</foo> for every file in `files`,
// using htmlparser2's streaming (SAX-style) callbacks.
//
// `htmlparser` (the htmlparser2 module) and `files` (assumed to be an array
// of file paths — TODO confirm) must be provided by the surrounding code.
const { readFileSync } = require('fs');

// Parser state shared by the callbacks below.
let isFoo = false; // true while the parser is between <foo> and </foo>
let txt = '';      // text accumulated for the document being parsed

const p = new htmlparser.Parser({
  onopentag(name) {
    if (name === 'foo') {
      isFoo = true;
    }
  },
  ontext(text) {
    if (isFoo) {
      txt += text;
    }
  },
  onclosetag(tagname) {
    // Return values of parser callbacks are ignored, so the collected text
    // is read from `txt` after the document has been parsed.
    if (tagname === 'foo') {
      isFoo = false;
    }
  }
}, {decodeEntities: true, xmlMode: true});

const data = [];
for (const file of files) {
  // Start every document from a clean state: clear the accumulator and
  // reset the parser, which otherwise rejects write() after end().
  isFoo = false;
  txt = '';
  p.reset();

  // The parser must be fed the file's contents, not its name. write() itself
  // returns nothing; the callbacks run synchronously during write()/end().
  p.write(readFileSync(file, 'utf8'));
  p.end();

  data.push({
    filename: file,
    filetext: txt
  });
}
有没有更好的方法来使用 htmlparser2,而不需要那个愚蠢的 isFoo 标志?
这是一种可能的方法,灵感来自 DomHandler 的 NPM 页面上给出的示例,以及对 h.DomUtils 做的一次丑陋的 console.log:
// Builds a DOM for each XML file with htmlparser2's DomHandler and stores the
// text of its first <foo> element in `data`.
//
// NOTE: fs.readFile is asynchronous, so `data` is filled in as each read
// completes; every record ends up with either `filetext` or `err`.
const h = require('htmlparser2');
const fs = require('fs');
const data = []; // your output
files.forEach((file) => { // files is assumed to be populated
  const record = {
    filename: file
  };
  data.push(record);
  const dh = new h.DomHandler((err, dom) => {
    if (err) {
      record.err = err;
      return;
    }
    // DomUtils has many useful methods, most of them you know already, pick your preferred
    // (recurse = true, limit = 1: stop searching after the first match)
    const [e] = h.DomUtils.getElementsByTagName('foo', dom, true, 1);
    if (!e) {
      // getText() would throw on undefined; report the missing element instead.
      record.err = new Error(`no <foo> element found in ${file}`);
      return;
    }
    // getText: only text nodes, getInnerHTML: everything, inner tags included
    record.filetext = h.DomUtils.getText(e);
  });
  const parser = new h.Parser(dh, {decodeEntities: true, xmlMode: true});
  fs.readFile(file, (err, content) => {
    if (err) {
      record.err = err;
      return;
    }
    parser.write(content);
    parser.end();
  });
});