html 使用 Node JS 进行操作
html manipulation with Node JS
我想从源(link 或文件,...)获取 html 并且
从中找到价值。
html 格式为:
<!doctype html>
<html>
<body>
<main>
<section id="serp">
<div>
<article>a</article>
<article>b</article>
<article>c</article>
<article>d</article>
</div>
</section>
</main>
</body>
</html>
首先我使用了cheerio。
根据我写的文档:
const cheerio = require('cheerio');
const $ = cheerio.load(myhtml);
const content = $('#serp div').children();
console.log(content); // null
根据相同的程序,我使用了 x-ray 和 jsdom,但它们都是
打印空值。
我做了以下事情:
let myhtml = `<!doctype html>
<html>
<body>
<main>
<section id="serp">
<div>
<article>a</article>
<article>b</article>
<article>c</article>
<article>d</article>
</div>
</section>
</main>
</body>
</html>`;
const cheerio = require('cheerio');
const $ = cheerio.load(myhtml);
const content = $('#serp div').children();
console.log(content);
console.log(`html: ${content.html()}`);
它向控制台输出以下内容:
initialize {
'0':
{ type: 'tag',
name: 'article',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [ [Object] ],
parent:
{ type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Object],
parent: [Object],
prev: [Object],
next: [Object] },
prev:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: null,
next: [Circular] },
next:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: [Circular],
next: [Object] } },
'1':
{ type: 'tag',
name: 'article',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [ [Object] ],
parent:
{ type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Object],
parent: [Object],
prev: [Object],
next: [Object] },
prev:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: [Object],
next: [Circular] },
next:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: [Circular],
next: [Object] } },
'2':
{ type: 'tag',
name: 'article',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [ [Object] ],
parent:
{ type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Object],
parent: [Object],
prev: [Object],
next: [Object] },
prev:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: [Object],
next: [Circular] },
next:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: [Circular],
next: [Object] } },
'3':
{ type: 'tag',
name: 'article',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [ [Object] ],
parent:
{ type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Object],
parent: [Object],
prev: [Object],
next: [Object] },
prev:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: [Object],
next: [Circular] },
next:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: [Circular],
next: null } },
options:
{ withDomLvl1: true,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
_root:
initialize {
'0':
{ type: 'root',
name: 'root',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Object],
parent: null,
prev: null,
next: null },
options:
{ withDomLvl1: true,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
length: 1,
_root: [Circular] },
length: 4,
prevObject:
initialize {
'0':
{ type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Object],
parent: [Object],
prev: [Object],
next: [Object] },
options:
{ withDomLvl1: true,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
_root: initialize { '0': [Object], options: [Object], length: 1, _root: [Circular] },
length: 1,
prevObject: initialize { '0': [Object], options: [Object], length: 1, _root: [Circular] } } }
html: a
Process finished with exit code 0
我想从源(link 或文件,...)获取 html 并且 从中找到价值。 html 格式为:
<!doctype html>
<html>
<body>
<main>
<section id="serp">
<div>
<article>a</article>
<article>b</article>
<article>c</article>
<article>d</article>
</div>
</section>
</main>
</body>
</html>
首先我使用了cheerio。 根据我写的文档:
const cheerio = require('cheerio');
const $ = cheerio.load(myhtml);
const content = $('#serp div').children();
console.log(content); // null
根据相同的程序,我使用了 x-ray 和 jsdom,但它们都是 打印空值。
我做了以下事情:
let myhtml = `<!doctype html>
<html>
<body>
<main>
<section id="serp">
<div>
<article>a</article>
<article>b</article>
<article>c</article>
<article>d</article>
</div>
</section>
</main>
</body>
</html>`;
const cheerio = require('cheerio');
const $ = cheerio.load(myhtml);
const content = $('#serp div').children();
console.log(content);
console.log(`html: ${content.html()}`);
它向控制台输出以下内容:
initialize {
'0':
{ type: 'tag',
name: 'article',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [ [Object] ],
parent:
{ type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Object],
parent: [Object],
prev: [Object],
next: [Object] },
prev:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: null,
next: [Circular] },
next:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: [Circular],
next: [Object] } },
'1':
{ type: 'tag',
name: 'article',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [ [Object] ],
parent:
{ type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Object],
parent: [Object],
prev: [Object],
next: [Object] },
prev:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: [Object],
next: [Circular] },
next:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: [Circular],
next: [Object] } },
'2':
{ type: 'tag',
name: 'article',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [ [Object] ],
parent:
{ type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Object],
parent: [Object],
prev: [Object],
next: [Object] },
prev:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: [Object],
next: [Circular] },
next:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: [Circular],
next: [Object] } },
'3':
{ type: 'tag',
name: 'article',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [ [Object] ],
parent:
{ type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Object],
parent: [Object],
prev: [Object],
next: [Object] },
prev:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: [Object],
next: [Circular] },
next:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: [Circular],
next: null } },
options:
{ withDomLvl1: true,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
_root:
initialize {
'0':
{ type: 'root',
name: 'root',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Object],
parent: null,
prev: null,
next: null },
options:
{ withDomLvl1: true,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
length: 1,
_root: [Circular] },
length: 4,
prevObject:
initialize {
'0':
{ type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Object],
parent: [Object],
prev: [Object],
next: [Object] },
options:
{ withDomLvl1: true,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
_root: initialize { '0': [Object], options: [Object], length: 1, _root: [Circular] },
length: 1,
prevObject: initialize { '0': [Object], options: [Object], length: 1, _root: [Circular] } } }
html: a
Process finished with exit code 0