如何使用 Cheerio 抓取网络数据?
How to scrape web data using Cheerio?
我想从一个 URL 中抓取一些内容,希望能得到解决方案。
我在 server.js 中有代码:
// Fetch a page over HTTP and print its text content using Cheerio.
const request = require('request');
const cheerio = require('cheerio');

const url = "domain[dot]com/title-to-video";

request(url, function (error, response, html) {
  if (!error && response.statusCode === 200) {
    // Load the response body into Cheerio and print its text.
    const $ = cheerio.load(html);
    console.log($.text());
  } else {
    // `console` itself is not callable; calling it threw a TypeError and hid
    // the real failure, so report through console.error instead.
    console.error(`We've encountered an error: ${error}`);
  }
});
它返回的 HTML 如下:
<html>
<head>
<title>Website Title</title>
</head>
<body>
...
<script>
getplayer.setvd1('http://abc[dot]com/video34345453.mp4');
getplayer.setvd2('http://abc[dot]com/video43243234.mp4');
</script>
...
</body>
</html>
我希望输出为:
{
http://abc[dot]com/video34345453.mp4,
http://abc[dot]com/video43243234.mp4
}
更新:
在网上搜索之后,我尝试了如下修复:
// Data of the first child node of the first <script> without a src attribute.
const strvd1 = $('script:not([src])')[0].children[0].data;
// Accept both http: and https: URLs. The sample page uses http:, which the
// https:-only pattern never matched, so indexing the null result threw.
const matchvd1 = strvd1.match(/setvd1\('(https?:.+?)'\);/);
// null when the script contains no setvd1('...') call.
const resultvd1 = matchvd1 ? matchvd1[1] : null;
output: http://abc[dot]com/video34345453.mp4
// Data of the first child node of the first <script> without a src attribute.
const strvd2 = $('script:not([src])')[0].children[0].data;
// Accept both http: and https: URLs. The sample page uses http:, which the
// https:-only pattern never matched, so indexing the null result threw.
const matchvd2 = strvd2.match(/setvd2\('(https?:.+?)'\);/);
// null when the script contains no setvd2('...') call.
const resultvd2 = matchvd2 ? matchvd2[1] : null;
output: http://abc[dot]com/video43243234.mp4
如有任何解决方案,不胜感激。
在你的 if 语句中,可以尝试类似下面的写法:
// Gather the text of every <script> element on the page.
const $ = cheerio.load(html);
const script = $('script').text();

// Capture the quoted argument of each setvdN('...') call. Scanning with a
// pattern (rather than splitting on ';') skips unrelated statements and
// tolerates ';' or ')' inside a URL.
const videoUrlPattern = /setvd\d+\(\s*(['"])(.*?)\1\s*\)/g;
const videoUrls = [];
let found;
while ((found = videoUrlPattern.exec(script)) !== null) {
  videoUrls.push(found[2]);
}

// Print one URL per line inside braces, with a comma after every URL except
// the last, matching the requested output format.
console.log('{');
videoUrls.forEach(function (videoUrl, index) {
  const separator = index < videoUrls.length - 1 ? ',' : '';
  console.log(videoUrl + separator);
});
console.log('}');
您可以通过以下方式访问脚本内容:
1)
// Replace 'attributeName' with the name of the attribute to read.
$('script').get()[0].attribs['attributeName']
2)
// Load the markup with Cheerio's xmlMode option turned off.
let $ = cheerio.load(html, {xmlMode: false});
我想从一个 URL 中抓取一些内容,希望能得到解决方案。
我在 server.js 中有代码:
// Fetch a page over HTTP and print its text content using Cheerio.
const request = require('request');
const cheerio = require('cheerio');

const url = "domain[dot]com/title-to-video";

request(url, function (error, response, html) {
  if (!error && response.statusCode === 200) {
    // Load the response body into Cheerio and print its text.
    const $ = cheerio.load(html);
    console.log($.text());
  } else {
    // `console` itself is not callable; calling it threw a TypeError and hid
    // the real failure, so report through console.error instead.
    console.error(`We've encountered an error: ${error}`);
  }
});
它返回的 HTML 如下:
<html>
<head>
<title>Website Title</title>
</head>
<body>
...
<script>
getplayer.setvd1('http://abc[dot]com/video34345453.mp4');
getplayer.setvd2('http://abc[dot]com/video43243234.mp4');
</script>
...
</body>
</html>
我希望输出为:
{
http://abc[dot]com/video34345453.mp4,
http://abc[dot]com/video43243234.mp4
}
更新: 在网上搜索之后,我尝试了如下修复:
// Data of the first child node of the first <script> without a src attribute.
const strvd1 = $('script:not([src])')[0].children[0].data;
// Accept both http: and https: URLs. The sample page uses http:, which the
// https:-only pattern never matched, so indexing the null result threw.
const matchvd1 = strvd1.match(/setvd1\('(https?:.+?)'\);/);
// null when the script contains no setvd1('...') call.
const resultvd1 = matchvd1 ? matchvd1[1] : null;
output: http://abc[dot]com/video34345453.mp4
// Data of the first child node of the first <script> without a src attribute.
const strvd2 = $('script:not([src])')[0].children[0].data;
// Accept both http: and https: URLs. The sample page uses http:, which the
// https:-only pattern never matched, so indexing the null result threw.
const matchvd2 = strvd2.match(/setvd2\('(https?:.+?)'\);/);
// null when the script contains no setvd2('...') call.
const resultvd2 = matchvd2 ? matchvd2[1] : null;
output: http://abc[dot]com/video43243234.mp4
如有任何解决方案,不胜感激。
在你的 if 语句中,可以尝试类似下面的写法:
// Gather the text of every <script> element on the page.
const $ = cheerio.load(html);
const script = $('script').text();

// Capture the quoted argument of each setvdN('...') call. Scanning with a
// pattern (rather than splitting on ';') skips unrelated statements and
// tolerates ';' or ')' inside a URL.
const videoUrlPattern = /setvd\d+\(\s*(['"])(.*?)\1\s*\)/g;
const videoUrls = [];
let found;
while ((found = videoUrlPattern.exec(script)) !== null) {
  videoUrls.push(found[2]);
}

// Print one URL per line inside braces, with a comma after every URL except
// the last, matching the requested output format.
console.log('{');
videoUrls.forEach(function (videoUrl, index) {
  const separator = index < videoUrls.length - 1 ? ',' : '';
  console.log(videoUrl + separator);
});
console.log('}');
您可以通过以下方式访问脚本内容:
1) $('script').get()[0].attribs['属性名']
2) let $ = cheerio.load(html, {xmlMode: false});