使用 Node.js 抓取 JavaScript 生成的网站
Scraping JavaScript-generated website with Node.js
当我解析静态 HTML 页面时,我的 Node.js 应用运行良好。但是,当 URL 指向由 JavaScript 生成的页面时,该应用就无法正常工作。如何抓取由 JavaScript 生成的网页?
我的 app.js:
var express = require('express'),
fs = require('fs'),
request = require('request'),
cheerio = require('cheerio'),
app = express();
/**
 * GET /scrape
 *
 * Downloads a fixed page, collects the text of every `.panel-title` element
 * found under <body>, writes the result to `output.json` as pretty-printed
 * JSON and replies with a short plain-text message.
 *
 * The HTML is parsed with cheerio.load(); only markup present in the
 * downloaded response body is available to the selector.
 */
app.get('/scrape', function (req, res) {
  // Declared locally: assigning to an undeclared `url` created an implicit global.
  var url = 'http://www.apache.org/';

  request(url, function (error, response, html) {
    // Initialised before the error check so that there is always an object to
    // serialise; previously a failed request left `json` undefined and
    // JSON.stringify(undefined) yields undefined rather than a JSON string.
    var json = { title: '' };

    if (error) {
      // Report the failure instead of silently ignoring it; the default
      // (empty) title is written below.
      console.error('Request for ' + url + ' failed:', error);
    } else {
      var $ = cheerio.load(html);
      // Direct query instead of abusing .filter() for its side effect.
      json.title = $('body').find('.panel-title').text();
    }

    fs.writeFile('output.json', JSON.stringify(json, null, 4), function (err) {
      if (err) {
        // Do not claim success when the file could not be written.
        console.error('Could not write output.json:', err);
        return;
      }
      console.log('File successfully written! - Check your project directory for the output.json file');
    });

    // This app has no UI, so the browser only gets a pointer to the console.
    // The response is sent without waiting for the file write to finish.
    res.send('Check your console!');
  });
});
// Start listening on port 8081 (the port is passed as a string). No listen
// callback is supplied, so the message below is logged right after the call.
app.listen('8081');
console.log('Magic happens on port 8081');

// Export the Express app and keep the local `exports` alias pointing at it.
module.exports = app;
exports = module.exports;
Cheerio 不会执行页面中的 JavaScript,因为它只用于解析普通 HTML。
我建议换一种方案,例如使用 PhantomJS:http://phantomjs.org/
当我解析静态 HTML 页面时,我的 Node.js 应用运行良好。但是,当 URL 指向由 JavaScript 生成的页面时,该应用就无法正常工作。如何抓取由 JavaScript 生成的网页?
我的 app.js:
var express = require('express'),
fs = require('fs'),
request = require('request'),
cheerio = require('cheerio'),
app = express();
/**
 * GET /scrape
 *
 * Downloads a fixed page, collects the text of every `.panel-title` element
 * found under <body>, writes the result to `output.json` as pretty-printed
 * JSON and replies with a short plain-text message.
 *
 * The HTML is parsed with cheerio.load(); only markup present in the
 * downloaded response body is available to the selector.
 */
app.get('/scrape', function (req, res) {
  // Declared locally: assigning to an undeclared `url` created an implicit global.
  var url = 'http://www.apache.org/';

  request(url, function (error, response, html) {
    // Initialised before the error check so that there is always an object to
    // serialise; previously a failed request left `json` undefined and
    // JSON.stringify(undefined) yields undefined rather than a JSON string.
    var json = { title: '' };

    if (error) {
      // Report the failure instead of silently ignoring it; the default
      // (empty) title is written below.
      console.error('Request for ' + url + ' failed:', error);
    } else {
      var $ = cheerio.load(html);
      // Direct query instead of abusing .filter() for its side effect.
      json.title = $('body').find('.panel-title').text();
    }

    fs.writeFile('output.json', JSON.stringify(json, null, 4), function (err) {
      if (err) {
        // Do not claim success when the file could not be written.
        console.error('Could not write output.json:', err);
        return;
      }
      console.log('File successfully written! - Check your project directory for the output.json file');
    });

    // This app has no UI, so the browser only gets a pointer to the console.
    // The response is sent without waiting for the file write to finish.
    res.send('Check your console!');
  });
});
// Start listening on port 8081 (the port is passed as a string). No listen
// callback is supplied, so the message below is logged right after the call.
app.listen('8081');
console.log('Magic happens on port 8081');

// Export the Express app and keep the local `exports` alias pointing at it.
module.exports = app;
exports = module.exports;
Cheerio 不会执行页面中的 JavaScript,因为它只用于解析普通 HTML。
我建议换一种方案,例如使用 PhantomJS:http://phantomjs.org/