使用 phantomJS 抓取 javascript 网页
Scraping javascript webpage with phantomJS
我正在尝试使用 phantomJS 抓取一个动态网页。下面是我的代码以及要抓取的 url。这段代码对其他 url 都有效,但对这个 url 总是返回一个空白的 html 文档。有谁知道如何解决这个问题?
我对javascript不是很熟悉,所以这段代码是从别处复制的。我已将超时时间从 2.5 秒增加到 30 秒,但没有任何区别。
// Profile page whose rendered HTML should be captured.
var url ='https://www.amazon.com/gp/profile/amzn1.account.AFJ6MBZ5CSY4R6K4USNMQ7JWEQCA/';
var fs = require('fs');
var page = new WebPage();

/**
 * Waits a fixed 30 seconds, then writes the page's current content to
 * page.html and exits PhantomJS.
 */
function just_wait() {
    var delayMs = 30000;
    var dumpAndExit = function () {
        fs.write('page.html', page.content, 'w');
        phantom.exit();
    };
    setTimeout(dumpAndExit, delayMs);
}

// The load status is not inspected; the wait starts whatever the outcome.
page.open(url, function onOpened(status) {
    just_wait();
});
我就是这样解决这些问题的。
app.js
// Profile page to load.
var url ='https://www.amazon.com/gp/profile/amzn1.account.AFJ6MBZ5CSY4R6K4USNMQ7JWEQCA/';
// Step functions to execute, in order.
var steps = [];
// Index of the next step to execute.
var testindex = 0;
// True while a page is still loading.
var loadInProgress = false;
/*********SETTINGS*********************/
var fs = require('fs');
var settings = require('./settings');
var webPage = require('webpage');
var page = webPage.create();

// Per-page options.
page.settings.javascriptEnabled = true;
// Per the original author, the script is much faster without images.
page.settings.loadImages = false;
page.settings.userAgent = settings.userAgents.desktop;
page.viewportSize = {
    height: settings.viewport.desktop.height,
    width: settings.viewport.desktop.width
};

// Process-wide options.
phantom.cookiesEnabled = true;
// NOTE(review): javascriptEnabled is normally a page.settings option; confirm
// that setting it on phantom has any effect.
phantom.javascriptEnabled = true;
/*********SETTINGS END*****************/
console.log('All settings Loaded, Start With Execution');
/**********DEFINE STEPS THAT PHANTOM SHOULD DO***********************/
/*
 * Ordered list of actions. Each entry is a zero-argument function; entries are
 * invoked one at a time, in array order.
 */
steps = [
    // Step 1: open the target URL and report whether the load succeeded.
    function(){
        console.log("Step 1 - Load Page => "+url);
        page.open(url, function(status){
            if(status === 'success'){
                console.log('Loaded');
                return;
            }
            console.log('Error Loading Page. Try Logging In Again');
            // A failed load is an error, so exit with a non-zero code to let
            // the calling shell or script detect the failure.
            phantom.exit(1);
        });
    },
    // Step 2: save a screenshot of the page as it currently stands.
    function(){
        page.render('./test.png');
    }
];
/**********END STEPS THAT PHANTOM SHOULD DO***********************/
// Handle of the polling timer. Declared explicitly so it is not an implicit
// global and so it can be cleared once all steps are done.
var interval = setInterval(executeRequestsStepByStep, 3000);

/**
 * Timer callback, invoked every 3000 ms.
 *
 * While steps remain, runs the next one unless a page load is in progress.
 * Once no step is left, stops the timer, writes the page content to page.html
 * and exits PhantomJS with code 0.
 */
function executeRequestsStepByStep(){
    var hasPendingStep = typeof steps[testindex] === "function";

    if (!hasPendingStep) {
        // Stop polling first so no further tick can rewrite page.html while
        // PhantomJS is shutting down.
        clearInterval(interval);
        console.log("Quiting");
        fs.write('page.html', page.content, 'w');
        phantom.exit(0);
        return;
    }

    // Hold the next step back until the current page load has finished.
    if (!loadInProgress) {
        steps[testindex]();
        testindex++;
    }
}
/*
 * Page lifecycle hooks.
 * They keep the loadInProgress flag in sync with the page: the flag is raised
 * when a load starts and lowered when it finishes. Per the original author,
 * without this the page content may be read before the page is fully loaded.
 */
page.onLoadFinished = function onLoadFinished() {
    loadInProgress = false;
};
page.onLoadStarted = function onLoadStarted() {
    loadInProgress = true;
};
// Console output from the page is intentionally ignored; log `msg` here when
// debugging.
page.onConsoleMessage = function onConsoleMessage(msg) {
};
/**
 * Global PhantomJS error handler: prints the error message, followed by one
 * line per stack frame when a trace is supplied, then exits with code 1.
 *
 * @param {string} msg - Error message.
 * @param {Array} trace - Stack frames; each frame's file (or sourceURL), line
 *     and optional function name are printed.
 */
phantom.onError = function(msg, trace) {
    var report = 'PHANTOM ERROR: ' + msg;
    if (trace && trace.length) {
        report += '\nTRACE:';
        trace.forEach(function(frame) {
            var location = frame.file || frame.sourceURL;
            var owner = frame.function ? ' (in function ' + frame.function +')' : '';
            report += '\n -> ' + location + ': ' + frame.line + owner;
        });
    }
    console.error(report);
    phantom.exit(1);
};
settings.js
module.exports = {
viewport: {
desktop: {
height: 663,
width: 1200
}
},
userAgents: {
desktop: 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
}
};
我已经用这个测试过,效果很好。
我正在尝试使用 phantomJS 抓取一个动态网页。下面是我的代码以及要抓取的 url。这段代码对其他 url 都有效,但对这个 url 总是返回一个空白的 html 文档。有谁知道如何解决这个问题?
我对javascript不是很熟悉,所以这段代码是从别处复制的。我已将超时时间从 2.5 秒增加到 30 秒,但没有任何区别。
// Profile page whose rendered HTML should be captured.
var url ='https://www.amazon.com/gp/profile/amzn1.account.AFJ6MBZ5CSY4R6K4USNMQ7JWEQCA/';
var fs = require('fs');
var page = new WebPage();

/**
 * Waits a fixed 30 seconds, then writes the page's current content to
 * page.html and exits PhantomJS.
 */
function just_wait() {
    var delayMs = 30000;
    var dumpAndExit = function () {
        fs.write('page.html', page.content, 'w');
        phantom.exit();
    };
    setTimeout(dumpAndExit, delayMs);
}

// The load status is not inspected; the wait starts whatever the outcome.
page.open(url, function onOpened(status) {
    just_wait();
});
我就是这样解决这些问题的。
app.js
// Profile page to load.
var url ='https://www.amazon.com/gp/profile/amzn1.account.AFJ6MBZ5CSY4R6K4USNMQ7JWEQCA/';
// Step functions to execute, in order.
var steps = [];
// Index of the next step to execute.
var testindex = 0;
// True while a page is still loading.
var loadInProgress = false;
/*********SETTINGS*********************/
var fs = require('fs');
var settings = require('./settings');
var webPage = require('webpage');
var page = webPage.create();

// Per-page options.
page.settings.javascriptEnabled = true;
// Per the original author, the script is much faster without images.
page.settings.loadImages = false;
page.settings.userAgent = settings.userAgents.desktop;
page.viewportSize = {
    height: settings.viewport.desktop.height,
    width: settings.viewport.desktop.width
};

// Process-wide options.
phantom.cookiesEnabled = true;
// NOTE(review): javascriptEnabled is normally a page.settings option; confirm
// that setting it on phantom has any effect.
phantom.javascriptEnabled = true;
/*********SETTINGS END*****************/
console.log('All settings Loaded, Start With Execution');
/**********DEFINE STEPS THAT PHANTOM SHOULD DO***********************/
/*
 * Ordered list of actions. Each entry is a zero-argument function; entries are
 * invoked one at a time, in array order.
 */
steps = [
    // Step 1: open the target URL and report whether the load succeeded.
    function(){
        console.log("Step 1 - Load Page => "+url);
        page.open(url, function(status){
            if(status === 'success'){
                console.log('Loaded');
                return;
            }
            console.log('Error Loading Page. Try Logging In Again');
            // A failed load is an error, so exit with a non-zero code to let
            // the calling shell or script detect the failure.
            phantom.exit(1);
        });
    },
    // Step 2: save a screenshot of the page as it currently stands.
    function(){
        page.render('./test.png');
    }
];
/**********END STEPS THAT PHANTOM SHOULD DO***********************/
// Handle of the polling timer. Declared explicitly so it is not an implicit
// global and so it can be cleared once all steps are done.
var interval = setInterval(executeRequestsStepByStep, 3000);

/**
 * Timer callback, invoked every 3000 ms.
 *
 * While steps remain, runs the next one unless a page load is in progress.
 * Once no step is left, stops the timer, writes the page content to page.html
 * and exits PhantomJS with code 0.
 */
function executeRequestsStepByStep(){
    var hasPendingStep = typeof steps[testindex] === "function";

    if (!hasPendingStep) {
        // Stop polling first so no further tick can rewrite page.html while
        // PhantomJS is shutting down.
        clearInterval(interval);
        console.log("Quiting");
        fs.write('page.html', page.content, 'w');
        phantom.exit(0);
        return;
    }

    // Hold the next step back until the current page load has finished.
    if (!loadInProgress) {
        steps[testindex]();
        testindex++;
    }
}
/*
 * Page lifecycle hooks.
 * They keep the loadInProgress flag in sync with the page: the flag is raised
 * when a load starts and lowered when it finishes. Per the original author,
 * without this the page content may be read before the page is fully loaded.
 */
page.onLoadFinished = function onLoadFinished() {
    loadInProgress = false;
};
page.onLoadStarted = function onLoadStarted() {
    loadInProgress = true;
};
// Console output from the page is intentionally ignored; log `msg` here when
// debugging.
page.onConsoleMessage = function onConsoleMessage(msg) {
};
/**
 * Global PhantomJS error handler: prints the error message, followed by one
 * line per stack frame when a trace is supplied, then exits with code 1.
 *
 * @param {string} msg - Error message.
 * @param {Array} trace - Stack frames; each frame's file (or sourceURL), line
 *     and optional function name are printed.
 */
phantom.onError = function(msg, trace) {
    var report = 'PHANTOM ERROR: ' + msg;
    if (trace && trace.length) {
        report += '\nTRACE:';
        trace.forEach(function(frame) {
            var location = frame.file || frame.sourceURL;
            var owner = frame.function ? ' (in function ' + frame.function +')' : '';
            report += '\n -> ' + location + ': ' + frame.line + owner;
        });
    }
    console.error(report);
    phantom.exit(1);
};
settings.js
module.exports = {
viewport: {
desktop: {
height: 663,
width: 1200
}
},
userAgents: {
desktop: 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
}
};
我已经用这个测试过,效果很好。