CasperJS evaluate() 不是从 each() 块内执行的
CasperJS evaluate() is not executing from within each() block
所以我正在抓取一个页面,收集链接,然后我想抓取这些链接以完成我的数据集。这是一些代码:
crawl.js:
var casper = require("casper").create({
waitTimeout: 3000,
pageSettings: {
userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:23.0) Gecko/20130404 Firefox/23.0"
},
clientScripts: ["includes/jquery.min.js"],
verbose: true
});
var followers = require('./followers');
var currentPage = 1;
var x = require('casper').selectXPath;
Object.size = function(obj) {
var size = 0, key;
for (key in obj) {
if (obj.hasOwnProperty(key)) size++
}
return size;
};
var collectFollowers = function() {
var url;
this.echo("capturing page " + currentPage);
this.capture("wowhead-p" + currentPage + ".png");
// don't go too far down the rabbit hole
if (currentPage >= 5 || !this.exists(x('//*[text()="Next ›"]'))) {
processFollowers.call(casper);
return terminate.call(casper);
}
currentPage++;
this.echo("requesting next page: " + currentPage);
url = this.getCurrentUrl();
var links = this.evaluate(function() {
var obj = {}
$('.listview-cleartext').map(function(){
obj[$(this).text()] = $(this).attr('href');
});
return obj;
});
for (key in links) {
followers.followers[key] = links[key];
}
this.echo("Page links: " + Object.size(followers.followers));
//this.emit('update.followers', links);
this.thenClick(x('//*[text()="Next ›"]')).then(function() {
this.waitFor(function() {
return url !== this.getCurrentUrl();
}, collectFollowers, processFollowers);
});
};
var processFollowers = function() {
this.echo("Total followers:" + Object.size(followers.followers));
this.each(Object.keys(followers.followers), function(casper, key) {
this.thenOpen('http://wowhead.com' + followers.followers[key]).then(function() {
this.echo("On http://wowhead.com" + followers.followers[key]);
this.evaluate(function() {
this.echo("Inside the evaluate statement.");
if ($('a[href=#quests]').length) {
this.echo("Has quest!");
$('a[href=#quests]').click();
var questURL = $('#tab-quests').show().find('.listview-cleartext').attr('href');
var questName = $('#tab-quests').show().find('.listview-cleartext').text();
this.echo("Quest URL: " + questURL);
followers.followers[key] = {"name": key, "quest": {"url": questURL, "name": questName}};
} else {
this.echo("Does not have quest!");
}
});
});
});
}
var terminate = function() {
this.echo("Done.").exit();
}
casper.start("http://wowhead.com/followers=2");
casper.waitForSelector(x('//*[text()="Next ›"]'), collectFollowers, processFollowers);
casper.run();
followers.js:
var require = patchRequire(require);
var utils = require('utils');
var followers = {};
exports.followers = followers;
followers 用于存储全局变量,这是我在抓取页面时不断构建和更新的对象。所以我浏览了 3 页数据,成功收集了链接,然后开始处理它们。就目前而言,CasperJS 似乎成功打开了每个页面,但是从未调用过评估函数。
我能够通过一些异步逻辑让这个功能在 PhantomJS 中工作,但切换到 casper,因为它看起来好像这会在幕后得到处理。我已经尝试了 thenOpen()、then() 和 open()、没有 then() 的 thenOpen() 等的各种组合。我搞砸了什么?
casper.evalute()
is the sandboxed page context in the same way the as the PhantomJS version (page.evaluate()
)。它无权访问外部定义的变量。
evaluate()
里面的this
指的是window
而不是casper
,我怀疑有没有像window.echo()
这样的函数。如果你想从页面上下文中接收控制台消息,你需要注册到 remote.message
事件:
casper.on("remote.message", function(msg){
this.echo("remote: " + msg);
});
您必须明确地将结果传递到页面上下文之外并将其添加到那里:
var result = this.evaluate(function() {
console.log("Inside the evaluate statement.");
if ($('a[href=#quests]').length) {
console.log("Has quest!");
$('a[href=#quests]').click();
var questURL = $('#tab-quests').show().find('.listview-cleartext').attr('href');
var questName = $('#tab-quests').show().find('.listview-cleartext').text();
console.log("Quest URL: " + questURL);
return {"url": questURL, "name": questName}};
} else {
console.log("Does not have quest!");
return null;
}
});
if (result) {
followers.followers[key] = {name: key, quest: result};
}
所以我正在抓取一个页面,收集链接,然后我想抓取这些链接以完成我的数据集。这是一些代码:
crawl.js:
var casper = require("casper").create({
waitTimeout: 3000,
pageSettings: {
userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:23.0) Gecko/20130404 Firefox/23.0"
},
clientScripts: ["includes/jquery.min.js"],
verbose: true
});
var followers = require('./followers');
var currentPage = 1;
var x = require('casper').selectXPath;
Object.size = function(obj) {
var size = 0, key;
for (key in obj) {
if (obj.hasOwnProperty(key)) size++
}
return size;
};
var collectFollowers = function() {
var url;
this.echo("capturing page " + currentPage);
this.capture("wowhead-p" + currentPage + ".png");
// don't go too far down the rabbit hole
if (currentPage >= 5 || !this.exists(x('//*[text()="Next ›"]'))) {
processFollowers.call(casper);
return terminate.call(casper);
}
currentPage++;
this.echo("requesting next page: " + currentPage);
url = this.getCurrentUrl();
var links = this.evaluate(function() {
var obj = {}
$('.listview-cleartext').map(function(){
obj[$(this).text()] = $(this).attr('href');
});
return obj;
});
for (key in links) {
followers.followers[key] = links[key];
}
this.echo("Page links: " + Object.size(followers.followers));
//this.emit('update.followers', links);
this.thenClick(x('//*[text()="Next ›"]')).then(function() {
this.waitFor(function() {
return url !== this.getCurrentUrl();
}, collectFollowers, processFollowers);
});
};
var processFollowers = function() {
this.echo("Total followers:" + Object.size(followers.followers));
this.each(Object.keys(followers.followers), function(casper, key) {
this.thenOpen('http://wowhead.com' + followers.followers[key]).then(function() {
this.echo("On http://wowhead.com" + followers.followers[key]);
this.evaluate(function() {
this.echo("Inside the evaluate statement.");
if ($('a[href=#quests]').length) {
this.echo("Has quest!");
$('a[href=#quests]').click();
var questURL = $('#tab-quests').show().find('.listview-cleartext').attr('href');
var questName = $('#tab-quests').show().find('.listview-cleartext').text();
this.echo("Quest URL: " + questURL);
followers.followers[key] = {"name": key, "quest": {"url": questURL, "name": questName}};
} else {
this.echo("Does not have quest!");
}
});
});
});
}
var terminate = function() {
this.echo("Done.").exit();
}
casper.start("http://wowhead.com/followers=2");
casper.waitForSelector(x('//*[text()="Next ›"]'), collectFollowers, processFollowers);
casper.run();
followers.js:
var require = patchRequire(require);
var utils = require('utils');
var followers = {};
exports.followers = followers;
followers 用于存储全局变量,这是我在抓取页面时不断构建和更新的对象。所以我浏览了 3 页数据,成功收集了链接,然后开始处理它们。就目前而言,CasperJS 似乎成功打开了每个页面,但是从未调用过评估函数。
我能够通过一些异步逻辑让这个功能在 PhantomJS 中工作,但切换到 casper,因为它看起来好像这会在幕后得到处理。我已经尝试了 thenOpen()、then() 和 open()、没有 then() 的 thenOpen() 等的各种组合。我搞砸了什么?
casper.evalute()
is the sandboxed page context in the same way the as the PhantomJS version (page.evaluate()
)。它无权访问外部定义的变量。
evaluate()
里面的this
指的是window
而不是casper
,我怀疑有没有像window.echo()
这样的函数。如果你想从页面上下文中接收控制台消息,你需要注册到 remote.message
事件:
casper.on("remote.message", function(msg){
this.echo("remote: " + msg);
});
您必须明确地将结果传递到页面上下文之外并将其添加到那里:
var result = this.evaluate(function() {
console.log("Inside the evaluate statement.");
if ($('a[href=#quests]').length) {
console.log("Has quest!");
$('a[href=#quests]').click();
var questURL = $('#tab-quests').show().find('.listview-cleartext').attr('href');
var questName = $('#tab-quests').show().find('.listview-cleartext').text();
console.log("Quest URL: " + questURL);
return {"url": questURL, "name": questName}};
} else {
console.log("Does not have quest!");
return null;
}
});
if (result) {
followers.followers[key] = {name: key, quest: result};
}