将多个抓取数据存储在一个数组中
Store multiple scraping data in an Array
第一部分:
目前我抓取一个网页来获取 links,然后打开每个抓取到的 link。这部分工作完美。
第二部分:
我检查页面上是否存在 selection(select 选项字段),并将各选项的值 (id) 存储在数组中。这部分也可以正常工作。
第三部分:
我想遍历它们(select 选项),对每个选项触发点击事件并等待 AJAX 响应,然后提取 infoProduct 并将其存储在数组中。
我在这部分遇到了问题:我得到了一个空数组,因为 return listProducts 在第一个 this.eachThen 开始之前就被调用了。
/**
 * Builds the product record for the page being inspected.
 *
 * Without a variation the record is wrapped in a one-element array and takes
 * its url from `obj`; with a variation a bare object is returned that takes
 * its url from the variation and also carries the variation's id and
 * description.
 *
 * @param {Object} obj - Page descriptor; only `obj.url` is read, and only when no variation is given.
 * @param {Object|boolean} variation - Falsy for "no variation"; otherwise `url`, `id` and `description` are read from it.
 * @returns {Array|Object} `[record]` when `variation` is falsy, otherwise the record itself.
 */
function getInfosProduct(obj, variation) {
    var record = {};

    // Shared by both shapes; getName() is called before getImage().
    record.name = getName();
    record.image = products.image.getElement.link + getImage();

    if (variation) {
        record.url = variation.url;
        record.idVariation = variation.id;
        record.descVariation = variation.description;
        return record;
    }

    record.url = obj.url;
    return [record];
}
/**
 * Selects a variation in the page when it is eligible.
 *
 * A variation is eligible when its `ok` flag is truthy and its `id` is not
 * (loosely) equal to its `ignore` value.
 *
 * @param {Object} variation - Reads `ok`, `id` and `ignore`.
 * @returns {boolean} true when chooseVariation was called, false otherwise.
 */
function clickVariation(variation) {
    // Loose comparison kept on purpose: id and ignore may differ in type.
    var eligible = variation.ok && variation.id != variation.ignore;

    if (!eligible) {
        return false;
    }

    chooseVariation(products.selector, variation.id);
    return true;
}
/**
 * Gathers product info for the page currently loaded in casper.
 *
 * When obj.level == 0, getInfosProduct is evaluated once with no variation
 * and its result is returned. Otherwise one step per entry of
 * obj.levelVariation is queued through eachThen and listProducts is returned.
 *
 * NOTE(review): eachThen only queues steps, so (as the surrounding discussion
 * points out) `return listProducts` executes before the first queued step has
 * run and the caller receives an empty array.
 *
 * @param {Object} obj - Reads `level` and `levelVariation`; the whole object is forwarded to getInfosProduct.
 * @returns {Array} Result of getInfosProduct for level 0, otherwise listProducts.
 */
casper.getInfosProducts = function(obj) {
if (obj.level == 0) {
return this.evaluate(getInfosProduct, obj,false);
} else {
// NOTE(review): assigned without `var`, so this is an implicit global.
listProducts = [];
this.eachThen(obj.levelVariation, function getInfosProducts(variation) {
// Also an implicit global (no `var`). Truthy when clickVariation selected
// the variation inside the page.
isClick = this.evaluate(clickVariation, variation.data)
if (isClick) {
// The wait callback only logs; nothing is collected inside it.
this.waitForSelectorTextChange('.selector', function() {
this.echo('The text on .selector has been changed.');
});
// NOTE(review): this push sits outside the wait callback, so it is not
// deferred until the text of '.selector' has changed.
listProducts.push(this.evaluate(getInfosProduct, obj,variation.data));
}
});
return listProducts;
}
};
在 select 元素上触发 change 事件的函数:
/**
 * Selects the first option of a <select> whose value contains `valueToMatch`
 * and fires a bubbling "change" event on that <select>.
 *
 * The selector may point at the <select> itself or at one of its <option>
 * elements; in the latter case the owning <select> is looked up.
 *
 * @param {string} selector - CSS selector of the <select> (or one of its options).
 * @param {string} valueToMatch - Substring searched for in each option's value.
 * @throws {Error} When the selector does not lead to a <select> element.
 */
function chooseVariation(selector, valueToMatch) {
    var select = document.querySelector(selector),
        options,
        evt,
        i;

    // Climb from a matched <option> (or any descendant) to its <select>.
    while (select && String(select.nodeName).toUpperCase() !== 'SELECT') {
        select = select.parentNode;
    }
    if (!select) {
        throw new Error('chooseVariation: no <select> found for selector "' + selector + '"');
    }

    // selectedIndex must be set on the <select> element itself; setting it on
    // a NodeList (as querySelectorAll returns) has no effect on the page.
    options = select.options;
    for (i = 0; i < options.length; i++) {
        if (options[i].value.indexOf(valueToMatch) !== -1) {
            select.selectedIndex = i;
            break;
        }
    }

    // dispatch change event in case there is some kind of validation
    evt = document.createEvent("UIEvents"); // or "HTMLEvents"
    evt.initUIEvent("change", true, true);
    select.dispatchEvent(evt);
}
这是主函数:
/**
 * Queues a casper step that performs the scraping stage named by obj.action.
 *
 * Stages visible in this excerpt:
 *  - "openLinks": opens every entry of `links` whose `ok` flag is set and
 *    starts the "getVariations" stage for it.
 *  - "getVariations": reads the variations of obj.url through
 *    this.getVariations and starts the "getInfosProducts" stage.
 *  - "getInfosProducts": calls this.getInfosProducts for each entry of
 *    obj.objVariations.list and echoes the result as JSON.
 * Further stages are elided from the excerpt.
 *
 * @param {Object} obj - Stage descriptor; `action` picks the stage, `url` and `objVariations` are read by the stages that need them.
 */
function startSraping(obj) {
casper.then(function switchAction() {
switch (obj.action) {
//......... some code ........
// iterating through array links and open pages
case "openLinks":
this.each(links, function eachOpenLinks(self, link) {
if (link.ok) {
self.thenOpen(link.url, function thenOpenLinks() {
startSraping({
url: link.url,
action: "getVariations"
});
});
}
});
break;
// get all variations for each page opened
case "getVariations":
// NOTE(review): assigned without `var`, so this is an implicit global.
objVariations = this.getVariations(obj.url);
startSraping({
url: obj.url,
action: "getInfosProducts",
objVariations: objVariations
});
break;
case "getInfosProducts":
this.eachThen(obj.objVariations.list, function(levelVariation) {
// NOTE(review): implicit global (no `var`); the return value is used
// synchronously here.
infosProd = this.getInfosProducts({
levelVariation: levelVariation.data,
url: obj.url,
level: obj.objVariations.level
});
// Here I got an empty array
this.echo(JSON.stringify(infosProd), 'INFO');
});
break;
}
});
}
// Entry point: open `url`, then start scraping with the "submitSearch" action
// (the handling of that action is not shown in this excerpt).
casper.start(url, function start() {
startSraping({
variation: variation,
action: "submitSearch"
});
});
// Run the queued steps.
casper.run();
如果一个函数需要以同步方式 return 异步函数的结果,就不能在它内部调用异步函数(eachThen 和 waitForSelectorTextChange 都是异步的)(general reference)。由于 CasperJS 不支持 Promises,这会更棘手一些。
我认为以下改动应该是最小的,并且能让您达到想要的效果。
/**
 * Collects product information for the current page and delivers it
 * asynchronously.
 *
 * All work is queued as casper steps, so nothing useful can be returned;
 * the collected list is handed to `callback` from a step that is queued
 * after the collecting steps.
 *
 * @param {Object} obj - `level` selects the mode; `levelVariation` is the list iterated when `level` is not 0; the whole object is forwarded to getInfosProduct.
 * @param {Function} callback - Called with `this` set to the casper instance and one argument: the array of product infos.
 */
casper.getInfosProducts = function(obj, callback) {
    if (obj.level == 0) {
        this.then(function() {
            // getInfosProduct already yields a one-element array when no
            // variation is given, so it is passed through unchanged.
            callback.call(this, this.evaluate(getInfosProduct, obj, false));
        });
    } else {
        var listProducts = [];
        this.eachThen(obj.levelVariation, function getInfosProducts(variation) {
            var isClick = this.evaluate(clickVariation, variation.data);
            if (isClick) {
                // Read the product only after the page reacted to the selection.
                this.waitForSelectorTextChange('.selector', function() {
                    this.echo('The text on .selector has been changed.');
                    listProducts.push(this.evaluate(getInfosProduct, obj, variation.data));
                });
            }
        });
        // Queued after the eachThen steps, so the list is complete by now.
        this.then(function() {
            callback.call(this, listProducts);
        });
    }
};
在 startSraping 中:
// Replacement for the "getInfosProducts" case of the switch in startSraping:
// the product list is consumed in a callback instead of a return value.
case "getInfosProducts":
this.eachThen(obj.objVariations.list, function(levelVariation) {
this.getInfosProducts({
levelVariation: levelVariation.data,
url: obj.url,
level: obj.objVariations.level
}, function (infosProd){
// this is the asynchronous callback
this.echo(JSON.stringify(infosProd), 'INFO');
});
});
break;
第一部分:
目前我抓取一个网页来获取 links,然后打开每个抓取到的 link。这部分工作完美。
第二部分:
我检查页面上是否存在 selection(select 选项字段),并将各选项的值 (id) 存储在数组中。这部分也可以正常工作。
第三部分:
我想遍历它们(select 选项),对每个选项触发点击事件并等待 AJAX 响应,然后提取 infoProduct 并将其存储在数组中。
我在这部分遇到了问题:我得到了一个空数组,因为 return listProducts 在第一个 this.eachThen 开始之前就被调用了。
/**
 * Builds the product record for the page being inspected.
 *
 * Without a variation the record is wrapped in a one-element array and takes
 * its url from `obj`; with a variation a bare object is returned that takes
 * its url from the variation and also carries the variation's id and
 * description.
 *
 * @param {Object} obj - Page descriptor; only `obj.url` is read, and only when no variation is given.
 * @param {Object|boolean} variation - Falsy for "no variation"; otherwise `url`, `id` and `description` are read from it.
 * @returns {Array|Object} `[record]` when `variation` is falsy, otherwise the record itself.
 */
function getInfosProduct(obj, variation) {
    var record = {};

    // Shared by both shapes; getName() is called before getImage().
    record.name = getName();
    record.image = products.image.getElement.link + getImage();

    if (variation) {
        record.url = variation.url;
        record.idVariation = variation.id;
        record.descVariation = variation.description;
        return record;
    }

    record.url = obj.url;
    return [record];
}
/**
 * Selects a variation in the page when it is eligible.
 *
 * A variation is eligible when its `ok` flag is truthy and its `id` is not
 * (loosely) equal to its `ignore` value.
 *
 * @param {Object} variation - Reads `ok`, `id` and `ignore`.
 * @returns {boolean} true when chooseVariation was called, false otherwise.
 */
function clickVariation(variation) {
    // Loose comparison kept on purpose: id and ignore may differ in type.
    var eligible = variation.ok && variation.id != variation.ignore;

    if (!eligible) {
        return false;
    }

    chooseVariation(products.selector, variation.id);
    return true;
}
/**
 * Gathers product info for the page currently loaded in casper.
 *
 * When obj.level == 0, getInfosProduct is evaluated once with no variation
 * and its result is returned. Otherwise one step per entry of
 * obj.levelVariation is queued through eachThen and listProducts is returned.
 *
 * NOTE(review): eachThen only queues steps, so (as the surrounding discussion
 * points out) `return listProducts` executes before the first queued step has
 * run and the caller receives an empty array.
 *
 * @param {Object} obj - Reads `level` and `levelVariation`; the whole object is forwarded to getInfosProduct.
 * @returns {Array} Result of getInfosProduct for level 0, otherwise listProducts.
 */
casper.getInfosProducts = function(obj) {
if (obj.level == 0) {
return this.evaluate(getInfosProduct, obj,false);
} else {
// NOTE(review): assigned without `var`, so this is an implicit global.
listProducts = [];
this.eachThen(obj.levelVariation, function getInfosProducts(variation) {
// Also an implicit global (no `var`). Truthy when clickVariation selected
// the variation inside the page.
isClick = this.evaluate(clickVariation, variation.data)
if (isClick) {
// The wait callback only logs; nothing is collected inside it.
this.waitForSelectorTextChange('.selector', function() {
this.echo('The text on .selector has been changed.');
});
// NOTE(review): this push sits outside the wait callback, so it is not
// deferred until the text of '.selector' has changed.
listProducts.push(this.evaluate(getInfosProduct, obj,variation.data));
}
});
return listProducts;
}
};
在 select 元素上触发 change 事件的函数:
/**
 * Selects the first option of a <select> whose value contains `valueToMatch`
 * and fires a bubbling "change" event on that <select>.
 *
 * The selector may point at the <select> itself or at one of its <option>
 * elements; in the latter case the owning <select> is looked up.
 *
 * @param {string} selector - CSS selector of the <select> (or one of its options).
 * @param {string} valueToMatch - Substring searched for in each option's value.
 * @throws {Error} When the selector does not lead to a <select> element.
 */
function chooseVariation(selector, valueToMatch) {
    var select = document.querySelector(selector),
        options,
        evt,
        i;

    // Climb from a matched <option> (or any descendant) to its <select>.
    while (select && String(select.nodeName).toUpperCase() !== 'SELECT') {
        select = select.parentNode;
    }
    if (!select) {
        throw new Error('chooseVariation: no <select> found for selector "' + selector + '"');
    }

    // selectedIndex must be set on the <select> element itself; setting it on
    // a NodeList (as querySelectorAll returns) has no effect on the page.
    options = select.options;
    for (i = 0; i < options.length; i++) {
        if (options[i].value.indexOf(valueToMatch) !== -1) {
            select.selectedIndex = i;
            break;
        }
    }

    // dispatch change event in case there is some kind of validation
    evt = document.createEvent("UIEvents"); // or "HTMLEvents"
    evt.initUIEvent("change", true, true);
    select.dispatchEvent(evt);
}
这是主函数:
/**
 * Queues a casper step that performs the scraping stage named by obj.action.
 *
 * Stages visible in this excerpt:
 *  - "openLinks": opens every entry of `links` whose `ok` flag is set and
 *    starts the "getVariations" stage for it.
 *  - "getVariations": reads the variations of obj.url through
 *    this.getVariations and starts the "getInfosProducts" stage.
 *  - "getInfosProducts": calls this.getInfosProducts for each entry of
 *    obj.objVariations.list and echoes the result as JSON.
 * Further stages are elided from the excerpt.
 *
 * @param {Object} obj - Stage descriptor; `action` picks the stage, `url` and `objVariations` are read by the stages that need them.
 */
function startSraping(obj) {
casper.then(function switchAction() {
switch (obj.action) {
//......... some code ........
// iterating through array links and open pages
case "openLinks":
this.each(links, function eachOpenLinks(self, link) {
if (link.ok) {
self.thenOpen(link.url, function thenOpenLinks() {
startSraping({
url: link.url,
action: "getVariations"
});
});
}
});
break;
// get all variations for each page opened
case "getVariations":
// NOTE(review): assigned without `var`, so this is an implicit global.
objVariations = this.getVariations(obj.url);
startSraping({
url: obj.url,
action: "getInfosProducts",
objVariations: objVariations
});
break;
case "getInfosProducts":
this.eachThen(obj.objVariations.list, function(levelVariation) {
// NOTE(review): implicit global (no `var`); the return value is used
// synchronously here.
infosProd = this.getInfosProducts({
levelVariation: levelVariation.data,
url: obj.url,
level: obj.objVariations.level
});
// Here I got an empty array
this.echo(JSON.stringify(infosProd), 'INFO');
});
break;
}
});
}
// Entry point: open `url`, then start scraping with the "submitSearch" action
// (the handling of that action is not shown in this excerpt).
casper.start(url, function start() {
startSraping({
variation: variation,
action: "submitSearch"
});
});
// Run the queued steps.
casper.run();
如果一个函数需要以同步方式 return 异步函数的结果,就不能在它内部调用异步函数(eachThen 和 waitForSelectorTextChange 都是异步的)(general reference)。由于 CasperJS 不支持 Promises,这会更棘手一些。
我认为以下改动应该是最小的,并且能让您达到想要的效果。
/**
 * Collects product information for the current page and delivers it
 * asynchronously.
 *
 * All work is queued as casper steps, so nothing useful can be returned;
 * the collected list is handed to `callback` from a step that is queued
 * after the collecting steps.
 *
 * @param {Object} obj - `level` selects the mode; `levelVariation` is the list iterated when `level` is not 0; the whole object is forwarded to getInfosProduct.
 * @param {Function} callback - Called with `this` set to the casper instance and one argument: the array of product infos.
 */
casper.getInfosProducts = function(obj, callback) {
    if (obj.level == 0) {
        this.then(function() {
            // getInfosProduct already yields a one-element array when no
            // variation is given, so it is passed through unchanged.
            callback.call(this, this.evaluate(getInfosProduct, obj, false));
        });
    } else {
        var listProducts = [];
        this.eachThen(obj.levelVariation, function getInfosProducts(variation) {
            var isClick = this.evaluate(clickVariation, variation.data);
            if (isClick) {
                // Read the product only after the page reacted to the selection.
                this.waitForSelectorTextChange('.selector', function() {
                    this.echo('The text on .selector has been changed.');
                    listProducts.push(this.evaluate(getInfosProduct, obj, variation.data));
                });
            }
        });
        // Queued after the eachThen steps, so the list is complete by now.
        this.then(function() {
            callback.call(this, listProducts);
        });
    }
};
在 startSraping 中:
// Replacement for the "getInfosProducts" case of the switch in startSraping:
// the product list is consumed in a callback instead of a return value.
case "getInfosProducts":
this.eachThen(obj.objVariations.list, function(levelVariation) {
this.getInfosProducts({
levelVariation: levelVariation.data,
url: obj.url,
level: obj.objVariations.level
}, function (infosProd){
// this is the asynchronous callback
this.echo(JSON.stringify(infosProd), 'INFO');
});
});
break;