在 NodeJS 中使用嵌套回调时遇到问题
Trouble using nested callbacks in NodeJS
我正在编写一个程序,用于抓取网站的链接,然后抓取这些链接以获取信息。为了抓取网站,必须先登录。所以顺序是:登录 -> 抓取链接索引 -> 抓取信息链接
登录函数的回调打印出的结果是空的:{ results: [], hasMore: true },所以我的代码肯定有问题(抓取部分本身可以正常工作):
// HTTP client configured with `jar: true` — necessary for persistent login.
var request = require('request').defaults({jar: true});
var cheerio = require('cheerio');

// Torrent index page to scrape, and the endpoint the login form is posted to.
var url1 = "https://example.org/torrents/browse/index/";
var loginUrl = "https://example.org/user/account/login/";

// Form fields sent with the login request.
var credentials = {
  username: 'user1',
  password: 'passpass'
};

// Entry point: log in, run the scrape chain, and print whatever it reports.
login(function (scraped) {
  console.log(scraped);
});
/**
 * Posts the module-level `credentials` to `loginUrl` as a URL-encoded form and,
 * when the request succeeds, scrapes the torrent index at `url1`.
 *
 * @param {function(Object)} callback Receives the result produced by scrapeTorrents().
 *   It is not called when the login request fails; the failure is only logged.
 */
function login(callback) {
  var postOptions = {
    uri: loginUrl,
    headers: { 'content-type': 'application/x-www-form-urlencoded' },
    body: require('querystring').stringify(credentials)
  };

  function onLoginResponse(err, res, body) {
    if (!err) {
      scrapeTorrents(url1, function (scraped) {
        callback(scraped);
      });
      return;
    }
    console.log("Login error");
  }

  request.post(postOptions, onLoginResponse);
}
/**
 * Fetches the torrent index page at `url`, collects the detail-page link found
 * in each `span.title` element, and hands the links to scrapeTorrentDetails().
 *
 * @param {string} url Address of the index page.
 * @param {function(Object)} callback Receives the result reported by
 *   scrapeTorrentDetails(). Not called when the index request fails; the
 *   failure is only logged.
 */
function scrapeTorrents(url, callback) {
  request(url, function (err, res, body) {
    if (err) {
      console.log("Main scrape error");
      return;
    }
    var links = [];
    var $ = cheerio.load(body);
    $('span.title').each(function () {
      // The link is the href of the span's first child element.
      var detailsUrl = $(this).children().eq(0).attr('href');
      // attr() gives undefined when there is no such child or no href; skipping
      // it avoids requesting "https://example.orgundefined" later on.
      if (detailsUrl) {
        links.push(detailsUrl);
      }
    });
    scrapeTorrentDetails(links, function (result) {
      callback(result);
    });
  });
}
// Requests every detail page in `links`, builds one movie object per page and
// reports `{ results, hasMore }` through `callback`.
// NOTE(review): BUG — the request() handlers run asynchronously, but `callback`
// is invoked right after the loop that starts them, so `results` is still empty
// when it is handed over. This is the defect the question is about.
function scrapeTorrentDetails(links, callback) {
var results = [];
// Fetches one detail page and appends the parsed movie to `results`.
// It gives the caller no signal when it has finished.
function getDetails(url) {
request(url, function(err, res, body) {
if(err) {
console.log("Detail scrape error");
return;
}
console.log("Scraping: " + url);
var $ = cheerio.load(body);
// Every field below is read from a fixed position in the page's <td> list.
var tds = $('td');
var title = $(tds).get(1).firstChild.data;
var hash = $(tds).get(3).firstChild.data.trim();
var size = $(tds).get(9).firstChild.data;
// console.log(tds.length);
// NOTE(review): presumably pages with more than 23 cells carry the extended
// movie info; cells 27, 31 and 33 are read without a further length check.
if (tds.length > 23) {
var rlsDate = $(tds).get(23).firstChild.data || '';;
var genres = $(tds).get(27).firstChild.data || '';;
var runtime = $(tds).get(31).firstChild.data || '';;
// `plot` stays undefined when cell 33 has no first child.
if ( $(tds).get(33).firstChild != null) {
var plot = $(tds).get(33).firstChild.data || '';;
}
var rating = $('#imdb_rating').parent().next().text() || '';; // of 10
var imdb_id = $('[name=imdbID]').get(0).attribs.value || '';;
var cover = $('#cover').children().eq(0).get(0).attribs.href || '';;
var thumb = $('[alt=Cover]').get(0).attribs.src || '';;
// NOTE(review): `cover` was assigned `... || ''` above, so it is never
// undefined here and this fallback does not run.
if (typeof cover == 'undefined') {
cover = thumb;
}
} else {
// Short pages: every extended field gets the "notfound" placeholder.
var rlsDate = "notfound";
var genres = "notfound";
var runtime = "notfound";
var plot = "notfound";
var rating = "notfound"; // of 10
var imdb_id = "notfound";
var cover = "notfound";
var thumb = "notfound";
}
var movie = {
type: 'movie',
imdb_id: imdb_id,
title: title,
year: rlsDate,
genre: genres,
rating: rating,
runtime: runtime,
image: thumb,
cover: cover,
synopsis: plot,
torrents: {
magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
filesize: size
}
};
results.push(movie);
});
}
// Starts one request per link; none of them has completed when the loop ends.
for (var i=0; i<links.length; i++){
getDetails("https://example.org" + links[i]);
}
// Runs immediately after the loop, before any response handler has pushed a movie.
callback( {
results: results,
hasMore: true
});
}
也许使用 Q 的 promise 会更好。我该如何在上面的代码中使用它?
如果您想知道代码的用途,我打算修改 Popcorn-time 以使用另一个 torrent-tracker(没有 API)。
谢谢
主要问题在于此代码:
// Each getDetails() call only starts an asynchronous request.
for (var i=0; i<links.length; i++){
getDetails("https://example.org" + links[i]);
}
// This runs right after the loop, before any request has finished,
// so `results` is still empty at this point.
callback( {
results: results,
hasMore: true
});
getDetails()
是异步的,但你只是连续调用了它 links.length
次就继续往下执行——好像这些调用都已经完成了一样。因此,getDetails()
中的请求没有一个会在您调用回调并传递结果之前完成。此时结果还没有被填入,所以数组是空的。
您代码中的其他地方都按需要正确使用了嵌套回调,唯独在这里出了疏漏。在带着结果调用最终回调之前,您需要知道所有 getDetails()
调用何时完成。
此外,您还必须决定是否可以同时调用所有 getDetails()
调用(让所有请求并发进行),还是您真正想要的是调用一个、等它完成、再调用下一个,依此类推……目前您是一次性发出全部请求,只要目标服务器不反对同时收到这么多请求,这样做是可行的。
有几种潜在的策略可以解决这个问题。
向 getDetails()
添加回调,然后统计从 getDetails()
收到了多少次回调;只有收到 links.length
次回调之后,才调用最终的回调。
将getDetails()
更改为返回(return)一个 promise。然后你可以使用 links.map(getDetails)
之类的写法来创建一个 promise 数组,再用 Promise.all()
来知道它们何时完成。
就我个人而言,我会更改您的所有代码以使用 promises,并且我会使用 Bluebird promises 库,因为它具有 Promise.map()
等额外功能,使这更简单。
这是一个向 getDetails()
添加回调的修复程序,然后计算完成的次数:
/**
 * Scrapes every torrent detail page in `links` and invokes `callback` exactly
 * once, after all requests have finished, with `{ results, hasMore }`.
 *
 * Fixes in this revision:
 *  - an empty `links` array used to leave `callback` uncalled forever;
 *  - the cover fallback tested `typeof cover == 'undefined'`, which could never
 *    be true after `|| ''`, so an empty cover never fell back to the thumbnail;
 *  - `synopsis` was `undefined` when the plot cell had no text node.
 *
 * @param {string[]} links Site-relative paths, appended to "https://example.org".
 * @param {function(Object)} callback Receives `{ results: Object[], hasMore: true }`.
 *   Movies appear in the order their requests complete; pages whose request
 *   failed are logged and left out of `results`.
 */
function scrapeTorrentDetails(links, callback) {
  var results = [];
  var doneCnt = 0;

  // Hands the collected movies to the caller.
  function finish() {
    callback({
      results: results,
      hasMore: true
    });
  }

  // Counts one finished request (successful or failed) and finishes after the last one.
  function onDetailDone() {
    ++doneCnt;
    if (doneCnt === links.length) {
      finish();
    }
  }

  // Fetches one detail page, appends the parsed movie to `results`, then calls
  // `done` (with the error when the request failed).
  function getDetails(url, done) {
    request(url, function (err, res, body) {
      if (err) {
        console.log("Detail scrape error");
        done(err);
        return;
      }
      console.log("Scraping: " + url);
      var $ = cheerio.load(body);
      // Every field is read from a fixed position in the page's <td> list.
      var tds = $('td');
      var title = tds.get(1).firstChild.data;
      var hash = tds.get(3).firstChild.data.trim();
      var size = tds.get(9).firstChild.data;

      // Placeholders used when the page has no extended movie info.
      var rlsDate = "notfound";
      var genres = "notfound";
      var runtime = "notfound";
      var plot = "notfound";
      var rating = "notfound"; // of 10
      var imdb_id = "notfound";
      var cover = "notfound";
      var thumb = "notfound";

      // NOTE(review): cells 27, 31 and 33 are read under a `> 23` guard, as in
      // the original; confirm against the real page layout.
      if (tds.length > 23) {
        rlsDate = tds.get(23).firstChild.data || '';
        genres = tds.get(27).firstChild.data || '';
        runtime = tds.get(31).firstChild.data || '';
        // An empty plot cell has no text node; keep the placeholder in that case.
        var plotNode = tds.get(33).firstChild;
        if (plotNode != null) {
          plot = plotNode.data || '';
        }
        rating = $('#imdb_rating').parent().next().text() || ''; // of 10
        imdb_id = $('[name=imdbID]').get(0).attribs.value || '';
        cover = $('#cover').children().eq(0).get(0).attribs.href || '';
        thumb = $('[alt=Cover]').get(0).attribs.src || '';
        // Fall back to the thumbnail when the cover link is empty.
        if (!cover) {
          cover = thumb;
        }
      }

      results.push({
        type: 'movie',
        imdb_id: imdb_id,
        title: title,
        year: rlsDate,
        genre: genres,
        rating: rating,
        runtime: runtime,
        image: thumb,
        cover: cover,
        synopsis: plot,
        torrents: {
          magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
          filesize: size
        }
      });
      done();
    });
  }

  // With nothing to fetch no request would ever complete, so report the empty
  // result directly. nextTick keeps the callback asynchronous, as it is in the
  // non-empty case.
  if (links.length === 0) {
    process.nextTick(finish);
    return;
  }

  // All requests are started at once; onDetailDone fires the callback after the last.
  for (var i = 0; i < links.length; i++) {
    getDetails("https://example.org" + links[i], onDetailDone);
  }
}
以下是重写为使用绑定、自定义 this 对象和尚未完成的请求计数的给定示例代码(我认为 promises 掩盖了执行路径)。
回调返回空数组的原因似乎是:文档中没有匹配 span.title 选择器的 span 元素,因此没有触发任何进一步的请求。
// Scraper written with bound callbacks: a single context object (`this`)
// carries the caller's callback, the movies collected so far and the number of
// detail requests still outstanding.
//
// Fixes in this revision:
//  - fnc logged an undeclared `url` (ReferenceError on every detail response);
//    the URL is now bound as fnc's first argument;
//  - a failed detail request never decremented `resultCount`, so the final
//    callback could never fire; failures are now counted as finished;
//  - the cover fallback tested `typeof cover == 'undefined'`, which was never
//    true after `|| ''`.
var request = require('request').defaults({
  jar: true
}); // necessary for persistent login
var cheerio = require('cheerio');
var process = require('process');
var url1 = "https://example.org/torrents/browse/index/";
var loginUrl = "https://example.org/user/account/login/";

/**
 * Posts the login form; on success the chain fna -> fnb -> fnc runs and
 * `callback` finally receives `{ results, hasMore }`.
 * @param {function(Object)} callback Stored on the shared context object.
 */
var login = function(callback) {
  request.post({
    uri: loginUrl,
    headers: {
      'content-type': 'application/x-www-form-urlencoded'
    },
    body: require('querystring').stringify({
      username: 'user1',
      password: 'passpass'
    })
  }, fna.bind({
    callback: callback
  }));
};

// Login response handler: on success, requests the torrent index page.
var fna = function(err, res, body) {
  if (err) {
    console.log("Login error");
    return;
  }
  request(url1, fnb.bind(this));
};

// Index response handler: collects the detail links and starts one request per link.
var fnb = function(err, res, body) {
  if (err) {
    console.log("Main scrape error");
    return;
  }
  var $ = cheerio.load(body);
  var links = [];
  $('span.title').each(function() {
    links.push($(this).children().first().attr('href'));
  });
  this.results = [];
  this.resultCount = links.length;
  if (this.resultCount === 0) {
    // Nothing to fetch: report the empty result asynchronously.
    process.nextTick(fne.bind(this));
    return;
  }
  for (var i = 0; i < links.length; i++) {
    var detailUrl = "https://example.org" + links[i];
    // Bind the URL so the response handler can log which page it is parsing.
    request(detailUrl, fnc.bind(this, detailUrl));
  }
};

// Detail response handler: parses one page (fields are read from fixed
// positions in the <td> list), stores the movie, and reports once the last
// outstanding request has finished.
var fnc = function(url, err, res, body) {
  if (err) {
    console.log("Detail scrape error");
  } else {
    console.log("Scraping: " + url);
    var $ = cheerio.load(body);
    var tds = $('td');
    var title = $(tds).get(1).firstChild.data;
    var hash = $(tds).get(3).firstChild.data.trim();
    var size = $(tds).get(9).firstChild.data;
    // Placeholders used when the page has no extended movie info.
    var rlsDate = "notfound";
    var genres = "notfound";
    var runtime = "notfound";
    var plot = "notfound";
    var rating = "notfound"; // of 10
    var imdb_id = "notfound";
    var cover = "notfound";
    var thumb = "notfound";
    if (tds.length > 23) {
      rlsDate = $(tds).get(23).firstChild.data || '';
      genres = $(tds).get(27).firstChild.data || '';
      runtime = $(tds).get(31).firstChild.data || '';
      if ($(tds).get(33).firstChild != null) {
        plot = $(tds).get(33).firstChild.data || '';
      }
      rating = $('#imdb_rating').parent().next().text() || ''; // of 10
      imdb_id = $('[name=imdbID]').get(0).attribs.value || '';
      cover = $('#cover').children().eq(0).get(0).attribs.href || '';
      thumb = $('[alt=Cover]').get(0).attribs.src || '';
      // Fall back to the thumbnail when the cover link is empty.
      if (!cover) {
        cover = thumb;
      }
    }
    this.results.push({
      type: 'movie',
      imdb_id: imdb_id,
      title: title,
      year: rlsDate,
      genre: genres,
      rating: rating,
      runtime: runtime,
      image: thumb,
      cover: cover,
      synopsis: plot,
      torrents: {
        magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
        filesize: size
      }
    });
  }
  // Failed requests count as finished too; otherwise the callback would never fire.
  this.resultCount--;
  if (this.resultCount === 0) {
    fne.call(this);
  }
};

// Delivers the collected results to the caller's callback.
var fne = function() {
  this.callback({
    results: this.results,
    hasMore: true
  });
};

login(function(result) {
  console.log(result);
});
我正在编写一个程序,用于抓取网站的链接,然后抓取这些链接以获取信息。为了抓取网站,必须先登录。所以顺序是:登录 -> 抓取链接索引 -> 抓取信息链接
登录函数的回调打印出的结果是空的:{ results: [], hasMore: true },所以我的代码肯定有问题(抓取部分本身可以正常工作):
// HTTP client configured with `jar: true` — necessary for persistent login.
var request = require('request').defaults({jar: true});
var cheerio = require('cheerio');

// Torrent index page to scrape, and the endpoint the login form is posted to.
var url1 = "https://example.org/torrents/browse/index/";
var loginUrl = "https://example.org/user/account/login/";

// Form fields sent with the login request.
var credentials = {
  username: 'user1',
  password: 'passpass'
};

// Entry point: log in, run the scrape chain, and print whatever it reports.
login(function (scraped) {
  console.log(scraped);
});
/**
 * Posts the module-level `credentials` to `loginUrl` as a URL-encoded form and,
 * when the request succeeds, scrapes the torrent index at `url1`.
 *
 * @param {function(Object)} callback Receives the result produced by scrapeTorrents().
 *   It is not called when the login request fails; the failure is only logged.
 */
function login(callback) {
  var postOptions = {
    uri: loginUrl,
    headers: { 'content-type': 'application/x-www-form-urlencoded' },
    body: require('querystring').stringify(credentials)
  };

  function onLoginResponse(err, res, body) {
    if (!err) {
      scrapeTorrents(url1, function (scraped) {
        callback(scraped);
      });
      return;
    }
    console.log("Login error");
  }

  request.post(postOptions, onLoginResponse);
}
/**
 * Fetches the torrent index page at `url`, collects the detail-page link found
 * in each `span.title` element, and hands the links to scrapeTorrentDetails().
 *
 * @param {string} url Address of the index page.
 * @param {function(Object)} callback Receives the result reported by
 *   scrapeTorrentDetails(). Not called when the index request fails; the
 *   failure is only logged.
 */
function scrapeTorrents(url, callback) {
  request(url, function (err, res, body) {
    if (err) {
      console.log("Main scrape error");
      return;
    }
    var links = [];
    var $ = cheerio.load(body);
    $('span.title').each(function () {
      // The link is the href of the span's first child element.
      var detailsUrl = $(this).children().eq(0).attr('href');
      // attr() gives undefined when there is no such child or no href; skipping
      // it avoids requesting "https://example.orgundefined" later on.
      if (detailsUrl) {
        links.push(detailsUrl);
      }
    });
    scrapeTorrentDetails(links, function (result) {
      callback(result);
    });
  });
}
// Requests every detail page in `links`, builds one movie object per page and
// reports `{ results, hasMore }` through `callback`.
// NOTE(review): BUG — the request() handlers run asynchronously, but `callback`
// is invoked right after the loop that starts them, so `results` is still empty
// when it is handed over. This is the defect the question is about.
function scrapeTorrentDetails(links, callback) {
var results = [];
// Fetches one detail page and appends the parsed movie to `results`.
// It gives the caller no signal when it has finished.
function getDetails(url) {
request(url, function(err, res, body) {
if(err) {
console.log("Detail scrape error");
return;
}
console.log("Scraping: " + url);
var $ = cheerio.load(body);
// Every field below is read from a fixed position in the page's <td> list.
var tds = $('td');
var title = $(tds).get(1).firstChild.data;
var hash = $(tds).get(3).firstChild.data.trim();
var size = $(tds).get(9).firstChild.data;
// console.log(tds.length);
// NOTE(review): presumably pages with more than 23 cells carry the extended
// movie info; cells 27, 31 and 33 are read without a further length check.
if (tds.length > 23) {
var rlsDate = $(tds).get(23).firstChild.data || '';;
var genres = $(tds).get(27).firstChild.data || '';;
var runtime = $(tds).get(31).firstChild.data || '';;
// `plot` stays undefined when cell 33 has no first child.
if ( $(tds).get(33).firstChild != null) {
var plot = $(tds).get(33).firstChild.data || '';;
}
var rating = $('#imdb_rating').parent().next().text() || '';; // of 10
var imdb_id = $('[name=imdbID]').get(0).attribs.value || '';;
var cover = $('#cover').children().eq(0).get(0).attribs.href || '';;
var thumb = $('[alt=Cover]').get(0).attribs.src || '';;
// NOTE(review): `cover` was assigned `... || ''` above, so it is never
// undefined here and this fallback does not run.
if (typeof cover == 'undefined') {
cover = thumb;
}
} else {
// Short pages: every extended field gets the "notfound" placeholder.
var rlsDate = "notfound";
var genres = "notfound";
var runtime = "notfound";
var plot = "notfound";
var rating = "notfound"; // of 10
var imdb_id = "notfound";
var cover = "notfound";
var thumb = "notfound";
}
var movie = {
type: 'movie',
imdb_id: imdb_id,
title: title,
year: rlsDate,
genre: genres,
rating: rating,
runtime: runtime,
image: thumb,
cover: cover,
synopsis: plot,
torrents: {
magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
filesize: size
}
};
results.push(movie);
});
}
// Starts one request per link; none of them has completed when the loop ends.
for (var i=0; i<links.length; i++){
getDetails("https://example.org" + links[i]);
}
// Runs immediately after the loop, before any response handler has pushed a movie.
callback( {
results: results,
hasMore: true
});
}
也许使用 Q 的 promise 会更好。我该如何在上面的代码中使用它?
如果您想知道代码的用途,我打算修改 Popcorn-time 以使用另一个 torrent-tracker(没有 API)。
谢谢
主要问题在于此代码:
// Each getDetails() call only starts an asynchronous request.
for (var i=0; i<links.length; i++){
getDetails("https://example.org" + links[i]);
}
// This runs right after the loop, before any request has finished,
// so `results` is still empty at this point.
callback( {
results: results,
hasMore: true
});
getDetails()
是异步的,但你只是连续调用了它 links.length
次就继续往下执行——好像这些调用都已经完成了一样。因此,getDetails()
中的请求没有一个会在您调用回调并传递结果之前完成。此时结果还没有被填入,所以数组是空的。
您代码中的其他地方都按需要正确使用了嵌套回调,唯独在这里出了疏漏。在带着结果调用最终回调之前,您需要知道所有 getDetails()
调用何时完成。
此外,您还必须决定是否可以同时调用所有 getDetails()
调用(让所有请求并发进行),还是您真正想要的是调用一个、等它完成、再调用下一个,依此类推……目前您是一次性发出全部请求,只要目标服务器不反对同时收到这么多请求,这样做是可行的。
有几种潜在的策略可以解决这个问题。
1. 向
getDetails()
添加回调,然后统计从 getDetails()
收到了多少次回调;只有收到 links.length
次回调之后,才调用最终的回调。
2. 将
getDetails()
更改为返回(return)一个 promise。然后你可以使用 links.map(getDetails)
之类的写法来创建一个 promise 数组,再用 Promise.all()
来知道它们何时全部完成。
就我个人而言,我会更改您的所有代码以使用 promises,并且我会使用 Bluebird promises 库,因为它具有 Promise.map()
等额外功能,使这更简单。
这是一个向 getDetails()
添加回调的修复程序,然后计算完成的次数:
/**
 * Scrapes every torrent detail page in `links` and invokes `callback` exactly
 * once, after all requests have finished, with `{ results, hasMore }`.
 *
 * Fixes in this revision:
 *  - an empty `links` array used to leave `callback` uncalled forever;
 *  - the cover fallback tested `typeof cover == 'undefined'`, which could never
 *    be true after `|| ''`, so an empty cover never fell back to the thumbnail;
 *  - `synopsis` was `undefined` when the plot cell had no text node.
 *
 * @param {string[]} links Site-relative paths, appended to "https://example.org".
 * @param {function(Object)} callback Receives `{ results: Object[], hasMore: true }`.
 *   Movies appear in the order their requests complete; pages whose request
 *   failed are logged and left out of `results`.
 */
function scrapeTorrentDetails(links, callback) {
  var results = [];
  var doneCnt = 0;

  // Hands the collected movies to the caller.
  function finish() {
    callback({
      results: results,
      hasMore: true
    });
  }

  // Counts one finished request (successful or failed) and finishes after the last one.
  function onDetailDone() {
    ++doneCnt;
    if (doneCnt === links.length) {
      finish();
    }
  }

  // Fetches one detail page, appends the parsed movie to `results`, then calls
  // `done` (with the error when the request failed).
  function getDetails(url, done) {
    request(url, function (err, res, body) {
      if (err) {
        console.log("Detail scrape error");
        done(err);
        return;
      }
      console.log("Scraping: " + url);
      var $ = cheerio.load(body);
      // Every field is read from a fixed position in the page's <td> list.
      var tds = $('td');
      var title = tds.get(1).firstChild.data;
      var hash = tds.get(3).firstChild.data.trim();
      var size = tds.get(9).firstChild.data;

      // Placeholders used when the page has no extended movie info.
      var rlsDate = "notfound";
      var genres = "notfound";
      var runtime = "notfound";
      var plot = "notfound";
      var rating = "notfound"; // of 10
      var imdb_id = "notfound";
      var cover = "notfound";
      var thumb = "notfound";

      // NOTE(review): cells 27, 31 and 33 are read under a `> 23` guard, as in
      // the original; confirm against the real page layout.
      if (tds.length > 23) {
        rlsDate = tds.get(23).firstChild.data || '';
        genres = tds.get(27).firstChild.data || '';
        runtime = tds.get(31).firstChild.data || '';
        // An empty plot cell has no text node; keep the placeholder in that case.
        var plotNode = tds.get(33).firstChild;
        if (plotNode != null) {
          plot = plotNode.data || '';
        }
        rating = $('#imdb_rating').parent().next().text() || ''; // of 10
        imdb_id = $('[name=imdbID]').get(0).attribs.value || '';
        cover = $('#cover').children().eq(0).get(0).attribs.href || '';
        thumb = $('[alt=Cover]').get(0).attribs.src || '';
        // Fall back to the thumbnail when the cover link is empty.
        if (!cover) {
          cover = thumb;
        }
      }

      results.push({
        type: 'movie',
        imdb_id: imdb_id,
        title: title,
        year: rlsDate,
        genre: genres,
        rating: rating,
        runtime: runtime,
        image: thumb,
        cover: cover,
        synopsis: plot,
        torrents: {
          magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
          filesize: size
        }
      });
      done();
    });
  }

  // With nothing to fetch no request would ever complete, so report the empty
  // result directly. nextTick keeps the callback asynchronous, as it is in the
  // non-empty case.
  if (links.length === 0) {
    process.nextTick(finish);
    return;
  }

  // All requests are started at once; onDetailDone fires the callback after the last.
  for (var i = 0; i < links.length; i++) {
    getDetails("https://example.org" + links[i], onDetailDone);
  }
}
以下是重写为使用绑定、自定义 this 对象和尚未完成的请求计数的给定示例代码(我认为 promises 掩盖了执行路径)。
回调返回空数组的原因似乎是:文档中没有匹配 span.title 选择器的 span 元素,因此没有触发任何进一步的请求。
// Scraper written with bound callbacks: a single context object (`this`)
// carries the caller's callback, the movies collected so far and the number of
// detail requests still outstanding.
//
// Fixes in this revision:
//  - fnc logged an undeclared `url` (ReferenceError on every detail response);
//    the URL is now bound as fnc's first argument;
//  - a failed detail request never decremented `resultCount`, so the final
//    callback could never fire; failures are now counted as finished;
//  - the cover fallback tested `typeof cover == 'undefined'`, which was never
//    true after `|| ''`.
var request = require('request').defaults({
  jar: true
}); // necessary for persistent login
var cheerio = require('cheerio');
var process = require('process');
var url1 = "https://example.org/torrents/browse/index/";
var loginUrl = "https://example.org/user/account/login/";

/**
 * Posts the login form; on success the chain fna -> fnb -> fnc runs and
 * `callback` finally receives `{ results, hasMore }`.
 * @param {function(Object)} callback Stored on the shared context object.
 */
var login = function(callback) {
  request.post({
    uri: loginUrl,
    headers: {
      'content-type': 'application/x-www-form-urlencoded'
    },
    body: require('querystring').stringify({
      username: 'user1',
      password: 'passpass'
    })
  }, fna.bind({
    callback: callback
  }));
};

// Login response handler: on success, requests the torrent index page.
var fna = function(err, res, body) {
  if (err) {
    console.log("Login error");
    return;
  }
  request(url1, fnb.bind(this));
};

// Index response handler: collects the detail links and starts one request per link.
var fnb = function(err, res, body) {
  if (err) {
    console.log("Main scrape error");
    return;
  }
  var $ = cheerio.load(body);
  var links = [];
  $('span.title').each(function() {
    links.push($(this).children().first().attr('href'));
  });
  this.results = [];
  this.resultCount = links.length;
  if (this.resultCount === 0) {
    // Nothing to fetch: report the empty result asynchronously.
    process.nextTick(fne.bind(this));
    return;
  }
  for (var i = 0; i < links.length; i++) {
    var detailUrl = "https://example.org" + links[i];
    // Bind the URL so the response handler can log which page it is parsing.
    request(detailUrl, fnc.bind(this, detailUrl));
  }
};

// Detail response handler: parses one page (fields are read from fixed
// positions in the <td> list), stores the movie, and reports once the last
// outstanding request has finished.
var fnc = function(url, err, res, body) {
  if (err) {
    console.log("Detail scrape error");
  } else {
    console.log("Scraping: " + url);
    var $ = cheerio.load(body);
    var tds = $('td');
    var title = $(tds).get(1).firstChild.data;
    var hash = $(tds).get(3).firstChild.data.trim();
    var size = $(tds).get(9).firstChild.data;
    // Placeholders used when the page has no extended movie info.
    var rlsDate = "notfound";
    var genres = "notfound";
    var runtime = "notfound";
    var plot = "notfound";
    var rating = "notfound"; // of 10
    var imdb_id = "notfound";
    var cover = "notfound";
    var thumb = "notfound";
    if (tds.length > 23) {
      rlsDate = $(tds).get(23).firstChild.data || '';
      genres = $(tds).get(27).firstChild.data || '';
      runtime = $(tds).get(31).firstChild.data || '';
      if ($(tds).get(33).firstChild != null) {
        plot = $(tds).get(33).firstChild.data || '';
      }
      rating = $('#imdb_rating').parent().next().text() || ''; // of 10
      imdb_id = $('[name=imdbID]').get(0).attribs.value || '';
      cover = $('#cover').children().eq(0).get(0).attribs.href || '';
      thumb = $('[alt=Cover]').get(0).attribs.src || '';
      // Fall back to the thumbnail when the cover link is empty.
      if (!cover) {
        cover = thumb;
      }
    }
    this.results.push({
      type: 'movie',
      imdb_id: imdb_id,
      title: title,
      year: rlsDate,
      genre: genres,
      rating: rating,
      runtime: runtime,
      image: thumb,
      cover: cover,
      synopsis: plot,
      torrents: {
        magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
        filesize: size
      }
    });
  }
  // Failed requests count as finished too; otherwise the callback would never fire.
  this.resultCount--;
  if (this.resultCount === 0) {
    fne.call(this);
  }
};

// Delivers the collected results to the caller's callback.
var fne = function() {
  this.callback({
    results: this.results,
    hasMore: true
  });
};

login(function(result) {
  console.log(result);
});