在 NodeJS 中使用嵌套回调时遇到问题

Question

我正在编写一个程序，用于抓取网站的链接，然后抓取这些链接以获取信息。为了抓取网站，必须先登录。所以顺序是：登录 -> 抓取链接索引 -> 抓取信息链接

登录函数的回调打印了一个空数组{ results: [], hasMore: true }，所以我的代码有问题（抓取部分有效）：

var request = require('request');
var request = request.defaults({jar: true}); // necessary for persistent login
var cheerio = require('cheerio');

// Listing page; scrapeTorrents collects the detail-page links from it.
var url1 = "https://example.org/torrents/browse/index/";
// Address the login form is POSTed to.
var loginUrl = "https://example.org/user/account/login/";

// Form fields sent as the url-encoded body of the login request.
var credentials = {
    username: 'user1',
    password: 'passpass'
};

// Entry point: log in, run the scrape chain and print whatever it reports.
login(function (result) {
    console.log(result);
});

/**
 * Logs in by POSTing the url-encoded credentials and, once the response is
 * in, starts scraping the listing page.
 *
 * @param {function(Object)} callback - receives whatever scrapeTorrents
 *     reports. When the login request fails, the error is only logged and
 *     callback is not invoked.
 */
function login(callback) {
    var loginRequest = {
        uri: loginUrl,
        headers: { 'content-type': 'application/x-www-form-urlencoded' },
        body: require('querystring').stringify(credentials)
    };

    function onLoginResponse(loginErr, res, body) {
        if (!loginErr) {
            scrapeTorrents(url1, function (scraped) {
                callback(scraped);
            });
            return;
        }
        console.log("Login error");
    }

    request.post(loginRequest, onLoginResponse);
}

/**
 * Fetches the listing page, collects the href of the first child of every
 * `span.title` element and passes the list on to scrapeTorrentDetails.
 *
 * @param {string} url - address of the listing page.
 * @param {function(Object)} callback - receives whatever
 *     scrapeTorrentDetails reports. When the request fails, the error is only
 *     logged and callback is not invoked.
 */
function scrapeTorrents(url, callback) {
    function onListingPage(listingErr, res, body) {
        if (listingErr) {
            console.log("Main scrape error");
            return;
        }

        var $ = cheerio.load(body);
        var detailLinks = [];

        $('span.title').each(function () {
            var firstChild = $(this).children().eq(0);
            detailLinks.push(firstChild.attr('href'));
        });

        scrapeTorrentDetails(detailLinks, function (details) {
            callback(details);
        });
    }

    request(url, onListingPage);
}

// Starts one request per detail link and builds a movie record from each
// response, collecting the records in `results`.
// NOTE(review): the callback at the bottom of this function runs right after
// the requests have been started, not after they have finished, so `results`
// is still empty when it is handed over. This is the defect the question is
// about.
function scrapeTorrentDetails(links, callback) {
    var results = [];

    // Fetches one detail page and appends the parsed movie to `results`.
    // It offers no way for the caller to learn when the request has finished.
    function getDetails(url) {
        request(url, function(err, res, body) {
                if(err) {
                    console.log("Detail scrape error");
                    return;
                }
                console.log("Scraping: " + url);
                var $ = cheerio.load(body);
                var tds = $('td');
                // Fields are read by fixed <td> position on the page.
                var title = $(tds).get(1).firstChild.data;
                var hash = $(tds).get(3).firstChild.data.trim();
                var size = $(tds).get(9).firstChild.data;
                //  console.log(tds.length);
                // NOTE(review): cells up to index 33 are read below although
                // only length > 23 is checked here.
                if (tds.length > 23) {
                    var rlsDate = $(tds).get(23).firstChild.data || '';;
                    var genres = $(tds).get(27).firstChild.data || '';;
                    var runtime = $(tds).get(31).firstChild.data || '';;
                    // `plot` stays undefined when this cell has no child node.
                    if ( $(tds).get(33).firstChild != null) {
                        var plot = $(tds).get(33).firstChild.data || '';;
                    }
                    var rating = $('#imdb_rating').parent().next().text() || '';; // of 10
                    var imdb_id = $('[name=imdbID]').get(0).attribs.value || '';;
                    var cover = $('#cover').children().eq(0).get(0).attribs.href || '';;
                    var thumb = $('[alt=Cover]').get(0).attribs.src || '';;
                    // NOTE(review): `cover` was defaulted to '' above, so this
                    // check can never be true.
                    if (typeof cover == 'undefined') {
                        cover = thumb;
                    }
                } else {
                    // Placeholders for pages without the extra cells.
                    var rlsDate = "notfound";
                    var genres = "notfound";
                    var runtime = "notfound";
                    var plot = "notfound";
                    var rating = "notfound"; // of 10
                    var imdb_id = "notfound";
                    var cover = "notfound";
                    var thumb = "notfound";
                }

                var movie = {
                    type: 'movie',
                    imdb_id: imdb_id,
                    title: title,
                    year: rlsDate,
                    genre: genres,
                    rating: rating,
                    runtime: runtime,
                    image: thumb,
                    cover: cover,
                    synopsis: plot,
                    torrents: {
                        magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
                        filesize: size
                    }
                };

                results.push(movie);
            });
    }

    // Each call only starts a request; the loop does not wait for responses.
    for (var i=0; i<links.length; i++){
            getDetails("https://example.org" + links[i]);
    }

    // Runs immediately after the loop, before any results.push above.
    callback( {
        results: results,
        hasMore: true
    });
}

也许使用 Q promises 会更好。我该如何在上面的代码中实现它？

如果您想知道代码的用途，我打算修改 Popcorn-time 以使用另一个 torrent-tracker（没有 API）。

谢谢

Answer 1

主要问题在于此代码：

// Each getDetails() call only starts a request; the loop does not wait for it.
for (var i=0; i<links.length; i++){
        getDetails("https://example.org" + links[i]);
}

// Runs immediately after the loop, before any response has been handled.
callback( {
    results: results,
    hasMore: true
});

getDetails() 是异步的，但您只是调用了它 links.length 次，然后就继续往下执行——就好像这些调用都已经完成了一样。因此，在您调用回调并尝试传递结果的时候，getDetails() 中的请求一个都还没有完成。此时还没有任何结果被填入，所以结果数组是空的。

您的代码中其他地方都（按需要）正确使用了嵌套回调，唯独在这个地方疏忽了。在带着结果调用最终回调之前，您需要知道所有 getDetails() 调用何时完成。

此外，您还必须决定：是可以并行发起所有 getDetails() 调用（让它们同时进行），还是您真正想要的是先调用一个，等它完成，再调用下一个，依此类推。现在您是一次性把所有请求都发出去；只要目标服务器不反对同时收到这么多请求，这样做是可行的。

有几种潜在的策略可以解决这个问题。

向 getDetails() 添加一个回调，然后统计已从 getDetails() 收到的回调次数；只有当次数达到 links.length（即全部完成）时，才调用最终的回调。
将 getDetails() 改为返回一个 promise。然后您可以用 links.map(getDetails) 之类的写法创建一个 promise 数组，再用 Promise.all() 来得知它们何时全部完成。

就我个人而言，我会更改您的所有代码以使用 promises，并且我会使用 Bluebird promises 库，因为它具有 Promise.map() 等额外功能，使这更简单。

下面是一个修复方案：向 getDetails() 添加回调，并统计已完成的调用次数：

/**
 * Requests every detail page in `links` at the same time and reports the
 * collected movie records once all of the requests have finished.
 *
 * @param {string[]} links - site-relative detail page paths; each one is
 *     appended to "https://example.org".
 * @param {function(Object)} callback - called with
 *     `{ results: Array, hasMore: true }` after the last request has been
 *     handled, or on the next tick when `links` is empty. Pages whose request
 *     failed are logged and left out of `results`. Records are appended as
 *     responses are handled, so their order need not match `links`.
 */
function scrapeTorrentDetails(links, callback) {
    var results = [];
    var doneCnt = 0;

    // Hands the collected records to the caller.
    function finish() {
        callback({
            results: results,
            hasMore: true
        });
    }

    // Without links no request is issued, so no completion would ever be
    // counted and the callback would never fire. Report the empty result
    // instead, deferred so the callback is asynchronous on every path.
    if (links.length === 0) {
        process.nextTick(finish);
        return;
    }

    // Fetches one detail page, appends the parsed movie to `results` and then
    // calls `done` (with the error when the request failed).
    function getDetails(url, done) {
        request(url, function(err, res, body) {
            if (err) {
                console.log("Detail scrape error");
                done(err);
                return;
            }
            console.log("Scraping: " + url);
            var $ = cheerio.load(body);
            var tds = $('td');

            // Fields are read by fixed <td> position on the page.
            var title = tds.get(1).firstChild.data;
            var hash = tds.get(3).firstChild.data.trim();
            var size = tds.get(9).firstChild.data;

            // Placeholders used for pages without the extra cells.
            var rlsDate = "notfound";
            var genres = "notfound";
            var runtime = "notfound";
            var plot = "notfound";
            var rating = "notfound"; // of 10
            var imdb_id = "notfound";
            var cover = "notfound";
            var thumb = "notfound";

            if (tds.length > 23) {
                rlsDate = tds.get(23).firstChild.data || '';
                genres = tds.get(27).firstChild.data || '';
                runtime = tds.get(31).firstChild.data || '';
                // The synopsis cell may be empty; fall back to '' like the
                // other fields instead of leaving the value undefined.
                var plotNode = tds.get(33).firstChild;
                plot = (plotNode != null && plotNode.data) || '';
                rating = $('#imdb_rating').parent().next().text() || ''; // of 10
                imdb_id = $('[name=imdbID]').get(0).attribs.value || '';
                cover = $('#cover').children().eq(0).get(0).attribs.href || '';
                thumb = $('[alt=Cover]').get(0).attribs.src || '';
                // A missing cover link is '' at this point (never undefined),
                // so test for emptiness when falling back to the thumbnail.
                if (!cover) {
                    cover = thumb;
                }
            }

            results.push({
                type: 'movie',
                imdb_id: imdb_id,
                title: title,
                year: rlsDate,
                genre: genres,
                rating: rating,
                runtime: runtime,
                image: thumb,
                cover: cover,
                synopsis: plot,
                torrents: {
                    magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
                    filesize: size
                }
            });
            done();
        });
    }

    for (var i = 0; i < links.length; i++) {
        getDetails("https://example.org" + links[i], function() {
            // Failed requests are counted too, so one error cannot keep the
            // final callback from running.
            ++doneCnt;
            if (doneCnt === links.length) {
                finish();
            }
        });
    }
}

Answer 2

以下是对给定示例代码的改写版本：使用 bind、自定义的 this 对象，以及对尚未完成的请求进行计数（我认为 promises 会掩盖执行路径）。

回调返回空数组的原因似乎是：文档中没有匹配 span.title 的元素（即带有 title 类的 span），因此没有触发后续请求。

var
  request = require('request').defaults({
    jar: true
  }), // necessary for persistent login
  cheerio = require('cheerio'),
  process = require('process'),

  // Listing page that is requested after a successful login.
  url1 = "https://example.org/torrents/browse/index/",
  // Address the login form is POSTed to.
  loginUrl = "https://example.org/user/account/login/",

  login = function(callback) {
    request.post({
      uri: loginUrl,
      headers: {
        'content-type': 'application/x-www-form-urlencoded'
      },
      body: require('querystring').stringify({
        username: 'user1',
        password: 'passpass'
      })
    }, fna.bind({
      callback: callback
    }));
  },

  fna = function(err, res, body) {
    if (err) {
      console.log("Login error");
      return;
    }

    request(url1, fnb.bind(this));
  },

  fnb = function(err, res, body) {
    if (err) {
      console.log("Main scrape error");
      return;
    }

    var
      $ = cheerio.load(body),
      links = [],
      fnd = fne.bind(this);

    $('span.title').each(function() {
      links.push($(this).children().first().attr('href'));
    });

    this.results = [];
    this.resultCount = links.length;

    if (this.resultCount) {
      fnd = fnc.bind(this);

      for (var i = 0; i < links.length; i++) {
        request("https://example.org" + links[i], fnd);
      }
    } else {
      process.nextTick(fnd);
    }
  },

  fnc = function(err, res, body) {
    if (err) {
      console.log("Detail scrape error");
      return;
    }

    console.log("Scraping: " + url);

    var
      $ = cheerio.load(body),
      tds = $('td'),
      title = $(tds).get(1).firstChild.data,
      hash = $(tds).get(3).firstChild.data.trim(),
      size = $(tds).get(9).firstChild.data,
      rlsDate = "notfound",
      genres = "notfound",
      runtime = "notfound",
      plot = "notfound",
      rating = "notfound", // of 10
      imdb_id = "notfound",
      cover = "notfound",
      thumb = "notfound";

    if (tds.length > 23) {
      rlsDate = $(tds).get(23).firstChild.data || '';
      genres = $(tds).get(27).firstChild.data || '';
      runtime = $(tds).get(31).firstChild.data || '';

      if ($(tds).get(33).firstChild != null) {
        plot = $(tds).get(33).firstChild.data || '';
      }

      rating = $('#imdb_rating').parent().next().text() || ''; // of 10
      imdb_id = $('[name=imdbID]').get(0).attribs.value || '';
      cover = $('#cover').children().eq(0).get(0).attribs.href || '';
      thumb = $('[alt=Cover]').get(0).attribs.src || '';

      if (typeof cover == 'undefined') {
        cover = thumb;
      }
    }

    this.results.push({
      type: 'movie',
      imdb_id: imdb_id,
      title: title,
      year: rlsDate,
      genre: genres,
      rating: rating,
      runtime: runtime,
      image: thumb,
      cover: cover,
      synopsis: plot,
      torrents: {
        magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
        filesize: size
      }
    });

    this.resultCount--;

    if (this.resultCount === 0) {
      this.callback({
        results: this.results,
        hasMore: true
      });
    }
  },

  // Completion handler scheduled by fnb when the listing page yielded no
  // links: reports the results held on the bound context object.
  fne = function() {
    var payload = {
      results: this.results,
      hasMore: true
    };

    this.callback(payload);
  };

// Entry point: run the login/scrape chain and print whatever it reports.
login(function(result) {
  console.log(result);
});

在 NodeJS 中使用嵌套回调时遇到问题

Trouble using nested callbacks in NodeJS

javascript

callback

node.js

cheerio