Node.js + request + for 循环:运行两次
Node.js + request + for loop : Runs twice
我使用 cheerio 和 request 客户端创建了一个简单的抓取工具,但它无法按我想要的方式工作。
首先我在终端上看到所有 "null returned, do nothing" 消息,然后才看到名称,所以我认为它先处理了所有返回 null 的 URL,然后才处理返回非空结果的 URL。
我希望它按正确的顺序运行,从 1 到 100。
/**
 * GET /back: requests person pages 1..99 and logs the name found on each.
 *
 * The loop issues every request without waiting for the previous response,
 * so the callbacks (and therefore the log lines) are not tied to page order.
 *
 * NOTE: this handler never writes to `res`.
 */
app.get('/back', function (req, res) {
    for (var y = 1; y < 100; y++) {
        (function () {
            // The URL is built synchronously, so it sees the current value of y.
            var url = "example.com/person/" + y;
            var options2 = {
                url: url,
                headers: {
                    'User-Agent': req.headers['user-agent'],
                    'Content-Type': 'application/json; charset=utf-8'
                }
            };
            request(options2, function (err, resp, body) {
                if (err) {
                    console.log(err);
                    return;
                }
                // Declared locally so it does not become an implicit global.
                var $ = cheerio.load(body);
                if ($) {
                    var links = $('#container');
                    var name = links.find('span[itemprop="name"]').html(); // name
                    if (name == null) {
                        console.log("null returned, do nothing");
                    } else {
                        name = entities.decodeHTML(name);
                        console.log(name);
                    }
                } else {
                    console.log("can't open");
                }
            });
        }());
    }
});
如果您不使用 Promise 并且想要按顺序运行请求,那么下面是运行顺序异步循环的常见设计模式:
/**
 * GET /back: requests person pages 1..99 one at a time and logs the name
 * found on each.
 *
 * The next request is only issued from the completion callback of the
 * previous one, so the log lines follow ascending page order.
 *
 * NOTE: this handler never writes to `res`.
 */
app.get('/back', function (req, res) {
    var cntr = 1;

    // Requests the page for the current counter value, then advances it.
    function next() {
        if (cntr >= 100) {
            return;
        }
        var url = "example.com/person/" + cntr++;
        var options2 = {
            url: url,
            headers: {
                'User-Agent': req.headers['user-agent'],
                'Content-Type': 'application/json; charset=utf-8'
            }
        };
        request(options2, function (err, resp, body) {
            if (err) {
                console.log(err);
            } else {
                // Declared locally so it does not become an implicit global.
                var $ = cheerio.load(body);
                if ($) {
                    var links = $('#container');
                    var name = links.find('span[itemprop="name"]').html(); // name
                    if (name == null) {
                        console.log("null returned, do nothing");
                    } else {
                        name = entities.decodeHTML(name);
                        console.log(name);
                    }
                } else {
                    console.log("can't open");
                }
            }
            // Advance after failures too, so one failed request does not
            // silently stall the remaining pages.
            next();
        });
    }

    // start the first iteration
    next();
});
如果您想并行发出所有请求(同时有多个请求在进行),这样整体上会更快完成,然后在最后按顺序汇总所有结果,您可以这样做:
/**
 * Promise-returning wrapper around the callback-style request().
 *
 * @param {Object} options - Forwarded unchanged as the first argument of request().
 * @returns {Promise} Resolves with the `body` callback argument, or rejects
 *     with `err` when request() reports one.
 */
function requestPromise(options) {
    return new Promise(function (resolve, reject) {
        // Forward the caller's options object.
        request(options, function (err, resp, body) {
            if (err) {
                reject(err);
                return;
            }
            resolve(body);
        });
    });
}
/**
 * GET /back: requests person pages 1..99 in parallel through requestPromise(),
 * then logs the name found on each page once all of them have completed.
 *
 * NOTE: this handler never writes to `res`.
 */
app.get('/back', function (req, res) {
    var promises = [];
    var headers = {
        'User-Agent': req.headers['user-agent'],
        'Content-Type': 'application/json; charset=utf-8'
    };
    for (var i = 1; i < 100; i++) {
        promises.push(requestPromise({url: "example.com/person/" + i, headers: headers}));
    }
    Promise.all(promises).then(function (data) {
        // Promise.all keeps results in the order of the input promises,
        // so data[j] is the body for page j + 1.
        for (var j = 0; j < data.length; j++) {
            // Declared locally so it does not become an implicit global.
            var $ = cheerio.load(data[j]);
            if ($) {
                var links = $('#container');
                var name = links.find('span[itemprop="name"]').html(); // name
                if (name == null) {
                    console.log("null returned, do nothing");
                } else {
                    name = entities.decodeHTML(name);
                    console.log(name);
                }
            } else {
                console.log("can't open");
            }
        }
    }, function (err) {
        // Promise.all rejects as soon as any one request fails; log the
        // error instead of discarding it.
        console.log(err);
    });
});
我使用 cheerio 和 request 客户端创建了一个简单的抓取工具,但它无法按我想要的方式工作。
首先我在终端上看到所有 "null returned, do nothing" 消息,然后才看到名称,所以我认为它先处理了所有返回 null 的 URL,然后才处理返回非空结果的 URL。
我希望它按正确的顺序运行,从 1 到 100。
/**
 * GET /back: requests person pages 1..99 and logs the name found on each.
 *
 * The loop issues every request without waiting for the previous response,
 * so the callbacks (and therefore the log lines) are not tied to page order.
 *
 * NOTE: this handler never writes to `res`.
 */
app.get('/back', function (req, res) {
    for (var y = 1; y < 100; y++) {
        (function () {
            // The URL is built synchronously, so it sees the current value of y.
            var url = "example.com/person/" + y;
            var options2 = {
                url: url,
                headers: {
                    'User-Agent': req.headers['user-agent'],
                    'Content-Type': 'application/json; charset=utf-8'
                }
            };
            request(options2, function (err, resp, body) {
                if (err) {
                    console.log(err);
                    return;
                }
                // Declared locally so it does not become an implicit global.
                var $ = cheerio.load(body);
                if ($) {
                    var links = $('#container');
                    var name = links.find('span[itemprop="name"]').html(); // name
                    if (name == null) {
                        console.log("null returned, do nothing");
                    } else {
                        name = entities.decodeHTML(name);
                        console.log(name);
                    }
                } else {
                    console.log("can't open");
                }
            });
        }());
    }
});
如果您不使用 Promise 并且想要按顺序运行请求,那么下面是运行顺序异步循环的常见设计模式:
/**
 * GET /back: requests person pages 1..99 one at a time and logs the name
 * found on each.
 *
 * The next request is only issued from the completion callback of the
 * previous one, so the log lines follow ascending page order.
 *
 * NOTE: this handler never writes to `res`.
 */
app.get('/back', function (req, res) {
    var cntr = 1;

    // Requests the page for the current counter value, then advances it.
    function next() {
        if (cntr >= 100) {
            return;
        }
        var url = "example.com/person/" + cntr++;
        var options2 = {
            url: url,
            headers: {
                'User-Agent': req.headers['user-agent'],
                'Content-Type': 'application/json; charset=utf-8'
            }
        };
        request(options2, function (err, resp, body) {
            if (err) {
                console.log(err);
            } else {
                // Declared locally so it does not become an implicit global.
                var $ = cheerio.load(body);
                if ($) {
                    var links = $('#container');
                    var name = links.find('span[itemprop="name"]').html(); // name
                    if (name == null) {
                        console.log("null returned, do nothing");
                    } else {
                        name = entities.decodeHTML(name);
                        console.log(name);
                    }
                } else {
                    console.log("can't open");
                }
            }
            // Advance after failures too, so one failed request does not
            // silently stall the remaining pages.
            next();
        });
    }

    // start the first iteration
    next();
});
如果您想并行发出所有请求(同时有多个请求在进行),这样整体上会更快完成,然后在最后按顺序汇总所有结果,您可以这样做:
/**
 * Promise-returning wrapper around the callback-style request().
 *
 * @param {Object} options - Forwarded unchanged as the first argument of request().
 * @returns {Promise} Resolves with the `body` callback argument, or rejects
 *     with `err` when request() reports one.
 */
function requestPromise(options) {
    return new Promise(function (resolve, reject) {
        // Forward the caller's options object.
        request(options, function (err, resp, body) {
            if (err) {
                reject(err);
                return;
            }
            resolve(body);
        });
    });
}
/**
 * GET /back: requests person pages 1..99 in parallel through requestPromise(),
 * then logs the name found on each page once all of them have completed.
 *
 * NOTE: this handler never writes to `res`.
 */
app.get('/back', function (req, res) {
    var promises = [];
    var headers = {
        'User-Agent': req.headers['user-agent'],
        'Content-Type': 'application/json; charset=utf-8'
    };
    for (var i = 1; i < 100; i++) {
        promises.push(requestPromise({url: "example.com/person/" + i, headers: headers}));
    }
    Promise.all(promises).then(function (data) {
        // Promise.all keeps results in the order of the input promises,
        // so data[j] is the body for page j + 1.
        for (var j = 0; j < data.length; j++) {
            // Declared locally so it does not become an implicit global.
            var $ = cheerio.load(data[j]);
            if ($) {
                var links = $('#container');
                var name = links.find('span[itemprop="name"]').html(); // name
                if (name == null) {
                    console.log("null returned, do nothing");
                } else {
                    name = entities.decodeHTML(name);
                    console.log(name);
                }
            } else {
                console.log("can't open");
            }
        }
    }, function (err) {
        // Promise.all rejects as soon as any one request fails; log the
        // error instead of discarding it.
        console.log(err);
    });
});