Web Crawler - 返回一个数组以在下一个函数中使用
Web Crawler - returning an array to be used in the next function
我可以确认第一个函数能够正常工作。
我想 return 一个数组到变量 AtoZLinks 以便它可以在后面的函数中使用。我将向数组中的每个 url 发出请求,并从这些链接中提取更多信息。
非常感谢!这个项目我已经做了好几天了。我是 jQuery、网络爬虫、JS、NodeJS 和 expressJS 的初学者,正在为工作埋头苦干。
var express = require('express');
var request = require('request');
var cheerio = require('cheerio');
var router = express.Router();
// Module-level list of crawled URLs; starts out empty.
var fullUrl = [];
/**
 * Fetch the index page and collect the absolute URL of every `.clear a`
 * link whose href contains "Places".
 *
 * The HTTP request is asynchronous, so the links cannot be handed back with
 * `return`; they are delivered through the optional Node-style callback.
 *
 * @param {function(?Error, string[]=)} [callback] Called exactly once:
 *     `callback(err)` when the request fails or answers with a non-200
 *     status, otherwise `callback(null, links)`.
 */
var AtoZLinks = function(callback) {
  var url = 'http://example1.com';
  var places = 'Places';
  // Calling without a callback stays valid (the original signature took none).
  var done = typeof callback === 'function' ? callback : function() {};

  request(url, function(error, response, html) {
    if (error) {
      done(error);
      return;
    }
    if (response.statusCode !== 200) {
      done(new Error('Unexpected status code ' + response.statusCode + ' for ' + url));
      return;
    }

    var $ = cheerio.load(html);
    var links = [];

    $('.clear a').each(function() {
      var href = $(this).attr('href');
      // Append with push(): each anchor is visited once here. The previous
      // nested `link.each(index)` always produced index 0, so every match
      // overwrote slot 0 and only the last URL survived.
      // Anchors without an href yield undefined and are skipped.
      if (href && href.indexOf(places) !== -1) {
        links.push(url + href);
      }
    });

    for (var i = 0; i < links.length; i++) {
      console.log(links[i]);
    }

    done(null, links);
  });
};
/*
 * GET /crawler
 * Starts the crawl and then acknowledges the request. The crawl itself runs
 * asynchronously, so the response does not wait for its results.
 */
router.get('/crawler', function(req, res, next) {
  AtoZLinks();
  next();
}, function(req, res) {
  // Always end the response: an empty final handler leaves the client
  // waiting until its own timeout fires.
  res.status(202).send('Crawl started');
});
// Expose the router as this module's export.
module.exports = router;
// Feel free to ignore the following work I've done or..
// Your support with the following function will be a bonus!
// I need to use the links in the previous array variable in the following
// function to extract further urls within those urls that I will work with.
/**
 * Fetch one page and collect the absolute URL of every link located inside
 * its `div.fclist-section.clear.list-four` sections.
 *
 * Intended to be called once per URL produced by the A-to-Z crawl. The
 * request is asynchronous, so results are delivered through the callback
 * rather than a return value.
 *
 * @param {string} pageUrl Fully-qualified URL of the page to scan; it is also
 *     used as the prefix for each href that is found.
 * @param {function(?Error, string[]=)} [callback] Called exactly once:
 *     `callback(err)` on a request error or non-200 status, otherwise
 *     `callback(null, links)`.
 */
var extractSectionLinks = function(pageUrl, callback) {
  var done = typeof callback === 'function' ? callback : function() {};

  request(pageUrl, function(error, response, html) {
    if (error) {
      done(error);
      return;
    }
    if (response.statusCode !== 200) {
      done(new Error('Unexpected status code ' + response.statusCode + ' for ' + pageUrl));
      return;
    }

    var $ = cheerio.load(html);
    var links = [];

    // This selector is the one needed to reach the links nested inside each
    // page listed by the A-to-Z crawl.
    $('div.fclist-section.clear.list-four').each(function() {
      $(this).find('a').each(function() {
        var href = $(this).attr('href');
        // Skip anchors without an href instead of emitting "<url>undefined".
        if (href) {
          var absolute = pageUrl + href;
          // Accumulate instead of overwriting a shared variable with a string.
          links.push(absolute);
          console.log(absolute);
        }
      });
    });

    done(null, links);
  });
};
你的意思是这样的吗?
/**
 * Crawl every URL in `theURLs` and collect the absolute URL of each
 * `.clear a` link whose href contains "Places".
 *
 * All requests are started together. A site that fails or answers with a
 * non-200 status is reported on stderr and skipped, so one bad site does not
 * discard the links found on the others.
 *
 * @param {string[]} theURLs Fully-qualified URLs (scheme included) to crawl.
 * @param {function(?Error, string[])} [callback] Called once, after every
 *     request has finished, as `callback(null, links)`. Links are listed in
 *     the order the responses arrived.
 */
var AtoZLinks = function(theURLs, callback) {
  var urls = theURLs || [];
  var done = typeof callback === 'function' ? callback : function() {};
  var places = 'Places';
  var fullUrl = [];
  var pending = urls.length;

  if (pending === 0) {
    done(null, fullUrl);
    return;
  }

  // forEach gives each request its own `url` binding. With `var url` inside a
  // plain for-loop every async callback would see the LAST url of the list.
  urls.forEach(function(url) {
    request(url, function(error, response, html) {
      if (!error && response.statusCode === 200) {
        var $ = cheerio.load(html);
        $('.clear a').each(function() {
          var href = $(this).attr('href');
          // Anchors without an href yield undefined and are skipped.
          if (href && href.indexOf(places) !== -1) {
            fullUrl.push(url + href);
          }
        });
      } else {
        console.error('Skipping ' + url + ': ' +
          (error ? error.message : 'status code ' + response.statusCode));
      }

      pending -= 1;
      if (pending === 0) {
        console.log(JSON.stringify(fullUrl));
        done(null, fullUrl);
      }
    });
  });
};

// `request` needs fully-qualified URIs, hence the explicit scheme.
var arrURLs = [
  'http://www.ask.com',
  'http://www.google.com',
  'http://www.bing.com',
  'http://www.yahoo.com'
];

// Invoke only after the function expression above has been assigned; calling
// it earlier fails because only the `var` declaration is hoisted.
AtoZLinks(arrURLs);
我可以确认第一个函数能够正常工作。
我想 return 一个数组到变量 AtoZLinks 以便它可以在后面的函数中使用。我将向数组中的每个 url 发出请求,并从这些链接中提取更多信息。
非常感谢!这个项目我已经做了好几天了。我是 jQuery、网络爬虫、JS、NodeJS 和 expressJS 的初学者,正在为工作埋头苦干。
var express = require('express');
var request = require('request');
var cheerio = require('cheerio');
var router = express.Router();
// Module-level list of crawled URLs; starts out empty.
var fullUrl = [];
/**
 * Fetch the index page and collect the absolute URL of every `.clear a`
 * link whose href contains "Places".
 *
 * The HTTP request is asynchronous, so the links cannot be handed back with
 * `return`; they are delivered through the optional Node-style callback.
 *
 * @param {function(?Error, string[]=)} [callback] Called exactly once:
 *     `callback(err)` when the request fails or answers with a non-200
 *     status, otherwise `callback(null, links)`.
 */
var AtoZLinks = function(callback) {
  var url = 'http://example1.com';
  var places = 'Places';
  // Calling without a callback stays valid (the original signature took none).
  var done = typeof callback === 'function' ? callback : function() {};

  request(url, function(error, response, html) {
    if (error) {
      done(error);
      return;
    }
    if (response.statusCode !== 200) {
      done(new Error('Unexpected status code ' + response.statusCode + ' for ' + url));
      return;
    }

    var $ = cheerio.load(html);
    var links = [];

    $('.clear a').each(function() {
      var href = $(this).attr('href');
      // Append with push(): each anchor is visited once here. The previous
      // nested `link.each(index)` always produced index 0, so every match
      // overwrote slot 0 and only the last URL survived.
      // Anchors without an href yield undefined and are skipped.
      if (href && href.indexOf(places) !== -1) {
        links.push(url + href);
      }
    });

    for (var i = 0; i < links.length; i++) {
      console.log(links[i]);
    }

    done(null, links);
  });
};
/*
 * GET /crawler
 * Starts the crawl and then acknowledges the request. The crawl itself runs
 * asynchronously, so the response does not wait for its results.
 */
router.get('/crawler', function(req, res, next) {
  AtoZLinks();
  next();
}, function(req, res) {
  // Always end the response: an empty final handler leaves the client
  // waiting until its own timeout fires.
  res.status(202).send('Crawl started');
});
// Expose the router as this module's export.
module.exports = router;
// Feel free to ignore the following work I've done or..
// Your support with the following function will be a bonus!
// I need to use the links in the previous array variable in the following
// function to extract further urls within those urls that I will work with.
/**
 * Fetch one page and collect the absolute URL of every link located inside
 * its `div.fclist-section.clear.list-four` sections.
 *
 * Intended to be called once per URL produced by the A-to-Z crawl. The
 * request is asynchronous, so results are delivered through the callback
 * rather than a return value.
 *
 * @param {string} pageUrl Fully-qualified URL of the page to scan; it is also
 *     used as the prefix for each href that is found.
 * @param {function(?Error, string[]=)} [callback] Called exactly once:
 *     `callback(err)` on a request error or non-200 status, otherwise
 *     `callback(null, links)`.
 */
var extractSectionLinks = function(pageUrl, callback) {
  var done = typeof callback === 'function' ? callback : function() {};

  request(pageUrl, function(error, response, html) {
    if (error) {
      done(error);
      return;
    }
    if (response.statusCode !== 200) {
      done(new Error('Unexpected status code ' + response.statusCode + ' for ' + pageUrl));
      return;
    }

    var $ = cheerio.load(html);
    var links = [];

    // This selector is the one needed to reach the links nested inside each
    // page listed by the A-to-Z crawl.
    $('div.fclist-section.clear.list-four').each(function() {
      $(this).find('a').each(function() {
        var href = $(this).attr('href');
        // Skip anchors without an href instead of emitting "<url>undefined".
        if (href) {
          var absolute = pageUrl + href;
          // Accumulate instead of overwriting a shared variable with a string.
          links.push(absolute);
          console.log(absolute);
        }
      });
    });

    done(null, links);
  });
};
你的意思是这样的吗?
/**
 * Crawl every URL in `theURLs` and collect the absolute URL of each
 * `.clear a` link whose href contains "Places".
 *
 * All requests are started together. A site that fails or answers with a
 * non-200 status is reported on stderr and skipped, so one bad site does not
 * discard the links found on the others.
 *
 * @param {string[]} theURLs Fully-qualified URLs (scheme included) to crawl.
 * @param {function(?Error, string[])} [callback] Called once, after every
 *     request has finished, as `callback(null, links)`. Links are listed in
 *     the order the responses arrived.
 */
var AtoZLinks = function(theURLs, callback) {
  var urls = theURLs || [];
  var done = typeof callback === 'function' ? callback : function() {};
  var places = 'Places';
  var fullUrl = [];
  var pending = urls.length;

  if (pending === 0) {
    done(null, fullUrl);
    return;
  }

  // forEach gives each request its own `url` binding. With `var url` inside a
  // plain for-loop every async callback would see the LAST url of the list.
  urls.forEach(function(url) {
    request(url, function(error, response, html) {
      if (!error && response.statusCode === 200) {
        var $ = cheerio.load(html);
        $('.clear a').each(function() {
          var href = $(this).attr('href');
          // Anchors without an href yield undefined and are skipped.
          if (href && href.indexOf(places) !== -1) {
            fullUrl.push(url + href);
          }
        });
      } else {
        console.error('Skipping ' + url + ': ' +
          (error ? error.message : 'status code ' + response.statusCode));
      }

      pending -= 1;
      if (pending === 0) {
        console.log(JSON.stringify(fullUrl));
        done(null, fullUrl);
      }
    });
  });
};

// `request` needs fully-qualified URIs, hence the explicit scheme.
var arrURLs = [
  'http://www.ask.com',
  'http://www.google.com',
  'http://www.bing.com',
  'http://www.yahoo.com'
];

// Invoke only after the function expression above has been assigned; calling
// it earlier fails because only the `var` declaration is hoisted.
AtoZLinks(arrURLs);