如何使用 Node.js 扫描网站并构建站点地图
How to scan a website and build a sitemap using Node.js
我正在尝试使用 Node.js 获取网站的站点地图。谁能指出我该怎么做?
我目前正在查看 https://github.com/cgiffard/node-simplecrawler,但不确定如何阻止它实际抓取页面。我只需要链接,并且可能在结构化对象中...
希望你能明白我的意思!
干杯,
H.
我发现了一个非常有用的命令行工具,它是用 node 编写的。我发现它的源代码在那个任务中非常有用。这是包存储库的链接:https://github.com/lgraubner/node-sitemap-generator-cli
这是我最终使用的代码:
// Crawl a site with simplecrawler and collect its page URLs (skipping asset files).
var Crawler = require('simplecrawler');
var port = 80;
// File extensions that should not be treated as crawlable pages.
var exclude = ['gif', 'jpg', 'jpeg', 'png', 'ico', 'bmp', 'ogg', 'webp',
'mp4', 'webm', 'mp3', 'ttf', 'woff', 'json', 'rss', 'atom', 'gz', 'zip',
'rar', '7z', 'css', 'js', 'gzip', 'exe'];
var exts = exclude.join('|');
// BUG FIX: '\.' inside a string literal is just '.', which matches ANY
// character, so the filter rejected far too much. Escape the backslash
// ('\\.') so the dot is literal, and anchor with '$' so a path such as
// '/js-tips' is not mistaken for a '.js' asset.
var regex = new RegExp('\\.(' + exts + ')$', 'i'); // Used for filtering crawl items.
var crawler = new Crawler('www.website.com');
var pages = []; // This array will hold all the URLs
// Crawler configuration
crawler.initialPort = port;
// BUG FIX: the property was misspelled 'initalPath', so the crawler
// silently ignored it and used its default path.
crawler.initialPath = '/';
// Reject any URL whose path ends with an excluded file extension.
crawler.addFetchCondition(function (parsedURL) {
    return !regex.test(parsedURL.path);
});
// Run the crawler
crawler.start();
crawler.on('fetchcomplete', function(item, responseBuffer, response) {
    pages.push(item.url); // Add URL to the array of pages
});
我不确定,但我对那个工具不满意,
我使用爬虫程序包编写了一些代码,它自动构建了一个站点地图
// Crawl the site with node-crawler and stream a sitemap XML file to ./output.
var Crawler = require("crawler");
var url = require('url');
var fs = require('fs');
var writeStream = fs.createWriteStream('./output');
writeStream.write('<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation=" http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">');
// NOTE(review): strBuff accumulates the same entries in memory but is never
// read back anywhere in this script; kept in case other code consumes it.
var strBuff = '<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml" xmlns:mobile="http://www.google.com/schemas/sitemap-mobile/1.0" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">';
var Router = require('routes');
var router = Router();
var noop = function(){};
// <changefreq> value per route pattern. (Typo fixed: was 'peroid'.)
var period = {
    '/': 'hourly',
    '/results': 'hourly',
    '/tips': 'hourly',
    '/tips/:country': 'hourly',
    '/tips/:country/:venue': 'hourly',
    '/support': 'hourly',
    // BUG FIX: '/algorithm' is registered as a route below but had no entry
    // here, which emitted the literal text <changefreq>undefined</changefreq>.
    '/algorithm': 'hourly',
};
// Escape characters that are invalid inside an XML text node, so URLs
// containing '&' (query strings) do not produce a malformed sitemap.
function escapeXml(text) {
    return text.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
}
// Append one <url> entry for the given absolute URL to the sitemap stream.
function addToMap(url) {
    var key = router.match(url.replace('https://www.yourwebsite.com',''));
    if(!key) {
        key = {};
        key.route = '/';
    } else {
        console.log('match ', url);
    }
    // BUG FIX: fall back to a default frequency so an unmapped route can
    // never write "undefined" into the sitemap.
    var freq = period[key.route] || 'hourly';
    var buf = '<url>\n<loc>' + escapeXml(url) + '</loc>\n <changefreq>' + freq + '</changefreq>\n<priority>0.5</priority>\n</url>';
    strBuff += buf;
    writeStream.write(buf);
}
// Close the <urlset> element and flush the output stream.
function saveTofile() {
    console.log('end');
    writeStream.write('\n</urlset>');
    writeStream.end();
}
router.addRoute("/", noop);
// BUG FIX: '/results' had a changefreq entry but was never registered as a
// route, so it always fell through to the '/' default.
router.addRoute("/results", noop);
router.addRoute("/tips", noop);
router.addRoute("/tips/:country", noop);
router.addRoute("/tips/:country/:venue", noop);
router.addRoute("/support", noop);
router.addRoute("/algorithm", noop);
var cache = {}; // Deduplicates hrefs so each page is queued and emitted once.
var c = new Crawler({
    maxConnections : 25,
    skipDuplicates: true,
    // Fired when the crawl queue empties: finish the sitemap file.
    onDrain: function () {
        console.log('ondrain');
        saveTofile();
    },
    // Called for each crawled page; $ is a cheerio handle on the page DOM.
    callback : function (error, result, $) {
        if(error || !$) {
            console.log(error, result.uri);
            return;
        }
        $('a').each(function(index, a) {
            var toQueueUrl = $(a).attr('href');
            if(!toQueueUrl) {
                return;
            }
            // Only follow same-site root-relative links; skip API and PDF URLs.
            if(toQueueUrl[0] !== '/' || toQueueUrl.indexOf('/api/') !== -1 || toQueueUrl.indexOf('.pdf') !== -1) {
                return;
            }
            if(cache.hasOwnProperty(toQueueUrl)) {
                return;
            }
            c.queue('https://www.yourwebsite.com'+toQueueUrl);
            addToMap('https://www.yourwebsite.com'+toQueueUrl);
            cache[toQueueUrl] = 1;
            var keyz = Object.keys(cache);
            // Progress log every 100 unique URLs.
            if(! (keyz.length % 100) ) {
                console.log('total', keyz.length);
            }
        });
    }
});
c.queue('https://www.yourwebsite.com');
希望对你有帮助
我正在尝试使用 Node.js 获取网站的站点地图。谁能指出我该怎么做?
我目前正在查看 https://github.com/cgiffard/node-simplecrawler,但不确定如何阻止它实际抓取页面。我只需要链接,并且可能在结构化对象中...
希望你能明白我的意思!
干杯, H.
我发现了一个非常有用的命令行工具,它是用 node 编写的。我发现它的源代码在那个任务中非常有用。这是包存储库的链接:https://github.com/lgraubner/node-sitemap-generator-cli
这是我最终使用的代码:
// Crawl a site with simplecrawler and collect its page URLs (skipping asset files).
var Crawler = require('simplecrawler');
var port = 80;
// File extensions that should not be treated as crawlable pages.
var exclude = ['gif', 'jpg', 'jpeg', 'png', 'ico', 'bmp', 'ogg', 'webp',
'mp4', 'webm', 'mp3', 'ttf', 'woff', 'json', 'rss', 'atom', 'gz', 'zip',
'rar', '7z', 'css', 'js', 'gzip', 'exe'];
var exts = exclude.join('|');
// BUG FIX: '\.' inside a string literal is just '.', which matches ANY
// character, so the filter rejected far too much. Escape the backslash
// ('\\.') so the dot is literal, and anchor with '$' so a path such as
// '/js-tips' is not mistaken for a '.js' asset.
var regex = new RegExp('\\.(' + exts + ')$', 'i'); // Used for filtering crawl items.
var crawler = new Crawler('www.website.com');
var pages = []; // This array will hold all the URLs
// Crawler configuration
crawler.initialPort = port;
// BUG FIX: the property was misspelled 'initalPath', so the crawler
// silently ignored it and used its default path.
crawler.initialPath = '/';
// Reject any URL whose path ends with an excluded file extension.
crawler.addFetchCondition(function (parsedURL) {
    return !regex.test(parsedURL.path);
});
// Run the crawler
crawler.start();
crawler.on('fetchcomplete', function(item, responseBuffer, response) {
    pages.push(item.url); // Add URL to the array of pages
});
我不确定,但我对那个工具不满意, 我使用爬虫程序包编写了一些代码,它自动构建了一个站点地图
// Crawl the site with node-crawler and stream a sitemap XML file to ./output.
var Crawler = require("crawler");
var url = require('url');
var fs = require('fs');
var writeStream = fs.createWriteStream('./output');
writeStream.write('<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation=" http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">');
// NOTE(review): strBuff accumulates the same entries in memory but is never
// read back anywhere in this script; kept in case other code consumes it.
var strBuff = '<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml" xmlns:mobile="http://www.google.com/schemas/sitemap-mobile/1.0" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">';
var Router = require('routes');
var router = Router();
var noop = function(){};
// <changefreq> value per route pattern. (Typo fixed: was 'peroid'.)
var period = {
    '/': 'hourly',
    '/results': 'hourly',
    '/tips': 'hourly',
    '/tips/:country': 'hourly',
    '/tips/:country/:venue': 'hourly',
    '/support': 'hourly',
    // BUG FIX: '/algorithm' is registered as a route below but had no entry
    // here, which emitted the literal text <changefreq>undefined</changefreq>.
    '/algorithm': 'hourly',
};
// Escape characters that are invalid inside an XML text node, so URLs
// containing '&' (query strings) do not produce a malformed sitemap.
function escapeXml(text) {
    return text.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
}
// Append one <url> entry for the given absolute URL to the sitemap stream.
function addToMap(url) {
    var key = router.match(url.replace('https://www.yourwebsite.com',''));
    if(!key) {
        key = {};
        key.route = '/';
    } else {
        console.log('match ', url);
    }
    // BUG FIX: fall back to a default frequency so an unmapped route can
    // never write "undefined" into the sitemap.
    var freq = period[key.route] || 'hourly';
    var buf = '<url>\n<loc>' + escapeXml(url) + '</loc>\n <changefreq>' + freq + '</changefreq>\n<priority>0.5</priority>\n</url>';
    strBuff += buf;
    writeStream.write(buf);
}
// Close the <urlset> element and flush the output stream.
function saveTofile() {
    console.log('end');
    writeStream.write('\n</urlset>');
    writeStream.end();
}
router.addRoute("/", noop);
// BUG FIX: '/results' had a changefreq entry but was never registered as a
// route, so it always fell through to the '/' default.
router.addRoute("/results", noop);
router.addRoute("/tips", noop);
router.addRoute("/tips/:country", noop);
router.addRoute("/tips/:country/:venue", noop);
router.addRoute("/support", noop);
router.addRoute("/algorithm", noop);
var cache = {}; // Deduplicates hrefs so each page is queued and emitted once.
var c = new Crawler({
    maxConnections : 25,
    skipDuplicates: true,
    // Fired when the crawl queue empties: finish the sitemap file.
    onDrain: function () {
        console.log('ondrain');
        saveTofile();
    },
    // Called for each crawled page; $ is a cheerio handle on the page DOM.
    callback : function (error, result, $) {
        if(error || !$) {
            console.log(error, result.uri);
            return;
        }
        $('a').each(function(index, a) {
            var toQueueUrl = $(a).attr('href');
            if(!toQueueUrl) {
                return;
            }
            // Only follow same-site root-relative links; skip API and PDF URLs.
            if(toQueueUrl[0] !== '/' || toQueueUrl.indexOf('/api/') !== -1 || toQueueUrl.indexOf('.pdf') !== -1) {
                return;
            }
            if(cache.hasOwnProperty(toQueueUrl)) {
                return;
            }
            c.queue('https://www.yourwebsite.com'+toQueueUrl);
            addToMap('https://www.yourwebsite.com'+toQueueUrl);
            cache[toQueueUrl] = 1;
            var keyz = Object.keys(cache);
            // Progress log every 100 unique URLs.
            if(! (keyz.length % 100) ) {
                console.log('total', keyz.length);
            }
        });
    }
});
c.queue('https://www.yourwebsite.com');
希望对你有帮助