如何使用 Node.js 获取给定 URL 中所有元素的 CSS?
How to get css of all elements given a url using node?
我正在尝试编写一个脚本:给定一个站点的 URL,它会抓取该站点的所有链接,并检查每个链接对应的页面上是否使用了 helvetica 字体。
为此我写出了下面的脚本(部分代码是从网上复制的)。
var request = require('request');
var cheerio = require('cheerio');
var URL = require('url-parse');
// Entry point of the crawl and the font name searched for on each page.
var START_URL = "http://example.com/";
var SEARCH_FONT = "helvetica";
// Upper bound on the number of pages requested before crawl() gives up.
var MAX_PAGES_TO_VISIT = 100000;
// Crawl state: URLs already requested (object used as a set), how many of
// them there are, and the URLs still waiting to be requested.
var pagesVisited = {};
var numPagesVisited = 0;
var pagesToVisit = [];
var url = new URL(START_URL);
// `host` keeps an explicit port (e.g. "localhost:8080"); `hostname` would drop
// it, so every followed link would point at the default port instead.
var baseUrl = url.protocol + "//" + url.host;
pagesToVisit.push(START_URL);
crawl();
/**
 * Visits the next pending URL that has not been visited yet.
 *
 * Stops without visiting anything when MAX_PAGES_TO_VISIT has been reached or
 * when pagesToVisit is empty. Passes itself to visitPage() as the continuation
 * so the crawl goes on after each page.
 */
function crawl() {
  if (numPagesVisited >= MAX_PAGES_TO_VISIT) {
    console.log("Reached max limit of number of pages to visit.");
    return;
  }
  // Skip already-visited URLs with a loop rather than recursion: a long run of
  // duplicates in the queue must not grow the call stack.
  while (pagesToVisit.length > 0) {
    var nextPage = pagesToVisit.pop();
    if (!(nextPage in pagesVisited)) {
      // New page we haven't visited
      visitPage(nextPage, crawl);
      return;
    }
  }
  // Queue exhausted: popping again would yield undefined and request an
  // invalid URL, so the crawl ends here.
  console.log("No more pages to visit.");
}
/**
 * Requests one page, checks it for SEARCH_FONT and, when the font is not
 * found, queues the page's internal links and continues the crawl.
 *
 * @param {string} url - URL of the page to request.
 * @param {Function} callback - Called with no arguments to continue crawling.
 *     It is not called when the font is found, which ends the crawl.
 */
function visitPage(url, callback) {
  // Add page to our set
  pagesVisited[url] = true;
  numPagesVisited++;
  // Make the request
  console.log("Visiting page " + url);
  request(url, function(error, response, body) {
    // When the request itself fails there is no response object, so reading
    // response.statusCode would throw; skip this page and keep crawling.
    if (error || !response) {
      console.log("Request failed for " + url + ": " + (error ? error.message : "no response"));
      callback();
      return;
    }
    // Check status code (200 is HTTP OK)
    console.log("Status code: " + response.statusCode);
    if (response.statusCode !== 200) {
      callback();
      return;
    }
    // Parse the document body
    var $ = cheerio.load(body);
    var helveticaFound = searchForHelvetica($, SEARCH_FONT);
    if (helveticaFound) {
      console.log('Word ' + SEARCH_FONT + ' found at page ' + url);
    } else {
      collectInternalLinks($);
      // In this short program, our callback is just calling crawl()
      callback();
    }
  });
}
/**
 * Logs the value of .css('fontFamily') for every element matched by $('*').
 *
 * Nothing is returned, and the `word` argument is not used.
 */
function searchForHelvetica($, word) {
  var allElements = $('*');
  allElements.each(function(i, e) {
    console.log($(e).css('fontFamily'));
  });
}
/**
 * Queues every same-site link found on the page.
 *
 * Anchors whose href starts with "/" are treated as internal and pushed onto
 * pagesToVisit prefixed with baseUrl.
 *
 * @param {Function} $ - Selector function for the loaded page, as returned by
 *     cheerio.load().
 */
function collectInternalLinks($) {
  var internalPaths = [];
  $("a[href^='/']").each(function() {
    var href = $(this).attr('href');
    // The selector also matches protocol-relative links ("//other-host/path").
    // Those point at another host and would produce a bogus URL when
    // appended to baseUrl, so leave them out.
    if (href.indexOf('//') !== 0) {
      internalPaths.push(href);
    }
  });
  console.log("Found " + internalPaths.length + " relative links on page");
  internalPaths.forEach(function(path) {
    pagesToVisit.push(baseUrl + path);
  });
}
我面临的问题在以下函数中:
/**
 * Logs the value of .css('fontFamily') for every element matched by $('*').
 *
 * Nothing is returned, and the `word` argument is not used.
 */
function searchForHelvetica($, word) {
  var allElements = $('*');
  allElements.each(function(i, e) {
    console.log($(e).css('fontFamily'));
  });
}
其中 console.log($(e).css('fontFamily')) 这一行
始终输出 undefined。
我认为 cheerio 无法访问元素的 CSS。我该如何解决这个问题?我怎样才能获取页面上每个元素的 CSS,
遍历每一个元素,检查页面上是否有任何地方使用了 helvetica 字体,
并根据检查结果返回 true 或 false?
你需要使用 getComputedStyle,
参见 https://developer.mozilla.org/en-US/docs/Web/API/Window/getComputedStyle
用法类似于:
// Read the computed font-family of `node` (a DOM element) as a string.
window.getComputedStyle(node, null).getPropertyValue('font-family');
它会返回一个字符串(例如 'Arial, "Helvetica Neue", Helvetica, sans-serif'),
你可以在这个字符串中搜索所需的字体。
鉴于你的例子,我认为你可以这样做:
/**
 * Reports whether any element on the page has `word` in its computed
 * font-family.
 *
 * Relies on a global `window` that provides getComputedStyle and on $(e)[0]
 * being a node that getComputedStyle accepts (e.g. a browser or a jsdom
 * window; plain cheerio does not provide this).
 *
 * @param {Function} $ - jQuery-style selector function for the page.
 * @param {string} word - Font name to look for; matched case-insensitively.
 * @returns {boolean} true if at least one element uses the font, else false.
 */
function searchForHelvetica($, word) {
  var needle = String(word).toLowerCase();
  var found = false;
  $('*').each(function(i, e) {
    var fontFamily = window.getComputedStyle($(e)[0], null).getPropertyValue('font-family');
    // Computed values keep the stylesheet's casing ("Helvetica Neue"), so
    // compare in lower case.
    if (fontFamily && fontFamily.toLowerCase().indexOf(needle) !== -1) {
      found = true;
      // Returning false from the callback stops .each() early.
      return false;
    }
  });
  return found;
}
注:
正如 cheerio README 中明确指出的那样:
Cheerio parses markup and provides an API for traversing/manipulating the resulting data structure. It does not interpret the result as a web browser does. Specifically, it does not produce a visual rendering, apply CSS, load external resources, or execute JavaScript. If your use case requires any of this functionality, you should consider projects like PhantomJS or JSDom.
也就是说,cheerio 不会渲染页面,也不会应用 CSS。
因此你应该改用 jsdom (https://github.com/jsdom/jsdom),因为它支持 getComputedStyle。
我正在尝试编写一个脚本:给定一个站点的 URL,它会抓取该站点的所有链接,并检查每个链接对应的页面上是否使用了 helvetica 字体。
为此我写出了下面的脚本(部分代码是从网上复制的)。
var request = require('request');
var cheerio = require('cheerio');
var URL = require('url-parse');
// Entry point of the crawl and the font name searched for on each page.
var START_URL = "http://example.com/";
var SEARCH_FONT = "helvetica";
// Upper bound on the number of pages requested before crawl() gives up.
var MAX_PAGES_TO_VISIT = 100000;
// Crawl state: URLs already requested (object used as a set), how many of
// them there are, and the URLs still waiting to be requested.
var pagesVisited = {};
var numPagesVisited = 0;
var pagesToVisit = [];
var url = new URL(START_URL);
// `host` keeps an explicit port (e.g. "localhost:8080"); `hostname` would drop
// it, so every followed link would point at the default port instead.
var baseUrl = url.protocol + "//" + url.host;
pagesToVisit.push(START_URL);
crawl();
/**
 * Visits the next pending URL that has not been visited yet.
 *
 * Stops without visiting anything when MAX_PAGES_TO_VISIT has been reached or
 * when pagesToVisit is empty. Passes itself to visitPage() as the continuation
 * so the crawl goes on after each page.
 */
function crawl() {
  if (numPagesVisited >= MAX_PAGES_TO_VISIT) {
    console.log("Reached max limit of number of pages to visit.");
    return;
  }
  // Skip already-visited URLs with a loop rather than recursion: a long run of
  // duplicates in the queue must not grow the call stack.
  while (pagesToVisit.length > 0) {
    var nextPage = pagesToVisit.pop();
    if (!(nextPage in pagesVisited)) {
      // New page we haven't visited
      visitPage(nextPage, crawl);
      return;
    }
  }
  // Queue exhausted: popping again would yield undefined and request an
  // invalid URL, so the crawl ends here.
  console.log("No more pages to visit.");
}
/**
 * Requests one page, checks it for SEARCH_FONT and, when the font is not
 * found, queues the page's internal links and continues the crawl.
 *
 * @param {string} url - URL of the page to request.
 * @param {Function} callback - Called with no arguments to continue crawling.
 *     It is not called when the font is found, which ends the crawl.
 */
function visitPage(url, callback) {
  // Add page to our set
  pagesVisited[url] = true;
  numPagesVisited++;
  // Make the request
  console.log("Visiting page " + url);
  request(url, function(error, response, body) {
    // When the request itself fails there is no response object, so reading
    // response.statusCode would throw; skip this page and keep crawling.
    if (error || !response) {
      console.log("Request failed for " + url + ": " + (error ? error.message : "no response"));
      callback();
      return;
    }
    // Check status code (200 is HTTP OK)
    console.log("Status code: " + response.statusCode);
    if (response.statusCode !== 200) {
      callback();
      return;
    }
    // Parse the document body
    var $ = cheerio.load(body);
    var helveticaFound = searchForHelvetica($, SEARCH_FONT);
    if (helveticaFound) {
      console.log('Word ' + SEARCH_FONT + ' found at page ' + url);
    } else {
      collectInternalLinks($);
      // In this short program, our callback is just calling crawl()
      callback();
    }
  });
}
/**
 * Logs the value of .css('fontFamily') for every element matched by $('*').
 *
 * Nothing is returned, and the `word` argument is not used.
 */
function searchForHelvetica($, word) {
  var allElements = $('*');
  allElements.each(function(i, e) {
    console.log($(e).css('fontFamily'));
  });
}
/**
 * Queues every same-site link found on the page.
 *
 * Anchors whose href starts with "/" are treated as internal and pushed onto
 * pagesToVisit prefixed with baseUrl.
 *
 * @param {Function} $ - Selector function for the loaded page, as returned by
 *     cheerio.load().
 */
function collectInternalLinks($) {
  var internalPaths = [];
  $("a[href^='/']").each(function() {
    var href = $(this).attr('href');
    // The selector also matches protocol-relative links ("//other-host/path").
    // Those point at another host and would produce a bogus URL when
    // appended to baseUrl, so leave them out.
    if (href.indexOf('//') !== 0) {
      internalPaths.push(href);
    }
  });
  console.log("Found " + internalPaths.length + " relative links on page");
  internalPaths.forEach(function(path) {
    pagesToVisit.push(baseUrl + path);
  });
}
我面临的问题在以下函数中:
/**
 * Logs the value of .css('fontFamily') for every element matched by $('*').
 *
 * Nothing is returned, and the `word` argument is not used.
 */
function searchForHelvetica($, word) {
  var allElements = $('*');
  allElements.each(function(i, e) {
    console.log($(e).css('fontFamily'));
  });
}
其中 console.log($(e).css('fontFamily')) 这一行
始终输出 undefined。
我认为 cheerio 无法访问元素的 CSS。我该如何解决这个问题?我怎样才能获取页面上每个元素的 CSS,
遍历每一个元素,检查页面上是否有任何地方使用了 helvetica 字体,
并根据检查结果返回 true 或 false?
你需要使用 getComputedStyle,
参见 https://developer.mozilla.org/en-US/docs/Web/API/Window/getComputedStyle
用法类似于:
// Read the computed font-family of `node` (a DOM element) as a string.
window.getComputedStyle(node, null).getPropertyValue('font-family');
它会返回一个字符串(例如 'Arial, "Helvetica Neue", Helvetica, sans-serif'),
你可以在这个字符串中搜索所需的字体。
鉴于你的例子,我认为你可以这样做:
/**
 * Reports whether any element on the page has `word` in its computed
 * font-family.
 *
 * Relies on a global `window` that provides getComputedStyle and on $(e)[0]
 * being a node that getComputedStyle accepts (e.g. a browser or a jsdom
 * window; plain cheerio does not provide this).
 *
 * @param {Function} $ - jQuery-style selector function for the page.
 * @param {string} word - Font name to look for; matched case-insensitively.
 * @returns {boolean} true if at least one element uses the font, else false.
 */
function searchForHelvetica($, word) {
  var needle = String(word).toLowerCase();
  var found = false;
  $('*').each(function(i, e) {
    var fontFamily = window.getComputedStyle($(e)[0], null).getPropertyValue('font-family');
    // Computed values keep the stylesheet's casing ("Helvetica Neue"), so
    // compare in lower case.
    if (fontFamily && fontFamily.toLowerCase().indexOf(needle) !== -1) {
      found = true;
      // Returning false from the callback stops .each() early.
      return false;
    }
  });
  return found;
}
注:
正如 cheerio README 中明确指出的那样:
Cheerio parses markup and provides an API for traversing/manipulating the resulting data structure. It does not interpret the result as a web browser does. Specifically, it does not produce a visual rendering, apply CSS, load external resources, or execute JavaScript. If your use case requires any of this functionality, you should consider projects like PhantomJS or JSDom.
也就是说,cheerio 不会渲染页面,也不会应用 CSS。
因此你应该改用 jsdom (https://github.com/jsdom/jsdom),因为它支持 getComputedStyle。