从没有 class 或 id 的元素中抓取链接 - casperjs
Scrape link from element that has no class or id - casperjs
我目前正在尝试从该网站(SITE)中抓取视频链接。链接位于轮播滑块内,其中的 a 标签没有 class 或 id,因此我不确定该如何定位它们,以便 casperjs 可以检索 href 属性。我该如何抓取没有 class 或 id 属性的链接?
HTML
<ul class="videos" >
<li>
<a href="http://www.test.com/article2/0,2817,2471677,00.asp">
<img src="http://assets1.ignimgs.com/thumbs/compact.jpg">
<span class="video-title">
<div> Fujitsu ScanSnap iX100 </div>
</span>
</a>
</li>
<li>
</li>
<li>
</li>
<li>
</li>
<li>
</li>
</ul>
JS
// Shared Casper instance used by every function in this script.
var casper = require('casper').create({});
// Pages to crawl; each entry is opened in turn by linkScraper().
var urls = ['http://www.test.com/'];
/**
 * Queues Casper steps that open every URL in `x` and collect the `href`
 * attribute of each anchor found on the opened page.
 *
 * The work happens inside step callbacks, so the `pl` array is still empty
 * when this function returns and is filled in place as the callbacks run.
 * Read it from a step scheduled after this call.
 *
 * Relies on the script-level `casper` instance.
 *
 * @param {string[]} x - URLs of the pages to scrape.
 * @returns {{pl: string[]}} Holder whose `pl` array receives the hrefs.
 */
function linkScraper(x) {
    var page_links = [];
    x.forEach(function(url) {
        // The thenOpen() callback is the step that runs for this URL, so the
        // anchors are read directly here without another casper.then().
        casper.thenOpen(url, function() {
            this.getElementsInfo('a').forEach(function(element) {
                var href = element.attributes.href;
                // Skip anchors that have no href attribute.
                if (href) {
                    page_links.push(href);
                }
            });
        });
    });
    return {
        pl: page_links
    };
}
//Crawl
/**
 * Schedules scraping of `webpages` and a follow-up step that reports how
 * many links were collected.
 *
 * Must be invoked with the Casper instance as `this`, e.g.
 * `stringifyResult.call(casper, urls)`.
 *
 * @param {string[]} webpages - URLs forwarded to linkScraper().
 */
function stringifyResult(webpages) {
    // linkScraper() returns its result holder immediately; the list inside
    // is still empty at this point and is only read in the step below.
    var linksObj = linkScraper.call(this, webpages);
    this.then(function() {
        // Count the collected links themselves. The length of the
        // JSON-serialised text would be a character count, not a link count.
        this.echo(linksObj.pl.length + ' links found.');
    });
}
// Initialise Casper and announce that crawling is about to begin.
casper.start().then(function() {
    this.echo("Fetching........");
});
// stringifyResult() returns nothing; it is called for its side effect of
// queuing the scrape/report steps, and run() starts with no completion callback.
stringifyResult.call(casper, urls);
casper.run();
这是一个部分解决方案(我不熟悉 Casper),思路是:不要使用 this.getElementsInfo('a') 抓取页面上的所有链接,而是先通过 class videos 定位到 ul 元素本身,然后依次导航到 li、再到 a,最后取出链接。也就是说,不要直接抓取链接;先抓取您感兴趣且可以通过 class 识别的列表元素,再从中提取链接元素。
您可以通过 CSS 选择器 ul.videos li a 获取所有需要的链接:
/**
 * Collects the `href` attribute of every anchor matched by
 * `ul.videos li a` in the current document.
 *
 * Uses the global `document`, so it has to run where one is available.
 *
 * @returns {Array} One `getAttribute('href')` result per matched anchor,
 *     in document order.
 */
function getLinks() {
    var anchors = document.querySelectorAll('ul.videos li a');
    var hrefs = [];
    for (var i = 0; i < anchors.length; i++) {
        hrefs.push(anchors[i].getAttribute('href'));
    }
    return hrefs;
}
完整的工作示例:
// Casper instance driving the example.
var casper = require('casper').create({});
// Filled by the scraping step and read in the run() callback.
var links;
/**
 * Collects the `href` attribute of every anchor matched by
 * `ul.videos li a` in the current document.
 *
 * Reads the global `document`; in this example it is handed to
 * `casper.evaluate()`.
 *
 * @returns {Array} One `getAttribute('href')` result per matched anchor,
 *     in document order.
 */
function getLinks() {
    var anchors = document.querySelectorAll('ul.videos li a');
    var hrefs = [];
    for (var i = 0; i < anchors.length; i++) {
        hrefs.push(anchors[i].getAttribute('href'));
    }
    return hrefs;
}
// Open the listing page, then store the result of evaluating getLinks.
casper.start('http://www.pcmag.com/video/latest').then(function() {
    links = this.evaluate(getLinks);
});
// When all steps have finished, print the count and one link per line, then exit.
casper.run(function() {
    this.echo(links.length + ' links found:');
    this.echo(' - ' + links.join('\n - '));
    this.exit();
});
输出:
173 links found:
- http://www.pcmag.com/article2/0,2817,2470070,00.asp
- http://www.pcmag.com/article2/0,2817,2470084,00.asp
- http://www.pcmag.com/article2/0,2817,2470087,00.asp
...
- http://www.pcmag.com/article2/0,2817,2475543,00.asp
- http://www.pcmag.com/article2/0,2817,2475409,00.asp
- http://www.pcmag.com/article2/0,2817,2475359,00.asp
获取 span 文本的示例:
/**
 * Collects the text content of every element matched by
 * `ul.videos li span.video-title` in the current document.
 *
 * Reads the global `document`; presumably meant to be passed to
 * `casper.evaluate()` in the same way as getLinks (confirm with the caller).
 *
 * @returns {Array} The `textContent` of each matched span, in document order.
 */
function getSpanTexts() {
    var titleNodes = document.querySelectorAll('ul.videos li span.video-title');
    var titles = [];
    for (var i = 0; i < titleNodes.length; i++) {
        titles.push(titleNodes[i].textContent);
    }
    return titles;
}
我目前正在尝试从该网站(SITE)中抓取视频链接。链接位于轮播滑块内,其中的 a 标签没有 class 或 id,因此我不确定该如何定位它们,以便 casperjs 可以检索 href 属性。我该如何抓取没有 class 或 id 属性的链接?
HTML
<ul class="videos" >
<li>
<a href="http://www.test.com/article2/0,2817,2471677,00.asp">
<img src="http://assets1.ignimgs.com/thumbs/compact.jpg">
<span class="video-title">
<div> Fujitsu ScanSnap iX100 </div>
</span>
</a>
</li>
<li>
</li>
<li>
</li>
<li>
</li>
<li>
</li>
</ul>
JS
// Shared Casper instance used by every function in this script.
var casper = require('casper').create({});
// Pages to crawl; each entry is opened in turn by linkScraper().
var urls = ['http://www.test.com/'];
/**
 * Queues Casper steps that open every URL in `x` and collect the `href`
 * attribute of each anchor found on the opened page.
 *
 * The work happens inside step callbacks, so the `pl` array is still empty
 * when this function returns and is filled in place as the callbacks run.
 * Read it from a step scheduled after this call.
 *
 * Relies on the script-level `casper` instance.
 *
 * @param {string[]} x - URLs of the pages to scrape.
 * @returns {{pl: string[]}} Holder whose `pl` array receives the hrefs.
 */
function linkScraper(x) {
    var page_links = [];
    x.forEach(function(url) {
        // The thenOpen() callback is the step that runs for this URL, so the
        // anchors are read directly here without another casper.then().
        casper.thenOpen(url, function() {
            this.getElementsInfo('a').forEach(function(element) {
                var href = element.attributes.href;
                // Skip anchors that have no href attribute.
                if (href) {
                    page_links.push(href);
                }
            });
        });
    });
    return {
        pl: page_links
    };
}
//Crawl
/**
 * Schedules scraping of `webpages` and a follow-up step that reports how
 * many links were collected.
 *
 * Must be invoked with the Casper instance as `this`, e.g.
 * `stringifyResult.call(casper, urls)`.
 *
 * @param {string[]} webpages - URLs forwarded to linkScraper().
 */
function stringifyResult(webpages) {
    // linkScraper() returns its result holder immediately; the list inside
    // is still empty at this point and is only read in the step below.
    var linksObj = linkScraper.call(this, webpages);
    this.then(function() {
        // Count the collected links themselves. The length of the
        // JSON-serialised text would be a character count, not a link count.
        this.echo(linksObj.pl.length + ' links found.');
    });
}
// Initialise Casper and announce that crawling is about to begin.
casper.start().then(function() {
    this.echo("Fetching........");
});
// stringifyResult() returns nothing; it is called for its side effect of
// queuing the scrape/report steps, and run() starts with no completion callback.
stringifyResult.call(casper, urls);
casper.run();
这是一个部分解决方案(我不熟悉 Casper),思路是:不要使用 this.getElementsInfo('a') 抓取页面上的所有链接,而是先通过 class videos 定位到 ul 元素本身,然后依次导航到 li、再到 a,最后取出链接。也就是说,不要直接抓取链接;先抓取您感兴趣且可以通过 class 识别的列表元素,再从中提取链接元素。
您可以通过 CSS 选择器 ul.videos li a 获取所有需要的链接:
/**
 * Collects the `href` attribute of every anchor matched by
 * `ul.videos li a` in the current document.
 *
 * Uses the global `document`, so it has to run where one is available.
 *
 * @returns {Array} One `getAttribute('href')` result per matched anchor,
 *     in document order.
 */
function getLinks() {
    var anchors = document.querySelectorAll('ul.videos li a');
    var hrefs = [];
    for (var i = 0; i < anchors.length; i++) {
        hrefs.push(anchors[i].getAttribute('href'));
    }
    return hrefs;
}
完整的工作示例:
// Casper instance driving the example.
var casper = require('casper').create({});
// Filled by the scraping step and read in the run() callback.
var links;
/**
 * Collects the `href` attribute of every anchor matched by
 * `ul.videos li a` in the current document.
 *
 * Reads the global `document`; in this example it is handed to
 * `casper.evaluate()`.
 *
 * @returns {Array} One `getAttribute('href')` result per matched anchor,
 *     in document order.
 */
function getLinks() {
    var anchors = document.querySelectorAll('ul.videos li a');
    var hrefs = [];
    for (var i = 0; i < anchors.length; i++) {
        hrefs.push(anchors[i].getAttribute('href'));
    }
    return hrefs;
}
// Open the listing page, then store the result of evaluating getLinks.
casper.start('http://www.pcmag.com/video/latest').then(function() {
    links = this.evaluate(getLinks);
});
// When all steps have finished, print the count and one link per line, then exit.
casper.run(function() {
    this.echo(links.length + ' links found:');
    this.echo(' - ' + links.join('\n - '));
    this.exit();
});
输出:
173 links found:
- http://www.pcmag.com/article2/0,2817,2470070,00.asp
- http://www.pcmag.com/article2/0,2817,2470084,00.asp
- http://www.pcmag.com/article2/0,2817,2470087,00.asp
...
- http://www.pcmag.com/article2/0,2817,2475543,00.asp
- http://www.pcmag.com/article2/0,2817,2475409,00.asp
- http://www.pcmag.com/article2/0,2817,2475359,00.asp
获取 span 文本的示例:
/**
 * Collects the text content of every element matched by
 * `ul.videos li span.video-title` in the current document.
 *
 * Reads the global `document`; presumably meant to be passed to
 * `casper.evaluate()` in the same way as getLinks (confirm with the caller).
 *
 * @returns {Array} The `textContent` of each matched span, in document order.
 */
function getSpanTexts() {
    var titleNodes = document.querySelectorAll('ul.videos li span.video-title');
    var titles = [];
    for (var i = 0; i < titleNodes.length; i++) {
        titles.push(titleNodes[i].textContent);
    }
    return titles;
}