从没有 class 或 id 的元素中抓取链接 - casperjs

Scrape link from element that has no class or id - casperjs

我目前正在尝试从这个网站抓取视频链接。这些链接位于轮播滑块内。a 标签没有 class 或 id,因此我不确定该如何定位它,以便 casperjs 可以读取 href 属性。我该如何抓取没有 class 或 id 属性的链接?

HTML

<!-- Video list excerpt: the only hook for a selector is the "videos" class on the <ul>. -->
<ul class="videos" >
    <li>
        <!-- The anchor itself has no class or id; its href is the value to scrape. -->
        <a href="http://www.test.com/article2/0,2817,2471677,00.asp">
            <img src="http://assets1.ignimgs.com/thumbs/compact.jpg">
            <span class="video-title">
                <div> Fujitsu ScanSnap iX100 </div>
            </span>
        </a>
    </li>
    <!-- The remaining list items are empty in this excerpt. -->
    <li>
    </li>
    <li>
    </li>
    <li>
    </li>
    <li>
    </li>
</ul>

JS

// CasperJS instance shared by every function in this script.
var casper = require('casper').create({});

// Pages to crawl, visited in array order.
var urls = ['http://www.test.com/'];

/**
 * Queue one CasperJS step per URL that opens the page and collects the
 * href of every matching anchor.
 *
 * This function only schedules work on the module-level `casper` instance;
 * the returned array is empty when the function returns and is filled in
 * as the queued steps execute.
 *
 * @param {string[]} x - URLs to open, in order.
 * @param {string} [selector] - CSS selector for the anchors to read.
 *     Defaults to 'a' (every anchor on the page); pass e.g.
 *     'ul.videos li a' to restrict the result to one list.
 * @returns {{pl: string[]}} Object whose `pl` array receives the hrefs.
 */
function linkScraper(x, selector) {
    var page_links = [];
    var linkSelector = selector || 'a';

    // Step body; CasperJS invokes it with the casper instance as `this`.
    function collectLinks() {
        // getElementsInfo raises when nothing matches the selector, so a
        // page without matching anchors is skipped instead of aborting.
        if (!this.exists(linkSelector)) {
            return;
        }
        this.getElementsInfo(linkSelector).forEach(function (element) {
            var href = element.attributes.href;
            // Skip anchors that carry no href attribute.
            if (href) {
                page_links.push(href);
            }
        });
    }

    for (var i = 0; i < x.length; i++) {
        casper.thenOpen(x[i], collectLinks);
    }

    return {
        pl: page_links
    };
}

//Crawl
/**
 * Schedule the crawl of `webpages` and, after it, a step that reports how
 * many links were collected.
 *
 * Must be called with the object providing `then` and `echo` (the casper
 * instance) as `this`.
 *
 * @param {string[]} webpages - URLs handed to linkScraper.
 */
function stringifyResult(webpages) {
    // linkScraper only schedules steps, so `pl` is still empty at this point;
    // it is read inside the later step, once it has been filled.
    var linksObj = linkScraper.call(this, webpages);
    this.then(function () {
        // Count the array entries. The length of the JSON string would be a
        // character count, not the number of links.
        this.echo(linksObj.pl.length + ' links found.');
    });
}

// Initialise the step queue, then announce the crawl as the first step.
casper.start();
casper.then(function() {
    this.echo("Fetching........");
});

// Queue the scrape/report steps first; stringifyResult returns nothing,
// so run() is started without a completion callback.
stringifyResult.call(casper, urls);
casper.run();

这是一个部分解决方案(我不了解 Casper)但想法是替换

this.getElementsInfo('a')

改为通过 class videos 获取 ul 本身。然后依次定位到 li,再到 a,最后取出链接。也就是说,不要直接抓取所有链接;先抓取您感兴趣、且可以通过 class 识别的列表元素,再从中提取链接元素。

您可以通过 ul.videos li a 这个 CSS 选择器获取所有需要的链接:

/**
 * Read the href attribute of every anchor matched by 'ul.videos li a'
 * in the current `document`.
 *
 * @returns {Array} The href attribute values, in match order.
 */
function getLinks() {
    var anchors = document.querySelectorAll('ul.videos li a');
    var hrefs = [];
    for (var i = 0; i < anchors.length; i++) {
        hrefs.push(anchors[i].getAttribute('href'));
    }
    return hrefs;
}

完整的工作示例:

// CasperJS instance that drives the example.
var casper = require('casper').create({});
// Assigned in a later step with the value returned by evaluating getLinks.
var links;

/**
 * Read the href attribute of every anchor matched by 'ul.videos li a'
 * in the current `document`.
 *
 * @returns {Array} The href attribute values, in match order.
 */
function getLinks() {
    var anchors = document.querySelectorAll('ul.videos li a');
    var hrefs = [];
    for (var i = 0; i < anchors.length; i++) {
        hrefs.push(anchors[i].getAttribute('href'));
    }
    return hrefs;
}

// Open the listing page as the first step.
casper.start('http://www.pcmag.com/video/latest');

// Evaluate getLinks and keep whatever it returns.
casper.then(function() {
    links = this.evaluate(getLinks);
});

// Completion callback: print the count, then one link per line, then exit.
casper.run(function() {
    var header = links.length + ' links found:';
    this.echo(header);
    var listing = ' - ' + links.join('\n - ');
    this.echo(listing).exit();
});

输出:

173 links found:
 - http://www.pcmag.com/article2/0,2817,2470070,00.asp
 - http://www.pcmag.com/article2/0,2817,2470084,00.asp
 - http://www.pcmag.com/article2/0,2817,2470087,00.asp
...
 - http://www.pcmag.com/article2/0,2817,2475543,00.asp
 - http://www.pcmag.com/article2/0,2817,2475409,00.asp
 - http://www.pcmag.com/article2/0,2817,2475359,00.asp

获取 span 文本的示例:

/**
 * Read the text content of every element matched by
 * 'ul.videos li span.video-title' in the current `document`.
 *
 * @returns {Array} The textContent values, in match order.
 */
function getSpanTexts() {
    var spans = document.querySelectorAll('ul.videos li span.video-title');
    var titles = [];
    for (var i = 0; i < spans.length; i++) {
        titles.push(spans[i].textContent);
    }
    return titles;
}