Casper 抓取在单个 javascript 页面上加载的多页

Casper scrape multipages that load on a single javascript based page

LoopLink 不允许将其网站/数据库中的数据抓取到其他数据库。这里只是出于学术目的,用于在动态加载的页面上测试无头浏览器。请不要滥用他们的数据。

基本上,我只是想从多个页面获取图像链接,但这些页面是通过 JavaScript 动态加载的。所以我最终尝试了基于 phantomjs 的无头浏览器 casperjs。

现在我的问题是函数 getThumbNails() 把相同的数据输出了两次。下面是输出日志,请注意以 '1' 开头的各行与以 '2' 开头的各行中的链接完全相同。

PuffMagicDragon@SuperDankLinux:~/WeB$ casperjs --web-security=no --cookies-file=/tmp/mycookies.txt 9Cas.js 
First Page Is Loaded
Second Page Is Loaded
1 http://x.lnimg.com/photo/thumb_480/40f175f108f5492b9cdec6486d753f8d.jpg
1 http://x.lnimg.com/photo/thumb_480/29081ed96a6349a08c27424ce3bd2842.jpg
1 http://x.lnimg.com/photo/thumb_480/29cd278e7cc34d9782d0a22782af2134.jpg
1 http://x.lnimg.com/photo/thumb_480/3979dc0f0987407bb9f825f2a0cb3fa9.jpg
1 http://x.lnimg.com/photo/thumb_480/dd06239abbf1433099ad3278607e5d7f.jpg
1 http://x.lnimg.com/photo/thumb_480/6e7c6b6076d5414b8ee59baed3dc3131.jpg
1 http://x.lnimg.com/photo/thumb_480/97027946bbf745a59d44ac1c3e9d22fe.jpg
1 http://x.lnimg.com/photo/thumb_480/396fd224e85f42aea7a10e1873ed627c.jpg
1 http://x.lnimg.com/photo/thumb_480/62f6afc3f50942388df9fe66e99a2ab4.jpg
1 http://x.lnimg.com/photo/thumb_480/09ba5b97da4e47b1a97bac86e125001d.jpg
1 http://x.lnimg.com/photo/thumb_480/580dcc66cd7f48d8aae8f583cd8e5e4b.jpg
1 http://x.lnimg.com/photo/thumb_480/e62f6d11449b41ff93e191f6045cb304.jpg
1 http://x.lnimg.com/photo/thumb_480/596230f07ebf471383991a99bd43420a.jpg
1 http://x.lnimg.com/photo/thumb_480/da63be695fed4617b594d19e4aa0dc7a.jpg
1 http://x.lnimg.com/photo/thumb_480/58f28244a2494c868006a97534e694e6.jpg
1 http://x.lnimg.com/photo/thumb_480/1b0246dfe4314404b32147061198cf80.jpg
1 http://x.lnimg.com/photo/thumb_480/b10a9bb8252f4d3d9638284f6f68fd6b.jpg
1 http://x.lnimg.com/photo/thumb_480/48bf185da5f74a369dd629dd183bc8b3.jpg
1 http://x.lnimg.com/photo/thumb_480/7008392330f847ecb3af7058ce0b2e3b.jpg
1 http://x.lnimg.com/photo/thumb_480/fcf5618fc7d04265a912cce9f7dec344.jpg
1 //x.lnimg.com/images/search/map/pinprofileclose2.gif
1 
1 //x.lnimg.com/images/search/map/pinprofile-ViewDetails.gif
1 http://ak.t0.tiles.virtualearth.net/tiles/r0?g=5289&mkt=en-us&lbl=l0&stl=h&shading=hill&n=z
1 http://ak.dynamic.t1.tiles.virtualearth.net/comp/ch/0231?mkt=en-us&it=G,L&shading=hill&og=132&n=z&key=AhfWRPlwZlVGewL9th5vsIklmzvMkUMm8wx6pz5nXzgDlYXliZFSr6lBvgHYCZQj
2 http://x.lnimg.com/photo/thumb_480/40f175f108f5492b9cdec6486d753f8d.jpg
2 http://x.lnimg.com/photo/thumb_480/29081ed96a6349a08c27424ce3bd2842.jpg
2 http://x.lnimg.com/photo/thumb_480/29cd278e7cc34d9782d0a22782af2134.jpg
2 http://x.lnimg.com/photo/thumb_480/3979dc0f0987407bb9f825f2a0cb3fa9.jpg
2 http://x.lnimg.com/photo/thumb_480/dd06239abbf1433099ad3278607e5d7f.jpg
2 http://x.lnimg.com/photo/thumb_480/6e7c6b6076d5414b8ee59baed3dc3131.jpg
2 http://x.lnimg.com/photo/thumb_480/97027946bbf745a59d44ac1c3e9d22fe.jpg
2 http://x.lnimg.com/photo/thumb_480/396fd224e85f42aea7a10e1873ed627c.jpg
2 http://x.lnimg.com/photo/thumb_480/62f6afc3f50942388df9fe66e99a2ab4.jpg
2 http://x.lnimg.com/photo/thumb_480/09ba5b97da4e47b1a97bac86e125001d.jpg
2 http://x.lnimg.com/photo/thumb_480/580dcc66cd7f48d8aae8f583cd8e5e4b.jpg
2 http://x.lnimg.com/photo/thumb_480/e62f6d11449b41ff93e191f6045cb304.jpg
2 http://x.lnimg.com/photo/thumb_480/596230f07ebf471383991a99bd43420a.jpg
2 http://x.lnimg.com/photo/thumb_480/da63be695fed4617b594d19e4aa0dc7a.jpg
2 http://x.lnimg.com/photo/thumb_480/58f28244a2494c868006a97534e694e6.jpg
2 http://x.lnimg.com/photo/thumb_480/1b0246dfe4314404b32147061198cf80.jpg
2 http://x.lnimg.com/photo/thumb_480/b10a9bb8252f4d3d9638284f6f68fd6b.jpg
2 http://x.lnimg.com/photo/thumb_480/48bf185da5f74a369dd629dd183bc8b3.jpg
2 http://x.lnimg.com/photo/thumb_480/7008392330f847ecb3af7058ce0b2e3b.jpg
2 http://x.lnimg.com/photo/thumb_480/fcf5618fc7d04265a912cce9f7dec344.jpg
2 //x.lnimg.com/images/search/map/pinprofileclose2.gif
2 
2 //x.lnimg.com/images/search/map/pinprofile-ViewDetails.gif
2 http://ak.t0.tiles.virtualearth.net/tiles/r0?g=5289&mkt=en-us&lbl=l0&stl=h&shading=hill&n=z
2 http://ak.dynamic.t1.tiles.virtualearth.net/comp/ch/0231?mkt=en-us&it=G,L&shading=hill&og=132&n=z&key=AhfWRPlwZlVGewL9th5vsIklmzvMkUMm8wx6pz5nXzgDlYXliZFSr6lBvgHYCZQj

这是文档中描述的 evaluate 工作方式:CasperJS Evaluate Diagram

这是我正在抓取的网站:http://looplink.ensemblere.com/SearchResults

我在 Linux 上通过 bash 运行该脚本:

casperjs --web-security=no --cookies-file=/tmp/mycookies.txt Script.js
// Create the Casper instance with verbose output enabled.
var casper = require('casper').create({ verbose: true });

// Image sources scraped from results page one and page two, respectively.
var thumbNails;
var thumbNails2;

function getThumbNails() {
    // Handed to casper.evaluate(), so it reads the page's own `document`.
    // Collects the raw `src` attribute of every <img> element, one entry per
    // element, in the order querySelectorAll reports them.
    var images = document.querySelectorAll('img');
    var sources = [];
    for (var idx = 0; idx < images.length; idx++) {
        sources.push(images[idx].getAttribute('src'));
    }
    return sources;
}

// Open the search results page that is being scraped.
casper.start('http://looplink.ensemblere.com/SearchResults');

// Scrape the image sources present on page one.
casper.then(function () {
    thumbNails = this.evaluate(getThumbNails);
});

// Reports which results page is showing. The presence of the "previous"
// paging button is used as the marker for being on page two.
function reportCurrentPage() {
    var hasPrevious = this.exists('a.searchPagingButton.pagingPreviousButton');
    this.echo(hasPrevious ? 'Second Page Is Loaded' : 'First Page Is Loaded');
}

// Confirm that we are on page one of the website.
casper.then(reportCurrentPage);

// Click the "next" paging button to go to page two.
casper.thenClick('a.searchPagingButton.pagingNextButton');

// Confirm that we are on page two.
casper.then(reportCurrentPage);

// Run the scraping function a second time.
// THIS IS THE ISSUE: the same data comes back twice, even though the check
// above reports that page two is showing.
casper.then(function () {
    thumbNails2 = this.evaluate(getThumbNails);
});

// Print both result sets, each line prefixed with the page it belongs to.
casper.run(function () {
    var key;
    for (key in thumbNails) {
        console.log('1 ' + thumbNails[key]);
    }
    for (key in thumbNails2) {
        console.log('2 ' + thumbNails2[key]);
    }
    this.done();
});

好的,感谢 Artjom B. 在评论中建议我通过截屏来确认页面是否已加载完成!

资源正在加载,即:still loading picture

通过添加

// Pause for a fixed 40000 ms, then run the callback, which logs a message.
// NOTE(review): `this` is assumed to be the casper instance, i.e. this
// snippet is meant to be placed inside a casper step callback.
this.wait(40000, function() {
    this.echo('Waited for 40 seconds');
});

casper 将等待 40 秒

要通过截图进行调试,只需调用 this.capture('/home/SuperDankDude/yourfilename.png')

我还注意到该网站的移动版本与全屏版本不同,所以我还必须设置 casper.options.viewportSize = {width: 1920, height: 1080};

// Create the Casper instance: verbose output, and a 1920x1080 viewport
// supplied as a construction option.
var casper = require('casper').create({
    verbose: true,
    viewportSize: { width: 1920, height: 1080 }
});

// Image sources scraped from results page one and page two, respectively.
var thumbNails;
var thumbNails2;

function getThumbNails() {
    // Handed to casper.evaluate(), so it reads the page's own `document`.
    // Produces one entry per <img> element: that element's `src` attribute.
    function readSrc(img) {
        return img.getAttribute('src');
    }
    return Array.prototype.map.call(document.querySelectorAll('img'), readSrc);
}

// Open the search results page that is being scraped.
casper.start('http://looplink.ensemblere.com/SearchResults');

// Scrape the image sources present on page one.
casper.then(function () {
    thumbNails = this.evaluate(getThumbNails);
});

// Confirm that we are on page one; the "previous" paging button is used as
// the marker for page two. On page one a screenshot is saved for debugging.
casper.then(function () {
    if (this.exists('a.searchPagingButton.pagingPreviousButton')) {
        this.echo('Second Page Is Loaded');
        return;
    }
    this.echo('First Page Is Loaded');
    this.capture("/home/votlon/WeB/firstpage.png");
});

// Click the "next" paging button to go to page two.
casper.thenClick('a.searchPagingButton.pagingNextButton');

// Confirm that we are on page two (the "previous" paging button is used as
// the marker), then give its content time to finish loading.
casper.then(function () {
    if (this.exists('a.searchPagingButton.pagingPreviousButton')) {
        this.echo('Second Page Is Loading');
        // wait() only queues the pause; statements placed after it in this
        // function would run immediately. The screenshot is therefore taken
        // inside the callback so that it shows the page after the wait.
        this.wait(40000, function () {
            this.echo('Waited for 40 seconds');
            this.capture("/home/votlon/WeB/secondpage.png");
        });
    } else {
        this.echo('First Page Is Loaded');
    }
});

// Run the scraping function a second time, after the preceding step has
// waited for page two to load.
casper.then(function () {
    thumbNails2 = this.evaluate(getThumbNails);
});

// Print both result sets, each line prefixed with the page it belongs to.
casper.run(function () {
    // Either variable is still unset if its scraping step did not assign a
    // list, so fall back to an empty one instead of failing on `.length`.
    var firstPage = thumbNails || [];
    var secondPage = thumbNails2 || [];
    var i;

    // Index loops rather than for...in: only the array elements are wanted.
    for (i = 0; i < firstPage.length; i++) {
        console.log('1 ' + firstPage[i]);
    }
    for (i = 0; i < secondPage.length; i++) {
        console.log('2 ' + secondPage[i]);
    }

    // NOTE(review): confirm that the casper instance actually provides
    // done(); the documented way to end a run from this callback is exit().
    this.done();
});