如何在 phantomjs 中创建实体?

How to create an entity in phantomjs?

我正在尝试为 Google 搜索结果编写抓取器。这是我写的:

var system = require('system');
var args = system.args;
var webPage = require('webpage');
var page = webPage.create();

var useragent = [];
useragent.push('Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25');
useragent.push('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0');
useragent.push('Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36 OPR/27.0.1689.76');
useragent.push('Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36');
useragent.push('Mozilla/5.0 (Windows NT 6.3; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0');

page.settings.userAgent = useragent[Math.floor(Math.random() * useragent.length)]; 

var request = "search?q=",
    newPage = "&start=",
    localInfo,
    depth;

var gUrl = 'http://google.com/',
    yaUrl = 'http://yahoo.com/',
    url = '';

var searchPages = [],
    links;

    request += args[1];
    url += gUrl + request;
    depth = args[2];

    function pageHandler(uri) {
        page.open(uri, function (status) {
            if (status === 'success') {

                page.injectJs('./libs/jquery-2.1.3.min.js');

                links = page.evaluate(function () {
                    return $("li.g h3 a").map(function () {
                        return this.href;
                    }).get();
                });
                localInfo = page.evaluate(function() {
                   return $("#swml_addr").text();
                });
                console.log(links.join('\n'));
                console.log(links.length);
                console.log(localInfo);
                setTimeout(nextPage, 1000);
            }
        });
    }

    function prepareSearchPages() {
        for (var numPage = 0; numPage < depth; numPage++) {
            url += newPage+10*numPage;
            searchPages.push(url);
            url = url.substr(0,url.indexOf(newPage));
        }
        nextPage();
    }

    var nextPage = function() {
        var file = searchPages.shift();
        if(!file) phantom.exit();
        pageHandler(file);
    };


    prepareSearchPages();

很可能所有人都认为它看起来很糟糕。但效果很好。所以,我决定为搜索引擎分配一个单独的实体

var webPage = require('webpage');
var page = webPage.create();


function searchEngine(engConfig) {
    var _engineUrl = engConfig.rootDomain;
    var _engineRequest = engConfig.requestPrefix;
    var _engineNewPage = engConfig.newPagePrefix;
    var _linkWrapperSelector = engConfig.linkWrapperSelector;
    var _locSelector = engConfig.locSelector;
    var _localInfo;
    var _searchPagesUrls = [];
    var _resultLinks;

    var pageHandler = function(uri) {
        page.open(uri, function (status) {
            if (status === 'success') {

                page.injectJs('./libs/jquery-2.1.3.min.js');

                _resultLinks = page.evaluate(function(_linkWrapperSelector) {
                    return $(_linkWrapperSelector).map(function () {
                        return this.href;
                    }).get();
                });
                _localInfo = page.evaluate(function(_locSelector) {
                    return $(_locSelector).text();
                });
                console.log(_resultLinks.join('\n'));
                console.log(_resultLinks.length);
                console.log(_localInfo);

                setTimeout(nextPage, 1000);
            }
        });
    };

    var nextPage = function() {
        var file = _searchPagesUrls.shift();
        if(!file) phantom.exit();
        pageHandler(file);
    };

    this.runSearch = function(keyPhrase, depthSearch) {
        var url = _engineUrl+_engineRequest+keyPhrase;
        for (var numPage = 0; numPage < depthSearch; numPage++) {
            url += _engineNewPage+10*numPage;
            _searchPagesUrls.push(url);
            url = url.substr(0,url.indexOf(_engineNewPage));
        }
        nextPage();
    };

    this.showLinks = function() {
        return _resultLinks.join('\n');
    };


}

var googleOptions = {
    rootDomain: 'http://google.ru/',
    requestPrefix: 'search?q=',
    newPagePrefix: '&start=',
    linkWrapperSelector: 'li.g h3 a',
    locSelector: '#swml_addr'
};

var useragent = [];
useragent.push('Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25');
useragent.push('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0');
useragent.push('Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36 OPR/27.0.1689.76');
useragent.push('Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36');
useragent.push('Mozilla/5.0 (Windows NT 6.3; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0');

page.settings.userAgent = useragent[Math.floor(Math.random() * useragent.length)];

var google = new searchEngine(googleOptions);

google.runSearch('Hello', 1);

不幸的是,它不起作用。我不明白为什么。也许我搞砸了范围界定。

P.S。此代码的第一个版本可以正常工作并在控制台中显示所有链接。第二版代码仅输出 0,但将 uri 传递给 pageHandler 函数是正确的。甚至不显示 'undefined' 或类似的内容。

page.evaluate() 是沙盒页面上下文。它无权访问在其外部定义的变量。您必须明确地将 _linkWrapperSelector 传递给它:

_resultLinks = page.evaluate(function(_linkWrapperSelector) {
    return $(_linkWrapperSelector).map(function () {
        return this.href;
    }).get();
}, _linkWrapperSelector); // this here

_locSelector也是如此:

_localInfo = page.evaluate(function(_locSelector) {
    return $(_locSelector).text();
}, _locSelector); // this here