如何在 phantomjs 中创建实体?
How to create an entity in phantomjs?
我正在尝试为 Google 搜索结果编写抓取器。这是我写的:
// PhantomJS script setup: command-line arguments, the page object, and the
// pieces the search URLs are assembled from.
var system = require('system');
var args = system.args;
var webPage = require('webpage');
var page = webPage.create();

// Pool of browser identities; one entry is picked at random for this run.
var useragent = [
    'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36 OPR/27.0.1689.76',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0'
];
page.settings.userAgent = useragent[Math.floor(Math.random() * useragent.length)];

// URL fragments and run state read and written by the functions of this script.
var newPage = "&start=";
var localInfo;
var gUrl = 'http://google.com/';
var yaUrl = 'http://yahoo.com/';
var searchPages = [];
var links;

// args[1] is appended to the query string; args[2] bounds the number of pages.
var request = "search?q=" + args[1];
var url = gUrl + request;
var depth = args[2];
/**
 * Opens one search-result page, scrapes it, and schedules the next page.
 *
 * On a successful load, jQuery is injected into the page, the result links
 * and the location text are stored in the shared `links` / `localInfo`
 * variables and printed. On a failed load the problem is printed instead.
 * In both cases nextPage is scheduled, so the queue keeps draining.
 *
 * @param {string} uri - URL of the search-result page to open.
 */
function pageHandler(uri) {
    page.open(uri, function (status) {
        if (status === 'success') {
            page.injectJs('./libs/jquery-2.1.3.min.js');
            // The functions below run inside the loaded page; only their
            // return values come back to this script.
            links = page.evaluate(function () {
                return $("li.g h3 a").map(function () {
                    return this.href;
                }).get();
            });
            localInfo = page.evaluate(function () {
                return $("#swml_addr").text();
            });
            console.log(links.join('\n'));
            console.log(links.length);
            console.log(localInfo);
        } else {
            // A failed load used to do nothing at all, which left the script
            // running forever because nextPage was never called again.
            console.log('Failed to load ' + uri + ' (status: ' + status + ')');
        }
        // Pause one second before moving on to the next queued page.
        setTimeout(nextPage, 1000);
    });
}
/**
 * Fills the searchPages queue with one URL per result page (offsets 0, 10,
 * 20, ... up to `depth` pages) and then starts processing via nextPage.
 */
function prepareSearchPages() {
    var pageIndex = 0;
    while (pageIndex < depth) {
        var pagedUrl = url + newPage + 10 * pageIndex;
        searchPages.push(pagedUrl);
        // Strip the paging suffix again so the next iteration starts from the base URL.
        url = pagedUrl.slice(0, pagedUrl.indexOf(newPage));
        pageIndex += 1;
    }
    nextPage();
}
/**
 * Takes the next URL off the searchPages queue and hands it to pageHandler.
 * When the queue is empty, PhantomJS is asked to exit and nothing else runs.
 */
var nextPage = function () {
    var file = searchPages.shift();
    if (!file) {
        phantom.exit();
        // Requesting exit does not stop this function, so return explicitly;
        // otherwise pageHandler would be called with an undefined URL.
        return;
    }
    pageHandler(file);
};
// Entry point: build the queue of result-page URLs and start processing it.
prepareSearchPages();
很可能所有人都会觉得它看起来很糟糕,但它运行良好。所以,我决定把搜索引擎封装成一个单独的实体:
// Create the PhantomJS page object that searchEngine uses to load result pages.
var webPage = require('webpage');
var page = webPage.create();
/**
 * Constructor for a search-engine scraper that works through a queue of
 * result-page URLs using the shared `page` object.
 *
 * @param {Object} engConfig - Engine settings. Fields read here:
 *   rootDomain, requestPrefix, newPagePrefix (URL pieces),
 *   linkWrapperSelector (selector for result links) and
 *   locSelector (selector for the location text).
 */
function searchEngine(engConfig) {
var _engineUrl = engConfig.rootDomain;
var _engineRequest = engConfig.requestPrefix;
var _engineNewPage = engConfig.newPagePrefix;
var _linkWrapperSelector = engConfig.linkWrapperSelector;
var _locSelector = engConfig.locSelector;
var _localInfo;
var _searchPagesUrls = [];
var _resultLinks;
// Opens `uri`, injects jQuery, scrapes links and location text, prints them,
// then schedules nextPage one second later. Nothing happens on a failed load.
var pageHandler = function(uri) {
page.open(uri, function (status) {
if (status === 'success') {
page.injectJs('./libs/jquery-2.1.3.min.js');
// NOTE(review): no argument follows the function in this evaluate() call, so
// the parameter _linkWrapperSelector is undefined inside the page context;
// the outer variable of the same name is not visible there.
_resultLinks = page.evaluate(function(_linkWrapperSelector) {
return $(_linkWrapperSelector).map(function () {
return this.href;
}).get();
});
// NOTE(review): same situation as above for _locSelector.
_localInfo = page.evaluate(function(_locSelector) {
return $(_locSelector).text();
});
console.log(_resultLinks.join('\n'));
console.log(_resultLinks.length);
console.log(_localInfo);
setTimeout(nextPage, 1000);
}
});
};
// Takes the next queued URL and processes it; asks PhantomJS to exit when the
// queue is empty.
// NOTE(review): pageHandler(file) is still reached after phantom.exit() is
// requested — confirm whether a return is intended.
var nextPage = function() {
var file = _searchPagesUrls.shift();
if(!file) phantom.exit();
pageHandler(file);
};
// Queues `depthSearch` result-page URLs for `keyPhrase` (offsets 0, 10, 20, ...)
// and starts processing the queue.
this.runSearch = function(keyPhrase, depthSearch) {
var url = _engineUrl+_engineRequest+keyPhrase;
for (var numPage = 0; numPage < depthSearch; numPage++) {
url += _engineNewPage+10*numPage;
_searchPagesUrls.push(url);
// Cut the paging suffix off again so the next iteration starts from the base URL.
url = url.substr(0,url.indexOf(_engineNewPage));
}
nextPage();
};
// Returns the links from the most recently scraped page, one per line.
// _resultLinks is only assigned once a page has been scraped.
this.showLinks = function() {
return _resultLinks.join('\n');
};
}
// Settings passed to searchEngine: URL pieces plus the selectors to scrape.
var googleOptions = {
    rootDomain: 'http://google.ru/',
    requestPrefix: 'search?q=',
    newPagePrefix: '&start=',
    linkWrapperSelector: 'li.g h3 a',
    locSelector: '#swml_addr'
};

// Pool of browser identities; one entry is picked at random for this run.
var useragent = [
    'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36 OPR/27.0.1689.76',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0'
];
page.settings.userAgent = useragent[Math.floor(Math.random() * useragent.length)];

// Run a one-page search for 'Hello'.
var google = new searchEngine(googleOptions);
google.runSearch('Hello', 1);
不幸的是,它不起作用。我不明白为什么。也许我把作用域搞错了。
P.S. 此代码的第一个版本可以正常工作,并在控制台中显示所有链接。第二版代码仅输出 0,但传递给 pageHandler 函数的 uri 是正确的。甚至不显示 'undefined' 或类似的内容。
page.evaluate() 中的函数在沙盒化的页面上下文中执行。它无权访问在其外部定义的变量。您必须明确地将 _linkWrapperSelector 作为参数传递给它:
// Values listed after the function are handed to it as arguments inside the page.
_resultLinks = page.evaluate(function(_linkWrapperSelector) {
return $(_linkWrapperSelector).map(function () {
return this.href;
}).get();
}, _linkWrapperSelector); // forward the selector into the page context
_locSelector 也是如此:
// The selector is passed as an argument because the page context cannot see outer variables.
_localInfo = page.evaluate(function(_locSelector) {
return $(_locSelector).text();
}, _locSelector); // forward the selector into the page context
我正在尝试为 Google 搜索结果编写抓取器。这是我写的:
// PhantomJS script setup: command-line arguments, the page object, and the
// pieces the search URLs are assembled from.
var system = require('system');
var args = system.args;
var webPage = require('webpage');
var page = webPage.create();

// Pool of browser identities; one entry is picked at random for this run.
var useragent = [
    'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36 OPR/27.0.1689.76',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0'
];
page.settings.userAgent = useragent[Math.floor(Math.random() * useragent.length)];

// URL fragments and run state read and written by the functions of this script.
var newPage = "&start=";
var localInfo;
var gUrl = 'http://google.com/';
var yaUrl = 'http://yahoo.com/';
var searchPages = [];
var links;

// args[1] is appended to the query string; args[2] bounds the number of pages.
var request = "search?q=" + args[1];
var url = gUrl + request;
var depth = args[2];
/**
 * Opens one search-result page, scrapes it, and schedules the next page.
 *
 * On a successful load, jQuery is injected into the page, the result links
 * and the location text are stored in the shared `links` / `localInfo`
 * variables and printed. On a failed load the problem is printed instead.
 * In both cases nextPage is scheduled, so the queue keeps draining.
 *
 * @param {string} uri - URL of the search-result page to open.
 */
function pageHandler(uri) {
    page.open(uri, function (status) {
        if (status === 'success') {
            page.injectJs('./libs/jquery-2.1.3.min.js');
            // The functions below run inside the loaded page; only their
            // return values come back to this script.
            links = page.evaluate(function () {
                return $("li.g h3 a").map(function () {
                    return this.href;
                }).get();
            });
            localInfo = page.evaluate(function () {
                return $("#swml_addr").text();
            });
            console.log(links.join('\n'));
            console.log(links.length);
            console.log(localInfo);
        } else {
            // A failed load used to do nothing at all, which left the script
            // running forever because nextPage was never called again.
            console.log('Failed to load ' + uri + ' (status: ' + status + ')');
        }
        // Pause one second before moving on to the next queued page.
        setTimeout(nextPage, 1000);
    });
}
/**
 * Fills the searchPages queue with one URL per result page (offsets 0, 10,
 * 20, ... up to `depth` pages) and then starts processing via nextPage.
 */
function prepareSearchPages() {
    var pageIndex = 0;
    while (pageIndex < depth) {
        var pagedUrl = url + newPage + 10 * pageIndex;
        searchPages.push(pagedUrl);
        // Strip the paging suffix again so the next iteration starts from the base URL.
        url = pagedUrl.slice(0, pagedUrl.indexOf(newPage));
        pageIndex += 1;
    }
    nextPage();
}
/**
 * Takes the next URL off the searchPages queue and hands it to pageHandler.
 * When the queue is empty, PhantomJS is asked to exit and nothing else runs.
 */
var nextPage = function () {
    var file = searchPages.shift();
    if (!file) {
        phantom.exit();
        // Requesting exit does not stop this function, so return explicitly;
        // otherwise pageHandler would be called with an undefined URL.
        return;
    }
    pageHandler(file);
};
// Entry point: build the queue of result-page URLs and start processing it.
prepareSearchPages();
很可能所有人都会觉得它看起来很糟糕,但它运行良好。所以,我决定把搜索引擎封装成一个单独的实体:
// Create the PhantomJS page object that searchEngine uses to load result pages.
var webPage = require('webpage');
var page = webPage.create();
/**
 * Constructor for a search-engine scraper that works through a queue of
 * result-page URLs using the shared `page` object.
 *
 * @param {Object} engConfig - Engine settings. Fields read here:
 *   rootDomain, requestPrefix, newPagePrefix (URL pieces),
 *   linkWrapperSelector (selector for result links) and
 *   locSelector (selector for the location text).
 */
function searchEngine(engConfig) {
var _engineUrl = engConfig.rootDomain;
var _engineRequest = engConfig.requestPrefix;
var _engineNewPage = engConfig.newPagePrefix;
var _linkWrapperSelector = engConfig.linkWrapperSelector;
var _locSelector = engConfig.locSelector;
var _localInfo;
var _searchPagesUrls = [];
var _resultLinks;
// Opens `uri`, injects jQuery, scrapes links and location text, prints them,
// then schedules nextPage one second later. Nothing happens on a failed load.
var pageHandler = function(uri) {
page.open(uri, function (status) {
if (status === 'success') {
page.injectJs('./libs/jquery-2.1.3.min.js');
// NOTE(review): no argument follows the function in this evaluate() call, so
// the parameter _linkWrapperSelector is undefined inside the page context;
// the outer variable of the same name is not visible there.
_resultLinks = page.evaluate(function(_linkWrapperSelector) {
return $(_linkWrapperSelector).map(function () {
return this.href;
}).get();
});
// NOTE(review): same situation as above for _locSelector.
_localInfo = page.evaluate(function(_locSelector) {
return $(_locSelector).text();
});
console.log(_resultLinks.join('\n'));
console.log(_resultLinks.length);
console.log(_localInfo);
setTimeout(nextPage, 1000);
}
});
};
// Takes the next queued URL and processes it; asks PhantomJS to exit when the
// queue is empty.
// NOTE(review): pageHandler(file) is still reached after phantom.exit() is
// requested — confirm whether a return is intended.
var nextPage = function() {
var file = _searchPagesUrls.shift();
if(!file) phantom.exit();
pageHandler(file);
};
// Queues `depthSearch` result-page URLs for `keyPhrase` (offsets 0, 10, 20, ...)
// and starts processing the queue.
this.runSearch = function(keyPhrase, depthSearch) {
var url = _engineUrl+_engineRequest+keyPhrase;
for (var numPage = 0; numPage < depthSearch; numPage++) {
url += _engineNewPage+10*numPage;
_searchPagesUrls.push(url);
// Cut the paging suffix off again so the next iteration starts from the base URL.
url = url.substr(0,url.indexOf(_engineNewPage));
}
nextPage();
};
// Returns the links from the most recently scraped page, one per line.
// _resultLinks is only assigned once a page has been scraped.
this.showLinks = function() {
return _resultLinks.join('\n');
};
}
// Settings passed to searchEngine: URL pieces plus the selectors to scrape.
var googleOptions = {
    rootDomain: 'http://google.ru/',
    requestPrefix: 'search?q=',
    newPagePrefix: '&start=',
    linkWrapperSelector: 'li.g h3 a',
    locSelector: '#swml_addr'
};

// Pool of browser identities; one entry is picked at random for this run.
var useragent = [
    'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36 OPR/27.0.1689.76',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0'
];
page.settings.userAgent = useragent[Math.floor(Math.random() * useragent.length)];

// Run a one-page search for 'Hello'.
var google = new searchEngine(googleOptions);
google.runSearch('Hello', 1);
不幸的是,它不起作用。我不明白为什么。也许我把作用域搞错了。
P.S. 此代码的第一个版本可以正常工作,并在控制台中显示所有链接。第二版代码仅输出 0,但传递给 pageHandler 函数的 uri 是正确的。甚至不显示 'undefined' 或类似的内容。
page.evaluate() 中的函数在沙盒化的页面上下文中执行。它无权访问在其外部定义的变量。您必须明确地将 _linkWrapperSelector 作为参数传递给它:
// Values listed after the function are handed to it as arguments inside the page.
_resultLinks = page.evaluate(function(_linkWrapperSelector) {
return $(_linkWrapperSelector).map(function () {
return this.href;
}).get();
}, _linkWrapperSelector); // forward the selector into the page context
_locSelector 也是如此:
// The selector is passed as an argument because the page context cannot see outer variables.
_localInfo = page.evaluate(function(_locSelector) {
return $(_locSelector).text();
}, _locSelector); // forward the selector into the page context