Browserify 一个用 casperJS 编写的爬虫
Browserify a scraper written in casperJS
我正在尝试让这段代码在浏览器中运行。
scrape.html
<!doctype html>
<html>
<head>
<title> </title>
<style>
label {
margin-bottom: 2%;
}
div {
margin-bottom: 2%;
}
</style>
<script src = "../../AppData/Roaming/npm/node_modules/phantomjs/lib/phantomjs.js"></script>
<script src = "../../AppData/Roaming/npm/node_modules/casperjs/modules/casper.js"></script>
</head>
<body>
<form action="#" id = "form" method="get">
<label for="start">Start Page</label>
<div>
<input type = "number" name = "number1" value = "start"></input>
</div>
<label for="end">End Page</label>
<div>
<input type = "number" name = "number2" value = "end"></input>
</div>
<button onclick="myFunction()"> Submit </button>
</form>
<script>
/**
 * Click handler for the Submit button.
 *
 * Collects the values of the form controls (every control except the last
 * one), logs them, and then registers a step on the global `casper` object
 * that logs the text of `div.info-list-text`.
 */
function myFunction() {
  var form = document.getElementById("form");
  var values = [];
  var idx = 0;

  // Stop one short of form.length so the last control is not included.
  while (idx < form.length - 1) {
    values.push(form.elements[idx].value);
    idx += 1;
  }
  console.log(values);

  // `casper` is expected to already exist as a global; the line that would
  // create it is left disabled:
  //var casper = require('casper').create();
  casper.then(function () {
    var infoText = this.fetchText('div.info-list-text');
    console.log(infoText);
    // Both URLs are plain string literals and are not used afterwards.
    var startUrl = 'http://www.bedbathandbeyond.com/comm/c/Michigan/p/number1*3';
    var endUrl = 'http://www.bedbathandbeyond.com/comm/c/Michigan/p/number2*3';
  });
}
</script>
</body>
</html>
它产生以下错误,
casper.js:32 Uncaught ReferenceError: patchRequire is not defined
我认为出现这个错误,是因为在浏览器中不能像在 Node.js 中那样使用 require 导入模块。
为了让 require 在浏览器中可用,我在项目文件夹中安装了 browserify,并创建了以下 JS 文件。
browserReq.js
// Script setup: create the CasperJS instance, build the start URL and kick
// off the run.
var casper = require('casper').create();
// Path fragment appended to the listing URL below.
var url = 'ok,-MI'
var baseUrl = 'http://www.bedandbeyond.com/comm/c/'+url;
console.log(baseUrl);
// CSS selector for the "next page" link used by processPage.
var nextBtn = "a.navigation-button.next";
// Accumulates the href values gathered by processPage across pages.
var allLinks = [];
casper.start(baseUrl);
// processPage (defined below) is passed as the callback for when the
// next-page selector is found.
casper.waitForSelector(nextBtn, processPage);
casper.run();
/**
 * Gathers the links of the current page into `allLinks` and, when a
 * next-page button exists, clicks it and schedules itself again.
 *
 * Must be invoked with `this` bound to the casper instance.
 */
function processPage() {
  var hrefsOnPage = this.evaluate(getPageData);
  allLinks = allLinks.concat(hrefsOnPage);

  var hasNextPage = this.exists(nextBtn);
  if (hasNextPage) {
    this.thenClick(nextBtn)
      .then(function () {
        // Empty step kept between the click and the next processPage run.
      })
      .then(processPage);
  }
}
/**
 * Returns the `href` attribute of every element with class `pro-title`
 * in the current document, in document order.
 *
 * Written without closures over outer variables because it is handed to
 * `evaluate` by processPage.
 */
function getPageData() {
  var titleNodes = document.getElementsByClassName('pro-title');
  var hrefs = [];
  for (var idx = 0; idx < titleNodes.length; idx += 1) {
    hrefs.push(titleNodes[idx].getAttribute('href'));
  }
  return hrefs;
}
// Registers a step that opens every link collected in `allLinks` and dumps
// the fields scraped from each opened page.
casper.then(function () {
  this.each(allLinks, function (self, link) {
    this.thenOpen(link, function () {
      // Declared locally: the previous code assigned to an undeclared
      // `jsonObj`, creating an implicit global shared by every page callback.
      var jsonObj = {};
      jsonObj.title = this.fetchText('a.profile-full-name');
      jsonObj.services = this.getHTML('div.info-list-text span:nth-child(2) span');
      // Replace every "&" in the services markup with the word "and".
      jsonObj.services = jsonObj.services.replace(/&/g, "and");
      jsonObj.location = this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span');
      jsonObj.description = this.getHTML('div.profile-about div:nth-child(1)');
      require('utils').dump(jsonObj);
    });
  });
});
我使用 browserify browserReq.js -o browserReqOut.js -d 运行这个文件。
它给出以下错误:Cannot find module 'casper' from project folder location。
我已经在项目文件夹中和全局都安装了 casperJS。
更新 1:
我将 scrape.html 中表单元素的值提交(POST)到以下代码:
scrape.php
<?php
// Read the posted form fields, then echo them back into the page.
// NOTE(review): the scrape.html form shown in this document uses method="get"
// and has no "urlToScrape" or "proxy" fields - confirm which form posts here.
$url = $_POST["urlToScrape"]; ?><br>
<?php $page1 = $_POST["number1"]; ?> <br>
<?php $page2 = $_POST["number2"]; ?><br>
<?php /* Split the posted proxy text into one entry per line. */ $newProxyList = explode(PHP_EOL, $_POST['proxy']); ?> <br>
<?php /* NOTE(review): the values below are echoed without HTML escaping. */ echo $url ?> <br>
<?php echo $page1 ?> <br>
<?php echo $page2 ?> <br>
<?php /* Only the first proxy entry is printed. */ echo $newProxyList[0] ?> <br>
<?php
// Echoes the CasperJS scraper below into the response as an inline script
// element. PHP treats the JavaScript as plain string data and only prints it.
// NOTE(review): the surrounding text reports "require is not defined" when
// this page loads, i.e. the emitted script is being run by the browser.
echo "<script>
var casper = require('casper').create();
var baseUrl = 'http://www.houzz.com/professionals/c/Nashville,-TN';
console.log(baseUrl);
var nextBtn = 'a.navigation-button.next';
var allLinks = [];
casper.start(baseUrl);
casper.waitForSelector(nextBtn, processPage);
casper.run();
function processPage() {
var pageData = this.evaluate(getPageData);
allLinks = allLinks.concat(pageData);
if (!this.exists(nextBtn)) {
return;
}
this.thenClick(nextBtn).then(function() {
this.echo(this.getCurrentUrl());
//this.wait(1000);
}).then(processPage);
}
function getPageData(){
//return document.title;
var links = document.getElementsByClassName('pro-title');
links = Array.prototype.map.call(links,function(link){
return link.getAttribute('href');
});
return links;
}
casper.then(function(){
//require('utils').dump(allLinks);
this.each(allLinks,function(self,link){
this.thenOpen(link,function(a){
jsonObj = {};
jsonObj.title = this.fetchText('a.profile-full-name');
jsonObj.services = this.getHTML('div.info-list-text span:nth-child(2) span');
jsonObj.services = jsonObj.services.replace(/&/g,'and');
jsonObj.location = this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span');
//jsonObj.contact = this.fetchText('span.pro-contact-text');
jsonObj.description = this.getHTML('div.profile-about div:nth-child(1)');
//jsonObj.description.replace(/\s/g, '');
//require('utils').dump(jsonObj);
//jsonObj = JSON.stringify(jsonObj, null, '\t');
require('utils').dump(jsonObj);
});
});
});
</script>"
?>
</body>
</html>
它仍然给出同样的错误:Uncaught ReferenceError: require is not defined。
既然 PHP 是在服务器上执行的,并且 require 所需的模块在服务器上也可用,为什么还会出现这个错误?
PhantomJS 是一个完整的浏览器,它有自己的 API。CasperJS 使用这些 API 来完成工作。除非你在浏览器中用普通 JavaScript 实现完整的 PhantomJS API,否则你将无法对 CasperJS 使用 browserify。
我正在尝试让这段代码在浏览器中运行。
scrape.html
<!doctype html>
<html>
<head>
<title> </title>
<style>
label {
margin-bottom: 2%;
}
div {
margin-bottom: 2%;
}
</style>
<script src = "../../AppData/Roaming/npm/node_modules/phantomjs/lib/phantomjs.js"></script>
<script src = "../../AppData/Roaming/npm/node_modules/casperjs/modules/casper.js"></script>
</head>
<body>
<form action="#" id = "form" method="get">
<label for="start">Start Page</label>
<div>
<input type = "number" name = "number1" value = "start"></input>
</div>
<label for="end">End Page</label>
<div>
<input type = "number" name = "number2" value = "end"></input>
</div>
<button onclick="myFunction()"> Submit </button>
</form>
<script>
/**
 * Click handler for the Submit button.
 *
 * Collects the values of the form controls (every control except the last
 * one), logs them, and then registers a step on the global `casper` object
 * that logs the text of `div.info-list-text`.
 */
function myFunction() {
  var form = document.getElementById("form");
  var values = [];
  var idx = 0;

  // Stop one short of form.length so the last control is not included.
  while (idx < form.length - 1) {
    values.push(form.elements[idx].value);
    idx += 1;
  }
  console.log(values);

  // `casper` is expected to already exist as a global; the line that would
  // create it is left disabled:
  //var casper = require('casper').create();
  casper.then(function () {
    var infoText = this.fetchText('div.info-list-text');
    console.log(infoText);
    // Both URLs are plain string literals and are not used afterwards.
    var startUrl = 'http://www.bedbathandbeyond.com/comm/c/Michigan/p/number1*3';
    var endUrl = 'http://www.bedbathandbeyond.com/comm/c/Michigan/p/number2*3';
  });
}
</script>
</body>
</html>
它产生以下错误,
casper.js:32 Uncaught ReferenceError: patchRequire is not defined
我认为出现这个错误,是因为在浏览器中不能像在 Node.js 中那样使用 require 导入模块。
为了让 require 在浏览器中可用,我在项目文件夹中安装了 browserify,并创建了以下 JS 文件。
browserReq.js
// Script setup: create the CasperJS instance, build the start URL and kick
// off the run.
var casper = require('casper').create();
// Path fragment appended to the listing URL below.
var url = 'ok,-MI'
var baseUrl = 'http://www.bedandbeyond.com/comm/c/'+url;
console.log(baseUrl);
// CSS selector for the "next page" link used by processPage.
var nextBtn = "a.navigation-button.next";
// Accumulates the href values gathered by processPage across pages.
var allLinks = [];
casper.start(baseUrl);
// processPage (defined below) is passed as the callback for when the
// next-page selector is found.
casper.waitForSelector(nextBtn, processPage);
casper.run();
/**
 * Gathers the links of the current page into `allLinks` and, when a
 * next-page button exists, clicks it and schedules itself again.
 *
 * Must be invoked with `this` bound to the casper instance.
 */
function processPage() {
  var hrefsOnPage = this.evaluate(getPageData);
  allLinks = allLinks.concat(hrefsOnPage);

  var hasNextPage = this.exists(nextBtn);
  if (hasNextPage) {
    this.thenClick(nextBtn)
      .then(function () {
        // Empty step kept between the click and the next processPage run.
      })
      .then(processPage);
  }
}
/**
 * Returns the `href` attribute of every element with class `pro-title`
 * in the current document, in document order.
 *
 * Written without closures over outer variables because it is handed to
 * `evaluate` by processPage.
 */
function getPageData() {
  var titleNodes = document.getElementsByClassName('pro-title');
  var hrefs = [];
  for (var idx = 0; idx < titleNodes.length; idx += 1) {
    hrefs.push(titleNodes[idx].getAttribute('href'));
  }
  return hrefs;
}
// Registers a step that opens every link collected in `allLinks` and dumps
// the fields scraped from each opened page.
casper.then(function () {
  this.each(allLinks, function (self, link) {
    this.thenOpen(link, function () {
      // Declared locally: the previous code assigned to an undeclared
      // `jsonObj`, creating an implicit global shared by every page callback.
      var jsonObj = {};
      jsonObj.title = this.fetchText('a.profile-full-name');
      jsonObj.services = this.getHTML('div.info-list-text span:nth-child(2) span');
      // Replace every "&" in the services markup with the word "and".
      jsonObj.services = jsonObj.services.replace(/&/g, "and");
      jsonObj.location = this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span');
      jsonObj.description = this.getHTML('div.profile-about div:nth-child(1)');
      require('utils').dump(jsonObj);
    });
  });
});
我使用 browserify browserReq.js -o browserReqOut.js -d 运行这个文件。
它给出以下错误:Cannot find module 'casper' from project folder location。
我已经在项目文件夹中和全局都安装了 casperJS。
更新 1:
我将 scrape.html 中表单元素的值提交(POST)到以下代码:
scrape.php
<?php
// Read the posted form fields, then echo them back into the page.
// NOTE(review): the scrape.html form shown in this document uses method="get"
// and has no "urlToScrape" or "proxy" fields - confirm which form posts here.
$url = $_POST["urlToScrape"]; ?><br>
<?php $page1 = $_POST["number1"]; ?> <br>
<?php $page2 = $_POST["number2"]; ?><br>
<?php /* Split the posted proxy text into one entry per line. */ $newProxyList = explode(PHP_EOL, $_POST['proxy']); ?> <br>
<?php /* NOTE(review): the values below are echoed without HTML escaping. */ echo $url ?> <br>
<?php echo $page1 ?> <br>
<?php echo $page2 ?> <br>
<?php /* Only the first proxy entry is printed. */ echo $newProxyList[0] ?> <br>
<?php
// Echoes the CasperJS scraper below into the response as an inline script
// element. PHP treats the JavaScript as plain string data and only prints it.
// NOTE(review): the surrounding text reports "require is not defined" when
// this page loads, i.e. the emitted script is being run by the browser.
echo "<script>
var casper = require('casper').create();
var baseUrl = 'http://www.houzz.com/professionals/c/Nashville,-TN';
console.log(baseUrl);
var nextBtn = 'a.navigation-button.next';
var allLinks = [];
casper.start(baseUrl);
casper.waitForSelector(nextBtn, processPage);
casper.run();
function processPage() {
var pageData = this.evaluate(getPageData);
allLinks = allLinks.concat(pageData);
if (!this.exists(nextBtn)) {
return;
}
this.thenClick(nextBtn).then(function() {
this.echo(this.getCurrentUrl());
//this.wait(1000);
}).then(processPage);
}
function getPageData(){
//return document.title;
var links = document.getElementsByClassName('pro-title');
links = Array.prototype.map.call(links,function(link){
return link.getAttribute('href');
});
return links;
}
casper.then(function(){
//require('utils').dump(allLinks);
this.each(allLinks,function(self,link){
this.thenOpen(link,function(a){
jsonObj = {};
jsonObj.title = this.fetchText('a.profile-full-name');
jsonObj.services = this.getHTML('div.info-list-text span:nth-child(2) span');
jsonObj.services = jsonObj.services.replace(/&/g,'and');
jsonObj.location = this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span');
//jsonObj.contact = this.fetchText('span.pro-contact-text');
jsonObj.description = this.getHTML('div.profile-about div:nth-child(1)');
//jsonObj.description.replace(/\s/g, '');
//require('utils').dump(jsonObj);
//jsonObj = JSON.stringify(jsonObj, null, '\t');
require('utils').dump(jsonObj);
});
});
});
</script>"
?>
</body>
</html>
它仍然给出同样的错误:Uncaught ReferenceError: require is not defined。
既然 PHP 是在服务器上执行的,并且 require 所需的模块在服务器上也可用,为什么还会出现这个错误?
PhantomJS 是一个完整的浏览器,它有自己的 API。CasperJS 使用这些 API 来完成工作。除非你在浏览器中用普通 JavaScript 实现完整的 PhantomJS API,否则你将无法对 CasperJS 使用 browserify。