Browserify 一个用 casperJS 编写的爬虫

Browserify a scraper written in casperJS

我正在尝试让这段代码在浏览器中运行。

scrape.html

<!doctype html>

<html>
    <head>
        <title> </title>
        <style>
            label {
                margin-bottom: 2%;
            }

            div {
                margin-bottom: 2%;
            }
        </style>
        <script src = "../../AppData/Roaming/npm/node_modules/phantomjs/lib/phantomjs.js"></script>
        <script src = "../../AppData/Roaming/npm/node_modules/casperjs/modules/casper.js"></script>
    </head>

    <body>
        <!-- Page-range form. myFunction() reads every control except the last
             one, so the two number inputs must stay ahead of the button. -->
        <form action="#" id="form" method="get">
            <label for="start">Start Page</label>
            <div>
                <!-- The id gives the label's "for" something to point at. A number
                     input cannot hold the text "start", so the hint is shown as a
                     placeholder instead of an (invalid) initial value. -->
                <input type="number" id="start" name="number1" placeholder="start">
            </div>
            <label for="end">End Page</label>
            <div>
                <input type="number" id="end" name="number2" placeholder="end">
            </div>
            <!-- type="button" stops the click from also submitting the form, which
                 would reload the page right after myFunction() runs. -->
            <button type="button" onclick="myFunction()"> Submit  </button>
        </form>
        <script> 
            /**
             * Click handler for the Submit button: logs the values of the form's
             * controls and queues one step on the global `casper` object.
             *
             * NOTE(review): `casper` is not declared in this function; it is
             * presumably expected to come from the casper.js script included in
             * the page head - confirm.
             */
            function myFunction() {
                var form = document.getElementById("form");
                var number = [];
                var idx = 0;
                // The final control is skipped (length - 1); on this page that is
                // the Submit button.
                while (idx < form.length - 1) {
                    number.push(form.elements[idx].value);
                    idx += 1;
                }
                console.log(number);

                casper.then(function(){
                    var infoText = this.fetchText('div.info-list-text');
                    console.log(infoText);

                    // NOTE(review): both URLs are built but never used, and the
                    // trailing number1*3 / number2*3 is literal text, not a
                    // computed page number.
                    var startUrl = 'http://www.bedbathandbeyond.com/comm/c/Michigan/p/number1*3';
                    var endUrl = 'http://www.bedbathandbeyond.com/comm/c/Michigan/p/number2*3';
                });
            }
        </script>
    </body>
</html>

它产生以下错误,

casper.js:32 Uncaught ReferenceError: patchRequire is not defined

我认为错误是因为我们不能像在 Node.js 中那样使用 require 在浏览器中导入模块。为了使此功能在浏览器中可用,我在我的项目文件夹中安装了 browserify 并创建了以下 JS 文件。

browserReq.js

// Create the CasperJS instance that drives the whole scrape.
var casper = require('casper').create();

// Path segment appended to the listing URL below.
var url = 'ok,-MI'
var baseUrl = 'http://www.bedandbeyond.com/comm/c/'+url;
console.log(baseUrl);

// CSS selector for the pagination link; processPage() clicks it until it is gone.
var nextBtn = "a.navigation-button.next";

// Links returned by getPageData(), accumulated across every page visited.
var allLinks = [];

// Open the first listing page.
casper.start(baseUrl);

// Once the pagination link is present, start collecting links page by page.
casper.waitForSelector(nextBtn, processPage);

// Execute the queued steps.
casper.run();

/**
 * Collects the links on the current page, then clicks the pagination link and
 * schedules itself again; stops once that link is no longer present.
 *
 * Expects `this` to provide evaluate/exists/thenClick (the casper instance
 * when run as a step). Adds the links found by getPageData() to `allLinks`.
 */
function processPage() {
  var scraper = this;
  var linksOnPage = scraper.evaluate(getPageData);
  allLinks = allLinks.concat(linksOnPage);

  if (scraper.exists(nextBtn)) {
    var afterClick = scraper.thenClick(nextBtn);
    var afterPause = afterClick.then(function() {
      // Intentionally empty step between the click and the next scrape.
    });
    afterPause.then(processPage);
  }
}

/**
 * Returns the href attribute of every element with class pro-title in the
 * current document, in document order.
 *
 * Handed to evaluate() by processPage(), so it must not rely on any variable
 * from the surrounding script.
 *
 * @returns {Array} one href value per matching element (empty if none match)
 */
function getPageData(){
  var titleNodes = document.getElementsByClassName('pro-title');
  var hrefs = [];
  for (var i = 0; i < titleNodes.length; i++) {
    hrefs.push(titleNodes[i].getAttribute('href'));
  }
  return hrefs;
}

// Final step: open every collected profile link and dump the details found there.
casper.then(function(){
  this.each(allLinks,function(self,link){
    this.thenOpen(link,function(){
      // `var` keeps the record local to this callback instead of creating an
      // implicit global that every profile page would share.
      var jsonObj = {};
      jsonObj.title = this.fetchText('a.profile-full-name');

      // The markup is read as HTML, so ampersands arrive entity-encoded;
      // spell them out as the word "and".
      jsonObj.services = this.getHTML('div.info-list-text span:nth-child(2) span');
      jsonObj.services = jsonObj.services.replace(/&amp;/g,"and");

      jsonObj.location = this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span');
      jsonObj.description = this.getHTML('div.profile-about div:nth-child(1)');

      require('utils').dump(jsonObj);
    });
  });
});

我使用以下命令运行这个文件:browserify browserReq.js -o browserReqOut.js -d

它给我以下错误,Cannot find module 'casper' from project folder location。我已经在项目文件夹和全局安装了 casperJS。

更新 1:

我将 scrape.html 中表单元素的值通过 POST 提交给以下代码:

scrape.php

<?php
// Read the submitted fields. A missing field falls back to '' instead of
// raising an undefined-index notice.
$url = isset($_POST["urlToScrape"]) ? $_POST["urlToScrape"] : '';
?><br>
<?php $page1 = isset($_POST["number1"]) ? $_POST["number1"] : ''; ?> <br>
<?php $page2 = isset($_POST["number2"]) ? $_POST["number2"] : ''; ?><br>
<?php
// One proxy per line. Split on any line ending rather than PHP_EOL: submitted
// multi-line text normally uses CRLF, which would otherwise leave a stray
// carriage return on each entry when PHP_EOL is a bare LF.
$newProxyList = preg_split('/\r\n|\r|\n/', isset($_POST['proxy']) ? $_POST['proxy'] : '');
?> <br>

<?php
// Everything echoed below is user input, so escape it before it goes into
// the page.
echo htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
?> <br>
<?php echo htmlspecialchars($page1, ENT_QUOTES, 'UTF-8'); ?> <br>
<?php echo htmlspecialchars($page2, ENT_QUOTES, 'UTF-8'); ?> <br>
<?php echo htmlspecialchars($newProxyList[0], ENT_QUOTES, 'UTF-8'); ?> <br>

<?php echo "<script> 

    // Create the CasperJS instance that drives the whole scrape.
    var casper = require('casper').create();

// Listing page the scrape starts from.
var baseUrl = 'http://www.houzz.com/professionals/c/Nashville,-TN';
console.log(baseUrl);

// CSS selector for the pagination link; processPage() clicks it until it is gone.
var nextBtn = 'a.navigation-button.next';

// Links returned by getPageData(), accumulated across every page visited.
var allLinks = [];

// Open the first listing page.
casper.start(baseUrl);

// Once the pagination link is present, start collecting links page by page.
casper.waitForSelector(nextBtn, processPage);

// Execute the queued steps.
casper.run();

/**
 * Collects the links on the current page, then clicks the pagination link,
 * echoes the URL that was reached and schedules itself again; stops once the
 * pagination link is no longer present.
 *
 * Expects `this` to provide evaluate/exists/thenClick (the casper instance
 * when run as a step). Adds the links found by getPageData() to `allLinks`.
 */
function processPage() {
  var scraper = this;
  var linksOnPage = scraper.evaluate(getPageData);
  allLinks = allLinks.concat(linksOnPage);

  if (scraper.exists(nextBtn)) {
    var afterClick = scraper.thenClick(nextBtn);
    var afterEcho = afterClick.then(function() {
      var reachedUrl = this.getCurrentUrl();
      this.echo(reachedUrl);
    });
    afterEcho.then(processPage);
  }
}

/**
 * Returns the href attribute of every element with class pro-title in the
 * current document, in document order.
 *
 * Handed to evaluate() by processPage(), so it must not rely on any variable
 * from the surrounding script.
 *
 * @returns {Array} one href value per matching element (empty if none match)
 */
function getPageData(){
  var titleNodes = document.getElementsByClassName('pro-title');
  var hrefs = [];
  for (var i = 0; i < titleNodes.length; i++) {
    hrefs.push(titleNodes[i].getAttribute('href'));
  }
  return hrefs;
}

// Final step: open every collected profile link and dump the details found there.
casper.then(function(){
  this.each(allLinks,function(self,link){
    this.thenOpen(link,function(){
      // `var` keeps the record local to this callback instead of creating an
      // implicit global that every profile page would share.
      var jsonObj = {};
      jsonObj.title = this.fetchText('a.profile-full-name');

      // The markup is read as HTML, so ampersands arrive entity-encoded;
      // spell them out as the word and.
      jsonObj.services = this.getHTML('div.info-list-text span:nth-child(2) span');
      jsonObj.services = jsonObj.services.replace(/&amp;/g,'and');

      jsonObj.location = this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span');
      jsonObj.description = this.getHTML('div.profile-about div:nth-child(1)');

      require('utils').dump(jsonObj);
    });
  });
});

 </script>"
 ?>

</body>
</html>

它仍然给我同样的错误:Uncaught ReferenceError: require is not defined。既然 PHP 是在服务器上执行的,而且 require 模块在服务器上也可用,为什么还会出现这个错误?

PhantomJS 是一个完整的浏览器,它有自己的 API。CasperJS 正是通过这些 API 来完成工作的。除非你在浏览器中用普通 JavaScript 实现完整的 PhantomJS API,否则你将无法对 CasperJS 进行 browserify。