使用 JS 抓取带有 "load more" 按钮的页面

Scrape with "load more" button JS

我正在尝试抓取一个带有"加载更多"按钮的网站,但我无法在 Nightmare 中让递归函数正常工作。我的代码如下:

// Script setup: one Nightmare instance, created with `show: true`, is shared
// by every call to getThePage().
const Nightmare = require('nightmare');
const nightmare = Nightmare({
 show:true
});
// NOTE(review): `request` is required here but not referenced anywhere in the
// visible snippet.
const request = require('request');
const cheerio = require('cheerio');

// NOTE(review): `url` and `propertyArray` are declared but not used by the
// visible code; getThePage() hard-codes the same URL in its `.goto()` call.
let url = 'https://www.housers.com/es/proyectos/avanzado';
let propertyArray = [];

/**
 * Opens the project listing, clicks the `#loadMore` button once, reads the
 * inner HTML of the `.all-info` element and, if the `loadMore` check passes,
 * calls itself again.
 *
 * The promise chain is not returned from this function, so the `return result`
 * inside `.then` is not visible to whoever calls getThePage().
 */
var getThePage = function() {

     nightmare
      // Every call navigates to the listing again before clicking, so each
      // recursion starts from a freshly loaded page.
      .goto('https://www.housers.com/es/proyectos/avanzado')
      // Fixed pause before clicking (presumably milliseconds — confirm).
      .wait(1500)
      .click('#loadMore')
      .evaluate(() =>{
         // Runs against the page's `document`; only the markup inside
         // `.all-info` is handed back as `result`.
         return document.querySelector('.all-info').innerHTML;
       })
     // NOTE(review): `.end()` presumably shuts the Nightmare instance down once
     // the queued actions finish, which would leave the recursive call below
     // working with an ended instance — confirm against the Nightmare docs.
     .end()
     .then((result) => {
        // `result` is the `.all-info` fragment, so `#loadMore` is looked up
        // inside that fragment only, not in the whole page.
        let $ = cheerio.load(result);
        let loadMore = $('#loadMore')
        // NOTE(review): `$()` returns a selection object, which is presumably
        // truthy even when nothing matched; a `.length` check is probably what
        // was intended — confirm.
        if (loadMore) {
            getThePage();
        }
        return result
        })
       .catch((error) => {
        // Failures are logged and not rethrown.
        console.error('Search failed:', error);
        });
        }
    // Start the first run.
    getThePage()

不知道用这种方法或者其他方法能否实现?

如果只是要抓取表格 (table) 中的数据,则不需要使用 Nightmare。在浏览器开发者工具的"网络"选项卡中,您会看到页面调用了此端点:

https://www.housers.com/es/proyectos/avanzado/scroll

该请求还带有分页参数 (page) 和每页大小 (size);这里假设每页 200 条 (不确定这是否超出服务器的限制)。

然后你只需要解析 html 并将数据放入数组中:

const axios = require('axios');
const querystring = require('querystring');
const cheerio = require('cheerio');
const entities = require("entities");

// Paginated endpoint that getPrices() passes to doRequest().
const url = 'https://www.housers.com/es/proyectos/avanzado/scroll';

// Accumulates one array of cell texts per table row across all fetched pages.
const prices = [];

/**
 * POSTs the listing filter form to the paginated endpoint.
 *
 * @param {string} url - Endpoint URL without a query string.
 * @param {number} page - Page index, sent as the `page` query parameter.
 * @param {number} [size=200] - Rows requested per page, sent as the `size`
 *   query parameter. Defaults to the value that used to be hard-coded.
 * @returns {*} Whatever `axios.post` returns for the request (getPrices()
 *   awaits it and reads `.data`).
 */
function doRequest(url, page, size = 200){
  // Filter fields travel in the form-encoded body; pagination travels in the
  // query string.
  const form = querystring.stringify({
    word: "",
    country: "",
    type: "",
    order: "STOCK_PRICE_VARIATION",
    orderDirection: "DESC"
  });

  return axios.post(`${url}?page=${page}&size=${size}`, form);
}

/**
 * Fetches the listing page by page and appends its table rows to `prices`.
 *
 * Pages are requested sequentially starting at 0, and the loop stops at the
 * first response whose body is blank. Each `tr` contributes one array holding
 * the trimmed, entity-decoded text of its `td` children (a row without `td`
 * children contributes an empty array). When the loop ends, the collected
 * array and its length are logged.
 *
 * @returns {Promise<void>} Resolves once the last page has been processed.
 * @throws {TypeError} If a response body is not a string.
 */
async function getPrices() {
  // The parser options are identical for every page, so build them once.
  const parserOptions = {
    xmlMode: true,
    normalizeWhitespace: true,
    decodeEntities: true
  };

  for (let page = 0; ; page++) {
    //call API
    console.log("GET page n°" + page);
    const res = await doRequest(url, page);

    // Fail with a clear message instead of an opaque "trim is not a function".
    if (typeof res.data !== "string") {
      throw new TypeError(
        "Expected an HTML string for page " + page + ", got " + typeof res.data
      );
    }

    // A blank body marks the end of the listing; stop before parsing it.
    if (res.data.trim() === "") {
      break;
    }

    //parse HTML
    const $ = cheerio.load(res.data, parserOptions);

    //extract prices : one array of cell texts per table row
    // `.each` rather than `.map`: the callbacks run only for their side effects.
    $('tr').each(function(){
      const cells = [];
      $(this).children('td').each(function(){
        cells.push(entities.decodeHTML($(this).text().trim()));
      });
      prices.push(cells);
    });
  }

  console.log(prices);
  console.log("total length : " + prices.length);
}

// Run the scraper. Failures are reported and reflected in the exit code
// instead of being left as an unhandled promise rejection.
getPrices().catch(function(error){
  console.error("Scraping failed:", error);
  process.exitCode = 1;
});