使用 Puppeteer 抓取 Google 地图时滚动不起作用

Scrolling not working while scraping Google Maps using Puppeteer

我正在抓取 Google 地图上的地点数据,但它只返回用户评论的前 10 条结果,之后的评论都没有返回。我认为滚动功能存在一些问题。

const puppeteer = require('puppeteer');

/**
 * Reads the text of every element matching `.MyEned span.wiI7pd` that is
 * currently present in the document.
 *
 * This function is handed to `page.evaluate`, so it runs in the page and
 * must not rely on anything defined outside its own body.
 *
 * @returns {string[]} The `innerText` of each matched element, in DOM order.
 */
function extractItems() {
  const matchedNodes = document.querySelectorAll('.MyEned span.wiI7pd');
  return Array.from(matchedNodes, (node) => node.innerText);
}

/**
 * Repeatedly extracts items from the page and tries to scroll for more until
 * at least `itemCount` items have been collected.
 *
 * NOTE(review): this is the version that only yields the first batch of
 * results; the inline notes below point at the likely reasons.
 *
 * @param {object} page - Puppeteer page to drive.
 * @param {Function} extractItems - Evaluated in the page; returns the items found so far.
 * @param {number} itemCount - Loop ends once this many items were extracted.
 * @param {number} [scrollDelay=2000] - Milliseconds to wait after each scroll.
 * @returns {Promise<Array>} The items from the last successful extraction.
 */
async function scrapeItems(
  page,
  extractItems,
  itemCount,
  scrollDelay = 2000,
) {
  let items = [];
  try {
    let previousHeight;
    while (items.length < itemCount) {
      items = await page.evaluate(extractItems);
      // Intended to read the scroller's height.
      // NOTE(review): a string passed to page.evaluate is run as a JavaScript
      // expression, not resolved as a CSS selector, so this is a property
      // access on an identifier named `div`. Presumably that throws in the
      // page (no such global) and control jumps to the catch below.
      previousHeight = await page.evaluate('div.m6QErb.DxyBCb.scrollHeight');
      // NOTE(review): same selector-as-expression problem; this also targets
      // the window rather than the scroller element.
      await page.evaluate('window.scrollTo(0, div.m6QErb.DxyBCb.scrollHeight)');
      // Intended to wait until the scroller has grown past its previous height.
      await page.waitForFunction(`div.m6QErb.DxyBCb.scrollHeight > ${previousHeight}`);
      await page.waitForTimeout(scrollDelay);
    }
  // Any error raised inside the loop is discarded here, so the caller just
  // receives whatever `items` held at that point, with no diagnostic.
  } catch(e) { }
  return items;
}

// Entry point: launch a browser, open the reviews page, collect review texts
// and print them.
(async () => {
  // A fresh browser is launched directly. puppeteer.connect() is only for
  // attaching to an already-running browser and needs an endpoint to do so.
  const browser = await puppeteer.launch({
    headless: false,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  try {
    const [page] = await browser.pages();
    // setViewport is asynchronous; await it so the size is applied before navigating.
    await page.setViewport({ width: 1280, height: 926 });

    await page.goto('https://www.google.com/maps/place/Ace+Florist+%26+Flower+Delivery/@40.8265438,-73.5011026,15z/data=!4m7!3m6!1s0x0:0x9062074cae10c10f!8m2!3d40.8265438!4d-73.5011026!9m1!1b1');

    // Auto-scroll and extract desired items from the page. Currently set to extract 30 items.
    const items = await scrapeItems(page, extractItems, 30);

    console.log(items);
  } finally {
    // Close the browser even when navigation or scraping throws.
    await browser.close();
  }
})().catch((err) => {
  // Surface the failure instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});

此代码运行正常:

'use strict'

const puppeteer = require('puppeteer');
/**
 * Gathers the `innerText` of all elements that match `.MyEned span.wiI7pd`
 * at the moment it is called.
 *
 * Passed to `page.evaluate`, so it executes in the page and has to be
 * self-contained.
 *
 * @returns {string[]} One text entry per matched element, in DOM order.
 */
function extractItems() {
  return [...document.querySelectorAll('.MyEned span.wiI7pd')].map(
    ({ innerText }) => innerText,
  );
}
/**
 * Collects items from a lazily-loading scroll container: extract what is
 * present, scroll the container to its bottom, wait for it to grow, repeat.
 *
 * Best-effort: if scrolling or waiting fails (for example the container stops
 * growing because nothing more is available), the items gathered so far are
 * returned and the reason is logged rather than thrown.
 *
 * @param {object} page - Puppeteer page to drive.
 * @param {Function} extractItems - Evaluated in the page; returns all items currently present.
 * @param {number} itemCount - Stop as soon as at least this many items were extracted.
 * @param {number} [scrollDelay=2000] - Milliseconds to pause after each successful scroll.
 * @returns {Promise<Array>} The items from the last extraction.
 */
async function scrapeItems(
  page,
  extractItems,
  itemCount,
  scrollDelay = 2000,
) {
  // Selector of the scrollable container whose scrollHeight grows as more items load.
  const scrollerSelector = 'div.m6QErb.DxyBCb';
  let items = [];
  try {
    while (items.length < itemCount) {
      console.log(`items.length: ${items.length} itemCount: ${itemCount}`);

      items = await page.evaluate(extractItems);
      // Enough collected: skip the extra scroll, which could otherwise sit in
      // waitForFunction until it times out when no more content exists.
      if (items.length >= itemCount) {
        break;
      }

      // Values are passed as arguments instead of being spliced into script
      // strings, so the page functions stay plain, lintable code.
      const previousHeight = await page.evaluate(
        (selector) => document.querySelector(selector).scrollHeight,
        scrollerSelector,
      );
      await page.evaluate(
        (selector, top) => document.querySelector(selector).scrollTo(0, top),
        scrollerSelector,
        previousHeight,
      );
      await page.waitForFunction(
        (selector, height) => document.querySelector(selector).scrollHeight > height,
        {},
        scrollerSelector,
        previousHeight,
      );
      // Plain timer instead of page.waitForTimeout, which is not available in
      // every Puppeteer release.
      await new Promise((resolve) => setTimeout(resolve, scrollDelay));
    }
  } catch (err) {
    // Keep the partial result, but say why scrolling stopped.
    console.warn(`scrapeItems: stopped after ${items.length} item(s): ${err.message}`);
  }
  return items;
}


// Entry point: launch a browser, open the reviews page, collect review texts
// and print them.
(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  try {
    const [page] = await browser.pages();
    // setViewport is asynchronous; await it so the size is applied before navigating.
    await page.setViewport({ width: 1280, height: 926 });

    await page.goto('https://www.google.com/maps/place/Ace+Florist+%26+Flower+Delivery/@40.8265438,-73.5011026,15z/data=!4m7!3m6!1s0x0:0x9062074cae10c10f!8m2!3d40.8265438!4d-73.5011026!9m1!1b1');

    // Auto-scroll and extract desired items from the page. Currently set to extract 30 items.
    const items = await scrapeItems(page, extractItems, 30);

    console.log(items);
  } finally {
    // Close the browser even when navigation or scraping throws.
    await browser.close();
  }
})().catch((err) => {
  // Surface the failure instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});

后来我发现,在用 page.evaluate 获取滚动高度时,以及在检查滚动高度是否大于之前的高度时,都必须加上 document.querySelector。

    // One scroll step: extract, read the scroller height, scroll to it, then
    // wait until the scroller has grown.
    items = await page.evaluate(extractItems);
    // page.evaluate returns a promise; await it so previousHeight is the
    // scrollHeight value itself and interpolates correctly below.
    previousHeight = await page.evaluate('document.querySelector("div.m6QErb.DxyBCb").scrollHeight');
    await page.evaluate(`document.querySelector("div.m6QErb.DxyBCb").scrollTo(0, ${previousHeight})`);
    await page.waitForFunction(`document.querySelector("div.m6QErb.DxyBCb").scrollHeight > ${previousHeight}`);
    await page.waitForTimeout(scrollDelay);