使用 puppeteer 抓取 google 地图时滚动不起作用
Scrolling not working while scraping google maps using puppeteer
我正在抓取 Google 地图的地点数据,但它只返回用户评论的前 10 条结果,之后的评论都没有返回。我认为滚动功能存在一些问题。
const puppeteer = require('puppeteer');
/**
 * Collects the text of every review snippet currently present in the page.
 * Reads the global `document`, so it is meant to run in the browser context.
 *
 * @returns {Array} The `innerText` of each element matching '.MyEned span.wiI7pd'.
 */
function extractItems() {
  const reviewNodes = document.querySelectorAll('.MyEned span.wiI7pd');
  return Array.from(reviewNodes, (node) => node.innerText);
}
/**
 * Question's original (non-working) scroll-and-collect loop.
 *
 * Repeatedly evaluates `extractItems` in the page and tries to scroll until at
 * least `itemCount` items have been extracted.
 *
 * @param {object} page - Object providing evaluate, waitForFunction and waitForTimeout.
 * @param {Function} extractItems - Function handed to page.evaluate; its result becomes `items`.
 * @param {number} itemCount - The loop stops once `items.length` reaches this value.
 * @param {number} [scrollDelay=2000] - Value passed to page.waitForTimeout after each scroll.
 * @returns {Promise<Array>} The result of the last completed extraction.
 */
async function scrapeItems(
page,
extractItems,
itemCount,
scrollDelay = 2000,
) {
let items = [];
try {
let previousHeight;
while (items.length < itemCount) {
items = await page.evaluate(extractItems);
// NOTE(review): this string is a CSS selector followed by `.scrollHeight`, but a
// string handed to page.evaluate is presumably run as a JavaScript expression, where
// `div` is not a defined identifier - the element needs document.querySelector(...).
previousHeight = await page.evaluate('div.m6QErb.DxyBCb.scrollHeight');//selector for scroller
// NOTE(review): same expression problem; this also scrolls `window` rather than the
// scroller element named above.
await page.evaluate('window.scrollTo(0, div.m6QErb.DxyBCb.scrollHeight)');
await page.waitForFunction(`div.m6QErb.DxyBCb.scrollHeight > ${previousHeight}`);
await page.waitForTimeout(scrollDelay);
}
// Any error thrown inside the loop is discarded here, so the function silently
// returns whatever the last successful extraction produced.
} catch(e) { }
return items;
}
// Entry point of the question's script: opens the Google Maps place page, collects
// items through scrapeItems, prints them and closes the browser.
(async () => {
// NOTE(review): connect() is called without options and its result is overwritten by
// launch() on the next statement - looks redundant; confirm and remove.
let browser = await puppeteer.connect();
browser = await puppeteer.launch({
headless: false,
args: ['--no-sandbox', '--disable-setuid-sandbox'],
});
const [page] = await browser.pages();
// NOTE(review): the result of setViewport is not awaited before navigating.
page.setViewport({ width: 1280, height: 926 });
await page.goto('https://www.google.com/maps/place/Ace+Florist+%26+Flower+Delivery/@40.8265438,-73.5011026,15z/data=!4m7!3m6!1s0x0:0x9062074cae10c10f!8m2!3d40.8265438!4d-73.5011026!9m1!1b1');
// Auto-scroll and extract desired items from the page. The call below requests 30 items.
const items = await scrapeItems(page, extractItems, 30);
console.log(items)
await browser.close();
})();
此代码运行正常:
'use strict'
const puppeteer = require('puppeteer');
/**
 * Returns the text of all review snippets found in the current document.
 * Intended to be evaluated inside the page, since it uses the global `document`.
 *
 * @returns {Array} One `innerText` value per element matching '.MyEned span.wiI7pd'.
 */
function extractItems() {
  const matches = document.querySelectorAll('.MyEned span.wiI7pd');
  return [...matches].map((match) => match.innerText);
}
/**
 * Scrolls the reviews container until at least `itemCount` items have been
 * extracted, or until scrolling stops producing new content.
 *
 * @param {object} page - Puppeteer page; only `evaluate` and `waitForFunction` are used.
 * @param {Function} extractItems - Evaluated in the page; returns the items found so far.
 * @param {number} itemCount - Minimum number of items to collect before stopping.
 * @param {number} [scrollDelay=2000] - Pause in milliseconds after each successful scroll.
 * @returns {Promise<Array>} Items from the most recent extraction; may hold more or
 *   fewer than `itemCount` entries.
 */
async function scrapeItems(
  page,
  extractItems,
  itemCount,
  scrollDelay = 2000,
) {
  // Element whose scrollHeight grows as more reviews are loaded.
  const scrollerSelector = 'div.m6QErb.DxyBCb';
  let items = [];
  try {
    while (items.length < itemCount) {
      console.log(`items.length: ${items.length} itemCount: ${itemCount}`);
      items = await page.evaluate(extractItems);
      // Pass functions plus arguments instead of interpolated strings so the
      // selector and the height are never re-parsed as source code.
      const previousHeight = await page.evaluate(
        (selector) => document.querySelector(selector).scrollHeight,
        scrollerSelector,
      );
      await page.evaluate(
        (selector, top) => document.querySelector(selector).scrollTo(0, top),
        scrollerSelector,
        previousHeight,
      );
      // Settles once the container has grown; rejects when nothing new loads in time.
      await page.waitForFunction(
        (selector, height) => document.querySelector(selector).scrollHeight > height,
        {},
        scrollerSelector,
        previousHeight,
      );
      // Plain timer: page.waitForTimeout is not available in newer Puppeteer releases.
      await new Promise((resolve) => setTimeout(resolve, scrollDelay));
    }
  } catch (err) {
    // A timeout is the expected way to run out of content; anything else is
    // reported, but the items collected so far are still returned.
    if (!err || err.name !== 'TimeoutError') {
      console.warn('scrapeItems stopped early:', err);
    }
  }
  return items;
}
// Entry point: launches a browser, opens the Google Maps place page, collects
// review texts through scrapeItems and prints them.
(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  try {
    const [page] = await browser.pages();
    // setViewport returns a promise; await it so the size is applied before navigating.
    await page.setViewport({ width: 1280, height: 926 });
    await page.goto('https://www.google.com/maps/place/Ace+Florist+%26+Flower+Delivery/@40.8265438,-73.5011026,15z/data=!4m7!3m6!1s0x0:0x9062074cae10c10f!8m2!3d40.8265438!4d-73.5011026!9m1!1b1');
    // Auto-scroll and extract items from the page; scraping stops once 30 are collected.
    const items = await scrapeItems(page, extractItems, 30);
    console.log(items);
  } finally {
    // Close the browser even when navigation or scraping throws.
    await browser.close();
  }
})();
后来我才发现:在求取滚动高度、以及检查滚动高度是否大于之前的高度时,都必须加上 document.querySelector。
// Loop body for scrapeItems: read the scroller's height, scroll to it, then wait
// until the height has grown.
items = await page.evaluate(extractItems);
// page.evaluate returns a promise - await it to get the numeric scrollHeight.
// The value is a plain number, so it is used directly rather than indexed with [0].
previousHeight = await page.evaluate('document.querySelector("div.m6QErb.DxyBCb").scrollHeight');
await page.evaluate(`document.querySelector("div.m6QErb.DxyBCb").scrollTo(0, ${previousHeight})`);
await page.waitForFunction(`document.querySelector("div.m6QErb.DxyBCb").scrollHeight > ${previousHeight}`);
await page.waitForTimeout(scrollDelay);
我正在抓取 Google 地图的地点数据,但它只返回用户评论的前 10 条结果,之后的评论都没有返回。我认为滚动功能存在一些问题。
const puppeteer = require('puppeteer');
/**
 * Collects the text of every review snippet currently present in the page.
 * Reads the global `document`, so it is meant to run in the browser context.
 *
 * @returns {Array} The `innerText` of each element matching '.MyEned span.wiI7pd'.
 */
function extractItems() {
  const reviewNodes = document.querySelectorAll('.MyEned span.wiI7pd');
  return Array.from(reviewNodes, (node) => node.innerText);
}
/**
 * Question's original (non-working) scroll-and-collect loop.
 *
 * Repeatedly evaluates `extractItems` in the page and tries to scroll until at
 * least `itemCount` items have been extracted.
 *
 * @param {object} page - Object providing evaluate, waitForFunction and waitForTimeout.
 * @param {Function} extractItems - Function handed to page.evaluate; its result becomes `items`.
 * @param {number} itemCount - The loop stops once `items.length` reaches this value.
 * @param {number} [scrollDelay=2000] - Value passed to page.waitForTimeout after each scroll.
 * @returns {Promise<Array>} The result of the last completed extraction.
 */
async function scrapeItems(
page,
extractItems,
itemCount,
scrollDelay = 2000,
) {
let items = [];
try {
let previousHeight;
while (items.length < itemCount) {
items = await page.evaluate(extractItems);
// NOTE(review): this string is a CSS selector followed by `.scrollHeight`, but a
// string handed to page.evaluate is presumably run as a JavaScript expression, where
// `div` is not a defined identifier - the element needs document.querySelector(...).
previousHeight = await page.evaluate('div.m6QErb.DxyBCb.scrollHeight');//selector for scroller
// NOTE(review): same expression problem; this also scrolls `window` rather than the
// scroller element named above.
await page.evaluate('window.scrollTo(0, div.m6QErb.DxyBCb.scrollHeight)');
await page.waitForFunction(`div.m6QErb.DxyBCb.scrollHeight > ${previousHeight}`);
await page.waitForTimeout(scrollDelay);
}
// Any error thrown inside the loop is discarded here, so the function silently
// returns whatever the last successful extraction produced.
} catch(e) { }
return items;
}
// Entry point of the question's script: opens the Google Maps place page, collects
// items through scrapeItems, prints them and closes the browser.
(async () => {
// NOTE(review): connect() is called without options and its result is overwritten by
// launch() on the next statement - looks redundant; confirm and remove.
let browser = await puppeteer.connect();
browser = await puppeteer.launch({
headless: false,
args: ['--no-sandbox', '--disable-setuid-sandbox'],
});
const [page] = await browser.pages();
// NOTE(review): the result of setViewport is not awaited before navigating.
page.setViewport({ width: 1280, height: 926 });
await page.goto('https://www.google.com/maps/place/Ace+Florist+%26+Flower+Delivery/@40.8265438,-73.5011026,15z/data=!4m7!3m6!1s0x0:0x9062074cae10c10f!8m2!3d40.8265438!4d-73.5011026!9m1!1b1');
// Auto-scroll and extract desired items from the page. The call below requests 30 items.
const items = await scrapeItems(page, extractItems, 30);
console.log(items)
await browser.close();
})();
此代码运行正常:
'use strict'
const puppeteer = require('puppeteer');
/**
 * Returns the text of all review snippets found in the current document.
 * Intended to be evaluated inside the page, since it uses the global `document`.
 *
 * @returns {Array} One `innerText` value per element matching '.MyEned span.wiI7pd'.
 */
function extractItems() {
  const matches = document.querySelectorAll('.MyEned span.wiI7pd');
  return [...matches].map((match) => match.innerText);
}
/**
 * Scrolls the reviews container until at least `itemCount` items have been
 * extracted, or until scrolling stops producing new content.
 *
 * @param {object} page - Puppeteer page; only `evaluate` and `waitForFunction` are used.
 * @param {Function} extractItems - Evaluated in the page; returns the items found so far.
 * @param {number} itemCount - Minimum number of items to collect before stopping.
 * @param {number} [scrollDelay=2000] - Pause in milliseconds after each successful scroll.
 * @returns {Promise<Array>} Items from the most recent extraction; may hold more or
 *   fewer than `itemCount` entries.
 */
async function scrapeItems(
  page,
  extractItems,
  itemCount,
  scrollDelay = 2000,
) {
  // Element whose scrollHeight grows as more reviews are loaded.
  const scrollerSelector = 'div.m6QErb.DxyBCb';
  let items = [];
  try {
    while (items.length < itemCount) {
      console.log(`items.length: ${items.length} itemCount: ${itemCount}`);
      items = await page.evaluate(extractItems);
      // Pass functions plus arguments instead of interpolated strings so the
      // selector and the height are never re-parsed as source code.
      const previousHeight = await page.evaluate(
        (selector) => document.querySelector(selector).scrollHeight,
        scrollerSelector,
      );
      await page.evaluate(
        (selector, top) => document.querySelector(selector).scrollTo(0, top),
        scrollerSelector,
        previousHeight,
      );
      // Settles once the container has grown; rejects when nothing new loads in time.
      await page.waitForFunction(
        (selector, height) => document.querySelector(selector).scrollHeight > height,
        {},
        scrollerSelector,
        previousHeight,
      );
      // Plain timer: page.waitForTimeout is not available in newer Puppeteer releases.
      await new Promise((resolve) => setTimeout(resolve, scrollDelay));
    }
  } catch (err) {
    // A timeout is the expected way to run out of content; anything else is
    // reported, but the items collected so far are still returned.
    if (!err || err.name !== 'TimeoutError') {
      console.warn('scrapeItems stopped early:', err);
    }
  }
  return items;
}
// Entry point: launches a browser, opens the Google Maps place page, collects
// review texts through scrapeItems and prints them.
(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  try {
    const [page] = await browser.pages();
    // setViewport returns a promise; await it so the size is applied before navigating.
    await page.setViewport({ width: 1280, height: 926 });
    await page.goto('https://www.google.com/maps/place/Ace+Florist+%26+Flower+Delivery/@40.8265438,-73.5011026,15z/data=!4m7!3m6!1s0x0:0x9062074cae10c10f!8m2!3d40.8265438!4d-73.5011026!9m1!1b1');
    // Auto-scroll and extract items from the page; scraping stops once 30 are collected.
    const items = await scrapeItems(page, extractItems, 30);
    console.log(items);
  } finally {
    // Close the browser even when navigation or scraping throws.
    await browser.close();
  }
})();
后来我才发现:在求取滚动高度、以及检查滚动高度是否大于之前的高度时,都必须加上 document.querySelector。
// Loop body for scrapeItems: read the scroller's height, scroll to it, then wait
// until the height has grown.
items = await page.evaluate(extractItems);
// page.evaluate returns a promise - await it to get the numeric scrollHeight.
// The value is a plain number, so it is used directly rather than indexed with [0].
previousHeight = await page.evaluate('document.querySelector("div.m6QErb.DxyBCb").scrollHeight');
await page.evaluate(`document.querySelector("div.m6QErb.DxyBCb").scrollTo(0, ${previousHeight})`);
await page.waitForFunction(`document.querySelector("div.m6QErb.DxyBCb").scrollHeight > ${previousHeight}`);
await page.waitForTimeout(scrollDelay);