使用 puppeteer 获取数据
Fetching data with puppeteer
我尝试使用 Puppeteer 从搜索结果的第一页获取所有图片链接,但在总共 40 张图片中只获取到了 6 个图片链接。这是我的代码:
const puppeteer = require('puppeteer');
// Question script: opens the search page and prints the `src` of every element
// that carries both the `_1T9dHf` and `_3XaILN` classes. Only elements present
// in the DOM at the moment of the evaluate call are collected.
puppeteer.launch({ headless: true }).then(async (browser) => {
  const tab = await browser.newPage();
  await tab.goto('https://shopee.vn/search?keyword=iphone%20xs', {
    waitUntil: 'networkidle0',
  });

  // Runs inside the page: gather the `src` of each matching element.
  const imageUrls = await tab.evaluate(() => {
    const nodes = document.getElementsByClassName('_1T9dHf _3XaILN');
    return Array.from(nodes, (node) => node.src);
  });

  imageUrls.forEach((imageUrl, position) => {
    console.log('Links of ' + position + ' images : ', imageUrl);
  });

  await browser.close();
});
我应该如何解决,才能从搜索结果的第一页获得全部 40 个链接?谢谢。
我认为该网站对图片使用了延迟加载,和/或进行了一些按需的 DOM 操作。
所以,我们将尝试向下滚动页面,下面的滚动函数改编自另一个优秀答案。
我们还会对页面截图(在运行脚本的目录中打开 ./page.png 即可查看!),这可以帮助您确认哪些内容已经加载(哪些没有!)。
代码如下:
const puppeteer = require('puppeteer');
/**
 * Scrolls the page downwards in small steps until the distance scrolled
 * reaches the height of the document body, giving content that is loaded on
 * scroll a chance to appear.
 *
 * @param {object} page - Object exposing `evaluate(fn, ...args)`; here a
 *   Puppeteer page.
 * @param {object} [options] - Tuning knobs; adjust as necessary.
 * @param {number} [options.step=50] - Pixels scrolled per tick (positive).
 * @param {number} [options.delay=20] - Milliseconds between ticks.
 * @returns {Promise<void>} Resolves once the bottom has been reached.
 */
async function scroll(page, { step = 50, delay = 20 } = {}) {
  // The callback runs in the page context, so the settings have to be handed
  // over as evaluate() arguments rather than captured from this closure.
  await page.evaluate(
    (stepPx, delayMs) =>
      new Promise((resolve) => {
        let heightScrolled = 0;
        const timer = setInterval(() => {
          window.scrollBy(0, stepPx);
          heightScrolled += stepPx;
          // scrollHeight is re-read on every tick, so growth of the document
          // while scrolling is taken into account.
          if (heightScrolled >= document.body.scrollHeight) {
            // Stop the timer; otherwise the page would keep scrolling after
            // this promise has resolved.
            clearInterval(timer);
            resolve();
          }
        }, delayMs);
      }),
    step,
    delay,
  );
}
/**
 * Opens `url` in a browser launched by Puppeteer, scrolls to the bottom of the
 * page, saves a full-page screenshot to ./page.png, then logs and returns the
 * `src` of every element carrying both the `_1T9dHf` and `_3XaILN` classes.
 *
 * @param {string} url - Address of the page to open.
 * @returns {Promise<Array>} The collected `src` values, in document order.
 */
async function getImages(url) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // Size the viewport before navigating so the page is laid out at its
    // final dimensions from the first render.
    await page.setViewport({
      width: 1200,
      height: 800,
    });
    await page.goto(url, { waitUntil: 'networkidle0' });
    await scroll(page);

    // Take an image of the page.. see what it looks like!
    await page.screenshot({
      fullPage: true,
      path: './page.png',
    });

    // Runs inside the page: gather the `src` of each matching element.
    const links = await page.evaluate(() =>
      Array.from(
        document.getElementsByClassName('_1T9dHf _3XaILN'),
        (element) => element.src,
      ),
    );

    links.forEach((link, i) => {
      console.log(`Links of ${i} images : `, link);
    });
    return links;
  } finally {
    // Close the browser even when a step above throws, so no browser process
    // is left behind.
    await browser.close();
  }
}
// Entry point: scrape the search page and report any failure instead of
// leaving the promise rejection unhandled.
const url = 'https://shopee.vn/search?keyword=iphone%20xs';
getImages(url).catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
我尝试使用 Puppeteer 从搜索结果的第一页获取所有图片链接,但在总共 40 张图片中只获取到了 6 个图片链接。这是我的代码:
const puppeteer = require('puppeteer');
// Question script: opens the search page and prints the `src` of every element
// that carries both the `_1T9dHf` and `_3XaILN` classes. Only elements present
// in the DOM at the moment of the evaluate call are collected.
puppeteer.launch({ headless: true }).then(async (browser) => {
  const tab = await browser.newPage();
  await tab.goto('https://shopee.vn/search?keyword=iphone%20xs', {
    waitUntil: 'networkidle0',
  });

  // Runs inside the page: gather the `src` of each matching element.
  const imageUrls = await tab.evaluate(() => {
    const nodes = document.getElementsByClassName('_1T9dHf _3XaILN');
    return Array.from(nodes, (node) => node.src);
  });

  imageUrls.forEach((imageUrl, position) => {
    console.log('Links of ' + position + ' images : ', imageUrl);
  });

  await browser.close();
});
我应该如何解决,才能从搜索结果的第一页获得全部 40 个链接?谢谢。
我认为该网站对图片使用了延迟加载,和/或进行了一些按需的 DOM 操作。
所以,我们将尝试向下滚动页面,下面的滚动函数改编自另一个优秀答案。
我们还会对页面截图(在运行脚本的目录中打开 ./page.png 即可查看!),这可以帮助您确认哪些内容已经加载(哪些没有!)。代码如下:
const puppeteer = require('puppeteer');
/**
 * Scrolls the page downwards in small steps until the distance scrolled
 * reaches the height of the document body, giving content that is loaded on
 * scroll a chance to appear.
 *
 * @param {object} page - Object exposing `evaluate(fn, ...args)`; here a
 *   Puppeteer page.
 * @param {object} [options] - Tuning knobs; adjust as necessary.
 * @param {number} [options.step=50] - Pixels scrolled per tick (positive).
 * @param {number} [options.delay=20] - Milliseconds between ticks.
 * @returns {Promise<void>} Resolves once the bottom has been reached.
 */
async function scroll(page, { step = 50, delay = 20 } = {}) {
  // The callback runs in the page context, so the settings have to be handed
  // over as evaluate() arguments rather than captured from this closure.
  await page.evaluate(
    (stepPx, delayMs) =>
      new Promise((resolve) => {
        let heightScrolled = 0;
        const timer = setInterval(() => {
          window.scrollBy(0, stepPx);
          heightScrolled += stepPx;
          // scrollHeight is re-read on every tick, so growth of the document
          // while scrolling is taken into account.
          if (heightScrolled >= document.body.scrollHeight) {
            // Stop the timer; otherwise the page would keep scrolling after
            // this promise has resolved.
            clearInterval(timer);
            resolve();
          }
        }, delayMs);
      }),
    step,
    delay,
  );
}
/**
 * Opens `url` in a browser launched by Puppeteer, scrolls to the bottom of the
 * page, saves a full-page screenshot to ./page.png, then logs and returns the
 * `src` of every element carrying both the `_1T9dHf` and `_3XaILN` classes.
 *
 * @param {string} url - Address of the page to open.
 * @returns {Promise<Array>} The collected `src` values, in document order.
 */
async function getImages(url) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // Size the viewport before navigating so the page is laid out at its
    // final dimensions from the first render.
    await page.setViewport({
      width: 1200,
      height: 800,
    });
    await page.goto(url, { waitUntil: 'networkidle0' });
    await scroll(page);

    // Take an image of the page.. see what it looks like!
    await page.screenshot({
      fullPage: true,
      path: './page.png',
    });

    // Runs inside the page: gather the `src` of each matching element.
    const links = await page.evaluate(() =>
      Array.from(
        document.getElementsByClassName('_1T9dHf _3XaILN'),
        (element) => element.src,
      ),
    );

    links.forEach((link, i) => {
      console.log(`Links of ${i} images : `, link);
    });
    return links;
  } finally {
    // Close the browser even when a step above throws, so no browser process
    // is left behind.
    await browser.close();
  }
}
// Entry point: scrape the search page and report any failure instead of
// leaving the promise rejection unhandled.
const url = 'https://shopee.vn/search?keyword=iphone%20xs';
getImages(url).catch((err) => {
  console.error(err);
  process.exitCode = 1;
});