抓取函数只返回其中一个链接,而不是我想获取的所有产品链接
Scrape function only returns one of the links instead of all of the products I'm trying to get
我对 Node.js 和 Puppeteer 还很陌生。
当我搜索并保存它们时,我试图获取产品的所有链接,但它只保存第一个而不保存其他链接。不知道是我的选择器错了还是代码错了
/**
 * Scraper for a single search-results page.
 *
 * `url` is the page that gets opened; `scraper(browser)` collects the product
 * links found on that page.
 */
const scraperObject = {
  url: 'https://diaonline.supermercadosdia.com.ar/busca/?ft=pepsi',

  /**
   * Opens `this.url` in a new page and collects the `href` of every `h3 > a`
   * anchor inside each matched results container.
   *
   * The previous version used `querySelector('h3 > a')`, which yields only the
   * first anchor of a container, so a selector that matches one wrapper element
   * produced exactly one link.
   *
   * @param {object} browser - Puppeteer browser (anything exposing an async `newPage()`).
   * @returns {Promise<string[]>} The collected links; they are also logged to the console.
   */
  async scraper(browser) {
    const page = await browser.newPage();
    try {
      console.log(`Navigating to ${this.url}...`);
      await page.goto(this.url);
      // Wait for the required DOM to be rendered
      await page.waitForSelector('.wrapper > .main');
      // Get the links to all the required products. The callback is handed to
      // page.$$eval, so it must not reference anything from this file's scope.
      const urls = await page.$$eval(
        'section > div.coleccion-prods > div > div.vitrine.resultItemsWrapper',
        (containers) =>
          containers
            // NOTE(review): this keeps containers whose first `.marca` text is
            // NOT "PEPSI" (unchanged from the original) - confirm that is the
            // intended direction. A container without `.marca` is kept instead
            // of throwing.
            .filter((container) => container.querySelector('.marca')?.textContent !== 'PEPSI')
            // Extract every product link of the container, not just the first.
            .flatMap((container) =>
              Array.from(container.querySelectorAll('h3 > a'), (anchor) => anchor.href),
            ),
      );
      console.log(urls);
      return urls;
    } finally {
      // Release the tab even when navigation or extraction fails.
      await page.close();
    }
  },
};
// Export the scraper object (CommonJS) so other modules can require() it.
module.exports = scraperObject;
Node.js 版本:18.1.0
Puppeteer 版本:14.1.2
尝试这样的事情:
import ppt from 'puppeteer';
// Search-results page to scrape.
const url = 'https://diaonline.supermercadosdia.com.ar/busca/?ft=pepsi';

// `main` is a CSS selector awaited before scraping; `productLink` is an XPath
// expression that selects the `href` attribute node of each product link.
const selectors = {
  main: '.wrapper > .main',
  productLink: "//div[contains(@class, 'product-name')]/h3/a/@href",
};

// Launches a headless browser, prints every product link found on `url`, and
// always closes the browser afterwards.
(async () => {
  const browser = await ppt.launch({
    headless: true,
    devtools: false,
    args: [
      '--window-size=1600,1200',
      '--disable-web-security',
      '--disable-site-isolation-trials',
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-infobars',
      '--window-position=0,0',
      // Spelling fixed: these were written "certifcate", which is not the
      // name of the Chromium switch.
      '--ignore-certificate-errors',
      '--ignore-certificate-errors-spki-list',
    ],
  });
  try {
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582');
    await page.goto(url);
    await page.waitForSelector(selectors.main);
    // Rejects if no product link shows up within 3 seconds.
    await page.waitForXPath(selectors.productLink, { timeout: 3000 });
    const handles = await page.$x(selectors.productLink);
    // The XPath targets @href attribute nodes, so textContent is the URL itself.
    const links = await Promise.all(handles.map((h) => h.evaluate((elm) => elm.textContent)));
    console.log(links);
  } finally {
    // Close the browser even when a step above throws, so no Chromium
    // process is left running.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
[
'https://diaonline.supermercadosdia.com.ar/gaseosa-cola-pepsi-3-lts-120091/p',
'https://diaonline.supermercadosdia.com.ar/gaseosa-cola-pepsi-500-ml-56079/p',
'https://diaonline.supermercadosdia.com.ar/gaseosa-cola-pepsi-black-15-lts-247793/p',
'https://diaonline.supermercadosdia.com.ar/gaseosa-cola-pepsi-black-225-lts-239383/p',
'https://diaonline.supermercadosdia.com.ar/gaseosa-pepsi-black-lata-354-cc-275841/p',
'https://diaonline.supermercadosdia.com.ar/gaseosa-cola-pepsi-light-225-lts-108411/p',
'https://diaonline.supermercadosdia.com.ar/gaseosa-pepsi-cola-225-lts-199640/p',
'https://diaonline.supermercadosdia.com.ar/gaseosa-cola-pepsi-en-lata-354-ml-68235/p',
'https://diaonline.supermercadosdia.com.ar/gaseosa-cola-pepsi-15-lts-39692/p'
]
如果可以的话,我建议你了解一下 XPath
选择器。它们比标准的 CSS 选择器更灵活。
我对 Node.js 和 Puppeteer 还很陌生。
当我搜索并保存它们时,我试图获取产品的所有链接,但它只保存第一个而不保存其他链接。不知道是我的选择器错了还是代码错了
/**
 * Scraper for a single search-results page.
 *
 * `url` is the page that gets opened; `scraper(browser)` collects the product
 * links found on that page.
 */
const scraperObject = {
  url: 'https://diaonline.supermercadosdia.com.ar/busca/?ft=pepsi',

  /**
   * Opens `this.url` in a new page and collects the `href` of every `h3 > a`
   * anchor inside each matched results container.
   *
   * The previous version used `querySelector('h3 > a')`, which yields only the
   * first anchor of a container, so a selector that matches one wrapper element
   * produced exactly one link.
   *
   * @param {object} browser - Puppeteer browser (anything exposing an async `newPage()`).
   * @returns {Promise<string[]>} The collected links; they are also logged to the console.
   */
  async scraper(browser) {
    const page = await browser.newPage();
    try {
      console.log(`Navigating to ${this.url}...`);
      await page.goto(this.url);
      // Wait for the required DOM to be rendered
      await page.waitForSelector('.wrapper > .main');
      // Get the links to all the required products. The callback is handed to
      // page.$$eval, so it must not reference anything from this file's scope.
      const urls = await page.$$eval(
        'section > div.coleccion-prods > div > div.vitrine.resultItemsWrapper',
        (containers) =>
          containers
            // NOTE(review): this keeps containers whose first `.marca` text is
            // NOT "PEPSI" (unchanged from the original) - confirm that is the
            // intended direction. A container without `.marca` is kept instead
            // of throwing.
            .filter((container) => container.querySelector('.marca')?.textContent !== 'PEPSI')
            // Extract every product link of the container, not just the first.
            .flatMap((container) =>
              Array.from(container.querySelectorAll('h3 > a'), (anchor) => anchor.href),
            ),
      );
      console.log(urls);
      return urls;
    } finally {
      // Release the tab even when navigation or extraction fails.
      await page.close();
    }
  },
};
// Export the scraper object (CommonJS) so other modules can require() it.
module.exports = scraperObject;
Node.js 版本:18.1.0
Puppeteer 版本:14.1.2
尝试这样的事情:
import ppt from 'puppeteer';
// Search-results page to scrape.
const url = 'https://diaonline.supermercadosdia.com.ar/busca/?ft=pepsi';

// `main` is a CSS selector awaited before scraping; `productLink` is an XPath
// expression that selects the `href` attribute node of each product link.
const selectors = {
  main: '.wrapper > .main',
  productLink: "//div[contains(@class, 'product-name')]/h3/a/@href",
};

// Launches a headless browser, prints every product link found on `url`, and
// always closes the browser afterwards.
(async () => {
  const browser = await ppt.launch({
    headless: true,
    devtools: false,
    args: [
      '--window-size=1600,1200',
      '--disable-web-security',
      '--disable-site-isolation-trials',
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-infobars',
      '--window-position=0,0',
      // Spelling fixed: these were written "certifcate", which is not the
      // name of the Chromium switch.
      '--ignore-certificate-errors',
      '--ignore-certificate-errors-spki-list',
    ],
  });
  try {
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582');
    await page.goto(url);
    await page.waitForSelector(selectors.main);
    // Rejects if no product link shows up within 3 seconds.
    await page.waitForXPath(selectors.productLink, { timeout: 3000 });
    const handles = await page.$x(selectors.productLink);
    // The XPath targets @href attribute nodes, so textContent is the URL itself.
    const links = await Promise.all(handles.map((h) => h.evaluate((elm) => elm.textContent)));
    console.log(links);
  } finally {
    // Close the browser even when a step above throws, so no Chromium
    // process is left running.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
[
'https://diaonline.supermercadosdia.com.ar/gaseosa-cola-pepsi-3-lts-120091/p',
'https://diaonline.supermercadosdia.com.ar/gaseosa-cola-pepsi-500-ml-56079/p',
'https://diaonline.supermercadosdia.com.ar/gaseosa-cola-pepsi-black-15-lts-247793/p',
'https://diaonline.supermercadosdia.com.ar/gaseosa-cola-pepsi-black-225-lts-239383/p',
'https://diaonline.supermercadosdia.com.ar/gaseosa-pepsi-black-lata-354-cc-275841/p',
'https://diaonline.supermercadosdia.com.ar/gaseosa-cola-pepsi-light-225-lts-108411/p',
'https://diaonline.supermercadosdia.com.ar/gaseosa-pepsi-cola-225-lts-199640/p',
'https://diaonline.supermercadosdia.com.ar/gaseosa-cola-pepsi-en-lata-354-ml-68235/p',
'https://diaonline.supermercadosdia.com.ar/gaseosa-cola-pepsi-15-lts-39692/p'
]
如果可以的话,我建议你了解一下 XPath
选择器。它们比标准的 CSS 选择器更灵活。