Puppeteer 元素 console.log'able 但 return 在 puppeteer 中未定义
Puppeteer element is console.log'able but return undefined in puppeteer
我正在尝试抓取 a
标签下有 h3
标签的网页。我得到 a
标签就好了,但是当试图得到 h3
的 innerText 时,我得到一个 undefined
值。
这是我要抓取的内容:
const puppeteer = require('puppeteer');
const pageURL = "https://producthunt.com";
const webScraping = async pageURL => {
const browser = await puppeteer.launch({
headless: false,
arges: ["--no-sandbox"]
});
const page = await browser.newPage();
let dataObj = {};
try {
await page.goto(pageURL, { waitUntil: 'networkidle2' });
const publishedNews = await page.evaluate(() => {
const newsDOM = document.querySelectorAll("main ul li");
let newsList = [];
newsDOM.forEach(linkElement => {
const text = linkElement.querySelector("a").textContent;
const innerText = linkElement.querySelector("a").innerText;
const url = linkElement.querySelector("a").getAttribute('href');
const title = linkElement.querySelector("h3").innerText;
console.log(title);
newsList.push({
title,
text,
url
});
});
return newsList;
});
dataObj = {
amount: publishedNews.length,
publishedNews
};
} catch (e) {
console.log(e);
}
console.log(dataObj);
browser.close();
return dataObj;
};
webScraping(pageURL).catch(console.error);
控制台日志运行良好,但 puppeteer 抛出:
Cannot read property 'innerText' of null
看起来您的解决方案运行良好,但您无法控制 h3
标记是否为 null。尝试在访问 innerText 属性之前添加 if
语句,或者使用我在下面留下的代码。
const puppeteer = require('puppeteer');
const pageURL = "https://producthunt.com";
const webScraping = async pageURL => {
const browser = await puppeteer.launch({
headless: false,
arges: ["--no-sandbox"]
});
const page = await browser.newPage();
let dataObj = {};
try {
await page.goto(pageURL, { waitUntil: 'networkidle2' });
const publishedNews = await page.evaluate(() => {
let newsList = [];
const newsDOM = document.querySelectorAll("main ul li");
newsDOM.forEach(linkElement => {
const aTag = linkElement.querySelector("a");
const text = aTag.textContent;
const innerText = aTag.innerText;
const url = aTag.getAttribute('href');
let title = aTag.querySelector("h3");
// there may be some <a> without an h3, control
// the null pointer exception here, accessing only
// if title is not 'null'.
if (title) title = title.innerText;
console.log(title);
// changed the object structure to add a key for each attr
newsList.push({
title: title,
text: text,
url: url
});
});
return newsList;
});
// changed the object structure to add a key for the array
dataObj = {
amount: publishedNews.length,
list: publishedNews
};
} catch (e) {
console.log(e);
}
console.log({receivedData: dataObj});
browser.close();
return dataObj;
};
webScraping(pageURL).catch(console.error);
如果这能解决您的问题,请告诉我!
我正在尝试抓取 a
标签下有 h3
标签的网页。我得到 a
标签就好了,但是当试图得到 h3
的 innerText 时,我得到一个 undefined
值。
这是我要抓取的内容:
const puppeteer = require('puppeteer');
const pageURL = "https://producthunt.com";
const webScraping = async pageURL => {
const browser = await puppeteer.launch({
headless: false,
arges: ["--no-sandbox"]
});
const page = await browser.newPage();
let dataObj = {};
try {
await page.goto(pageURL, { waitUntil: 'networkidle2' });
const publishedNews = await page.evaluate(() => {
const newsDOM = document.querySelectorAll("main ul li");
let newsList = [];
newsDOM.forEach(linkElement => {
const text = linkElement.querySelector("a").textContent;
const innerText = linkElement.querySelector("a").innerText;
const url = linkElement.querySelector("a").getAttribute('href');
const title = linkElement.querySelector("h3").innerText;
console.log(title);
newsList.push({
title,
text,
url
});
});
return newsList;
});
dataObj = {
amount: publishedNews.length,
publishedNews
};
} catch (e) {
console.log(e);
}
console.log(dataObj);
browser.close();
return dataObj;
};
webScraping(pageURL).catch(console.error);
控制台日志运行良好,但 puppeteer 抛出:
Cannot read property 'innerText' of null
看起来您的解决方案运行良好,但您无法控制 h3
标记是否为 null。尝试在访问 innerText 属性之前添加 if
语句,或者使用我在下面留下的代码。
const puppeteer = require('puppeteer');
const pageURL = "https://producthunt.com";
const webScraping = async pageURL => {
const browser = await puppeteer.launch({
headless: false,
arges: ["--no-sandbox"]
});
const page = await browser.newPage();
let dataObj = {};
try {
await page.goto(pageURL, { waitUntil: 'networkidle2' });
const publishedNews = await page.evaluate(() => {
let newsList = [];
const newsDOM = document.querySelectorAll("main ul li");
newsDOM.forEach(linkElement => {
const aTag = linkElement.querySelector("a");
const text = aTag.textContent;
const innerText = aTag.innerText;
const url = aTag.getAttribute('href');
let title = aTag.querySelector("h3");
// there may be some <a> without an h3, control
// the null pointer exception here, accessing only
// if title is not 'null'.
if (title) title = title.innerText;
console.log(title);
// changed the object structure to add a key for each attr
newsList.push({
title: title,
text: text,
url: url
});
});
return newsList;
});
// changed the object structure to add a key for the array
dataObj = {
amount: publishedNews.length,
list: publishedNews
};
} catch (e) {
console.log(e);
}
console.log({receivedData: dataObj});
browser.close();
return dataObj;
};
webScraping(pageURL).catch(console.error);
如果这能解决您的问题,请告诉我!