继续处理结果的空值(Nodejs、Puppeteer)
Continue on Null Value of Result (Nodejs, Puppeteer)
我刚开始玩 Puppeteer(Headless Chrome)和 Nodejs。我正在抓取一些测试站点,当所有值都存在时一切正常,但如果缺少值,我会收到如下错误:
Cannot read property 'src' of null
(所以在下面的代码中,前两遍可能有所有值,但第三遍没有图片,所以它只是错误)。
在我使用 if(!picture) continue;
之前,但我认为由于 for 循环,它现在不起作用。
任何帮助将不胜感激,谢谢!
for (let i = 1; i <= 3; i++) {
//...Getting to correct page and scraping it three times
const result = await page.evaluate(() => {
let title = document.querySelector('h1').innerText;
let article = document.querySelector('.c-entry-content').innerText;
let picture = document.querySelector('.c-picture img').src;
if (!document.querySelector('.c-picture img').src) {
let picture = 'No Link'; } //throws error
let source = "The Verge";
let categories = "Tech";
if (!picture)
continue; //throws error
return {
title,
article,
picture,
source,
categories
}
});
}
let picture = document.querySelector('.c-picture img').src;
if (!document.querySelector('.c-picture img').src) {
let picture = 'No Link'; } //throws error
如果没有图片,则document.querySelector()
returns null,其中没有src
属性。在尝试读取 src
属性.
之前,您需要检查您的查询是否找到了一个元素
将 null 检查移至函数的顶部还有一个好处,即当您无论如何都打算退出时,可以节省不必要的计算。
async function scrape3() {
// ...
for (let i = 1; i <= 3; i++) {
//...Getting to correct page and scraping it three times
const result = await page.evaluate(() => {
const pictureElement = document.querySelector('.c-picture img');
if (!pictureElement) return null;
const picture = pictureElement.src;
const title = document.querySelector('h1').innerText;
const article = document.querySelector('.c-entry-content').innerText;
const source = "The Verge";
const categories = "Tech";
return {
title,
article,
picture,
source,
categories
}
});
if (!result) continue;
// ... do stuff with result
}
回答评论问题:"Is there a way just to skip anything blank, and return the rest?"
是的。在尝试从中读取 属性 之前,您只需要检查每个可能丢失的元素是否存在。在这种情况下,我们可以省略早期的 return,因为您总是对所有结果感兴趣。
async function scrape3() {
// ...
for (let i = 1; i <= 3; i++) {
const result = await page.evaluate(() => {
const img = document.querySelector('.c-picture img');
const h1 = document.querySelector('h1');
const content = document.querySelector('.c-entry-content');
const picture = img ? img.src : '';
const title = h1 ? h1.innerText : '';
const article = content ? content.innerText : '';
const source = "The Verge";
const categories = "Tech";
return {
title,
article,
picture,
source,
categories
}
});
// ...
}
}
进一步思考
由于我仍在回答这个问题,让我更进一步,并使用您可能感兴趣的一些更高级别的技术对其进行重构。不确定这是否正是您所追求的,但是它应该会给您一些关于编写更易于维护的代码的想法。
// Generic reusable helper to return an object property
// if object exists and has property, else a default value
//
// This is a curried function accepting one argument at a
// time and capturing each parameter in a closure.
//
const maybeGetProp = default => key => object =>
(object && object.hasOwnProperty(key)) ? object.key : default
// Pass in empty string as the default value
//
const getPropOrEmptyString = maybeGetProp('')
// Apply the second parameter, the property name, making 2
// slightly different functions which have a default value
// and a property name pre-loaded. Both functions only need
// an object passed in to return either the property if it
// exists or an empty string.
//
const maybeText = getPropOrEmptyString('innerText')
const maybeSrc = getPropOrEmptyString('src')
async function scrape3() {
// ...
// The _ parameter name is acknowledging that we expect a
// an argument passed in but saying we plan to ignore it.
//
const evaluate = _ => page.evaluate(() => {
// Attempt to retrieve the desired elements
//
const img = document.querySelector('.c-picture img');
const h1 = document.querySelector('h1')
const content = document.querySelector('.c-entry-content')
// Return the results, with empty string in
// place of any missing properties.
//
return {
title: maybeText(h1),
article: maybeText(article),
picture: maybeSrc(img),
source: 'The Verge',
categories: 'Tech'
}
}))
// Start with an empty array of length 3
//
const evaluations = Array(3).fill()
// Then map over that array ignoring the undefined
// input and return a promise for a page evaluation
//
.map(evaluate)
// All 3 scrapes are occuring concurrently. We'll
// wait for all of them to finish.
//
const results = await Promise.all(evaluations)
// Now we have an array of results, so we can
// continue using array methods to iterate over them
// or otherwise manipulate or transform them
//
results
.filter(result => result.title && result.picture)
.forEach(result => {
//
// Do something with each result
//
})
}
Try-catch 对我有用:
try {
if (await page.$eval('element')!==null) {
const name = await page.$eval('element')
}
}catch(error){
name = ''
}
我刚开始玩 Puppeteer(Headless Chrome)和 Nodejs。我正在抓取一些测试站点,当所有值都存在时一切正常,但如果缺少值,我会收到如下错误:
Cannot read property 'src' of null
(所以在下面的代码中,前两遍可能有所有值,但第三遍没有图片,所以它只是错误)。
在我使用 if(!picture) continue;
之前,但我认为由于 for 循环,它现在不起作用。
任何帮助将不胜感激,谢谢!
for (let i = 1; i <= 3; i++) {
//...Getting to correct page and scraping it three times
const result = await page.evaluate(() => {
let title = document.querySelector('h1').innerText;
let article = document.querySelector('.c-entry-content').innerText;
let picture = document.querySelector('.c-picture img').src;
if (!document.querySelector('.c-picture img').src) {
let picture = 'No Link'; } //throws error
let source = "The Verge";
let categories = "Tech";
if (!picture)
continue; //throws error
return {
title,
article,
picture,
source,
categories
}
});
}
let picture = document.querySelector('.c-picture img').src;
if (!document.querySelector('.c-picture img').src) {
let picture = 'No Link'; } //throws error
如果没有图片,则document.querySelector()
returns null,其中没有src
属性。在尝试读取 src
属性.
将 null 检查移至函数的顶部还有一个好处,即当您无论如何都打算退出时,可以节省不必要的计算。
async function scrape3() {
// ...
for (let i = 1; i <= 3; i++) {
//...Getting to correct page and scraping it three times
const result = await page.evaluate(() => {
const pictureElement = document.querySelector('.c-picture img');
if (!pictureElement) return null;
const picture = pictureElement.src;
const title = document.querySelector('h1').innerText;
const article = document.querySelector('.c-entry-content').innerText;
const source = "The Verge";
const categories = "Tech";
return {
title,
article,
picture,
source,
categories
}
});
if (!result) continue;
// ... do stuff with result
}
回答评论问题:"Is there a way just to skip anything blank, and return the rest?"
是的。在尝试从中读取 属性 之前,您只需要检查每个可能丢失的元素是否存在。在这种情况下,我们可以省略早期的 return,因为您总是对所有结果感兴趣。
async function scrape3() {
// ...
for (let i = 1; i <= 3; i++) {
const result = await page.evaluate(() => {
const img = document.querySelector('.c-picture img');
const h1 = document.querySelector('h1');
const content = document.querySelector('.c-entry-content');
const picture = img ? img.src : '';
const title = h1 ? h1.innerText : '';
const article = content ? content.innerText : '';
const source = "The Verge";
const categories = "Tech";
return {
title,
article,
picture,
source,
categories
}
});
// ...
}
}
进一步思考
由于我仍在回答这个问题,让我更进一步,并使用您可能感兴趣的一些更高级别的技术对其进行重构。不确定这是否正是您所追求的,但是它应该会给您一些关于编写更易于维护的代码的想法。
// Generic reusable helper to return an object property
// if object exists and has property, else a default value
//
// This is a curried function accepting one argument at a
// time and capturing each parameter in a closure.
//
const maybeGetProp = default => key => object =>
(object && object.hasOwnProperty(key)) ? object.key : default
// Pass in empty string as the default value
//
const getPropOrEmptyString = maybeGetProp('')
// Apply the second parameter, the property name, making 2
// slightly different functions which have a default value
// and a property name pre-loaded. Both functions only need
// an object passed in to return either the property if it
// exists or an empty string.
//
const maybeText = getPropOrEmptyString('innerText')
const maybeSrc = getPropOrEmptyString('src')
async function scrape3() {
// ...
// The _ parameter name is acknowledging that we expect a
// an argument passed in but saying we plan to ignore it.
//
const evaluate = _ => page.evaluate(() => {
// Attempt to retrieve the desired elements
//
const img = document.querySelector('.c-picture img');
const h1 = document.querySelector('h1')
const content = document.querySelector('.c-entry-content')
// Return the results, with empty string in
// place of any missing properties.
//
return {
title: maybeText(h1),
article: maybeText(article),
picture: maybeSrc(img),
source: 'The Verge',
categories: 'Tech'
}
}))
// Start with an empty array of length 3
//
const evaluations = Array(3).fill()
// Then map over that array ignoring the undefined
// input and return a promise for a page evaluation
//
.map(evaluate)
// All 3 scrapes are occuring concurrently. We'll
// wait for all of them to finish.
//
const results = await Promise.all(evaluations)
// Now we have an array of results, so we can
// continue using array methods to iterate over them
// or otherwise manipulate or transform them
//
results
.filter(result => result.title && result.picture)
.forEach(result => {
//
// Do something with each result
//
})
}
Try-catch 对我有用:
try {
if (await page.$eval('element')!==null) {
const name = await page.$eval('element')
}
}catch(error){
name = ''
}