对象在 Promise 来得及解决(resolve)之前就被返回了
Object is returned before promises have time to resolve
我正在尝试构建一个网络爬虫,用来抓取某些产品的信息并将其存入数据库。我使用 Nightmare 获取 HTML 源代码(因为在页面内容生成之前,JavaScript 代码必须先在服务器上运行),然后使用 Cheerio 解析该源代码。解析完成后,我还需要为产品下载一些图片。我有一个简单的下载函数,并希望根据要下载的图片在服务器上是否可用,返回一个字符串(或一个字符串数组):要么是我已下载图片的文件名,要么是我本机上默认图片的文件名。我尝试把下载函数当作 Promise 来调用,并且在已知有多张图片需要下载时尝试使用 Promise.all(),但都没有成功。我确信代码本身是能运行的(图片都按预期下载了,最终对象的几乎每个属性和值看起来都没问题),但当我把对象打印到控制台时,相关属性里保存的仍然是 [Promise] / [ Promise { } ],我不太确定该如何解决。我确信这些 Promise 最终会被解决(resolve),只是在我把结果对象输出到控制台时它们还没有解决。这是个问题,因为我必须把这个对象传出去存入数据库,而我认为到那时这些 Promise 仍然不会被解决。
代码(已隐去真实链接)如下:
const cheerio = require('cheerio')
const nightmare = require('nightmare')()
const download = require('image-downloader')
/**
 * Scraper configuration: site and CDN base URLs plus the local image folder.
 *
 * Built as a plain object literal instead of `new function() {...}`. The
 * IIFE exists only so `urlSearch` can be derived from `baseURL` without
 * adding another module-level binding.
 */
const settings = (() => {
  const baseURL = 'https://baseurl.whatever'
  return {
    baseURL,
    urlSearch: `${baseURL}/Product/Search?keyword=`,
    urlVariant: 'https://cdn.baseurl.whatever/Variant/',
    urlProduct: 'https://cdn.baseurl.whatever/Product/',
    imgPath: './img/'
  }
})()
// Running counter used to number scraped reviews; loadProduct pre-increments it
// for every review it parses. Declared with `let` because it is reassigned.
let review_id = 0
/**
 * Download a single image and route the outcome through the given callbacks.
 *
 * @param {string} url - Remote address of the image.
 * @param {string} filepath - Destination handed to the downloader as `dest`.
 * @param {Function} success - Called with the downloader's result on fulfilment.
 * @param {Function} error - Called with the rejection reason on failure.
 * @returns {Promise<*>} Settles with whatever `success` or `error` returns.
 */
function downloadImage(url, filepath, success, error) {
  const request = { url: url, dest: filepath }
  const pending = download.image(request)
  // Two-argument then(): `error` handles download failures only, not
  // exceptions thrown by `success`.
  return pending.then(success, error)
}
// Page that gets scraped.
const url = 'https://someurl.nevermind.meh/product?pid=50M3NUMB3R'
// Product code handed to scrapeProduct.
const code = '50M3C0D3'
/**
 * Render the product page with Nightmare, parse it with loadProduct, log the
 * parsed product and hand it back to the caller.
 *
 * The value returned by loadProduct is awaited: if loadProduct returns a
 * promise it is resolved before logging, and a plain object passes through
 * unchanged.
 *
 * @param {string} code - Product code forwarded to loadProduct.
 * @returns {Promise<object|undefined>} The parsed product, or `undefined`
 *   when any step failed (the error is logged, not rethrown).
 */
async function scrapeProduct(code) {
  try {
    const body = await nightmare.goto(url)
      .wait()
      .evaluate(() => document.body.innerHTML)
      .end()
    const product = await loadProduct(body, code)
    console.log(product)
    return product
  } catch (err) {
    console.log(`There was an error: [${err}]`)
    return undefined
  }
}
/**
 * Parse a rendered product page into a product record.
 *
 * NOTE: this function is synchronous. Every image-related field
 * (`variants[].image`, the entries of `images` and of `other_images`) holds
 * the Promise returned by downloadImage, not a file name. Callers that need
 * the names have to await those promises themselves.
 *
 * @param {string} body - HTML of the product page.
 * @param {string} code - Product code; the variant carrying this code is skipped.
 * @returns {object} The product record described above.
 */
function loadProduct(body, code) {
  // Local binding; a bare `$ = ...` would create an implicit global.
  const $ = cheerio.load(body)
  // Last path segment of the file name reported by the downloader.
  const savedName = result => result.filename.split('/').reverse()[0]
  const minPrice = parseFloat($('span.MinPrice').text())

  return {
    title: $('li.LongName').text().trim(),
    category: $('a#categoryTitleLink').text().trim(),
    min_price: minPrice,
    // Falls back to the minimum price when the maximum parses to NaN (or 0).
    max_price: parseFloat($('span.MaxPrice')?.text()) || minPrice,
    points: parseFloat($('div.AddtoCartUnderText span').text()),
    variants: [...$('div.productDetailClassicRnd')].map(variant => {
      const $field = $(variant).find('input')
      const item_code = $field.attr('item_code')
      const variant_code = item_code.split('-')[0]
      // Skip the entry whose code equals the requested product code.
      if (variant_code === String(code)) return null
      return {
        code: variant_code,
        title: $field.attr('item_name'),
        // Pending promise; resolves to the saved file name or the default.
        image: downloadImage(
          // Encode every space, not just the first one.
          `${settings.urlVariant}${item_code.replace(/ /g, '%20')}`,
          `${settings.imgPath}${item_code}`,
          savedName,
          () => 'variant_default-VC.jpg'
        )
      }
    }).filter(variant => variant !== null),
    images: [...$('img#imgProduct')].map(image => {
      const source = $(image).attr('src')
      return downloadImage(
        source,
        `${settings.imgPath}${source.split('/').reverse()[0]}`,
        savedName,
        () => 'product_default.jpg'
      )
    }),
    other_images: [...$('img.productDetailOtherIMG')].map(image => {
      const source = $(image).attr('src')
      // Check if the other image is not a default one
      if (/default_\d{1,2}/.test(source)) return null
      return downloadImage(
        source,
        `${settings.imgPath}${source.split('/').reverse()[0]}`,
        savedName,
        () => null
      )
      // Only the synchronous nulls are removed here; a promise that later
      // resolves to null (failed download) stays in the array.
    }).filter(other_image => other_image !== null),
    how_to_use: $('span#HowToUse p')?.text().trim() || "",
    technical_description: $('span#TechnicalDescription p')?.text().trim() || "",
    product_description: $('span#ProductDescription p')?.text().trim() || "",
    bought_with: [...$('a.redirectProductId')].map(item => $(item).attr('href').match(/=(\d+)$/)[1]),
    rank: $('div.productAverageMainDiv').find('i.activeStar').length,
    reviews_count: parseInt($('span#spnReviewCount').text(), 10),
    reviews: [...$('div.customerReviewsMainDiv')].map(review => {
      const $review = $(review)
      return {
        id: ++review_id,
        author: $review.find('div.customerName').text().trim(),
        posted_at: $review.find('div.starIconsForReviews span').text().trim(),
        rank: $review.find('span.productAverageMainDiv').find('i.activeStar').length,
        message: $review.find('div.customerReviewDetail span').text().trim()
      }
    })
  }
}
// Entry point: start the scrape for the product code defined above.
scrapeProduct(code)
我甚至无法从生成的图片名称数组中过滤掉 null 值,因为执行到 filter 函数时,这些 Promise 还没有解决。不知为何,我原本以为
images: downloadImage(
URL,
filepath,
resolve() {},
reject() {}
)
会等到 downloadImage 函数返回、给 images 属性赋好值之后,才去执行 filter 函数。另一方面,我猜测早在 downloadImage 函数有机会解决 Promise 之前,执行流程就已经走到了 filter 函数;按理说我应该在 downloadImage 后面链接一个 .then(),但我做不到,因为 downloadImage 位于 map() 回调的 return 语句里——也就是代码中后面紧跟着 .filter() 的那个 map()。
如有任何帮助,我们将不胜感激!谢谢!
P.S.:我很确定自己忽略了某些基本的(逻辑上的)东西,或者有什么地方没有理解对。很抱歉浪费你的时间,但我已经为这个问题折腾了两天,实在想不出办法了 ^_^
map() 方法返回的是一个 Promise 数组,因此你需要使用 Promise.all() 或它的某个变体。
例如,下面先得到 “images” 对应的 Promise 数组,然后用 Promise.all() 等待所有 Promise 解决,最后链接一个 then() 来使用解决后的值。
// Excerpt: each map() callback returns the promise produced by downloadImage,
// so imagesPromises is an array of promises, not of file names.
const imagesPromises = [...$('img#imgProduct')].map(image => {
const $image = $(image),
source = $image.attr('src')
return downloadImage(
source,
`${settings.imgPath}${source.split('/').reverse()[0]}`,
result => result.filename.split('/').reverse()[0],
_ => 'product_default.jpg'
)
})
// Promise.all waits for every download; the then() callback receives the
// array of resolved values and can build the final object from them.
return Promise.all(imagesPromises)
.then(images => {
return {
images,
...
}
})
这里有一个可能的实现:
/**
 * Parse a rendered product page and download its images.
 *
 * All downloads (variant images, product images, other images) are started
 * up front and awaited together, so the resolved record contains plain file
 * names and no pending promises — including `variants[].image`.
 *
 * @param {string} body - HTML of the product page.
 * @param {string} code - Product code; the variant carrying this code is skipped.
 * @returns {Promise<object>} Resolves with the fully populated product record.
 */
async function loadProduct(body, code) {
  // Local binding; a bare `$ = ...` would create an implicit global.
  const $ = cheerio.load(body)
  // Last path segment of the file name reported by the downloader.
  const savedName = result => result.filename.split('/').reverse()[0]
  const minPrice = parseFloat($('span.MinPrice').text())

  // Everything that needs no download is read synchronously, before the first
  // await, so review ids are handed out in one uninterrupted pass.
  const product = {
    title: $('li.LongName').text().trim(),
    category: $('a#categoryTitleLink').text().trim(),
    min_price: minPrice,
    // Falls back to the minimum price when the maximum parses to NaN (or 0).
    max_price: parseFloat($('span.MaxPrice')?.text()) || minPrice,
    points: parseFloat($('div.AddtoCartUnderText span').text()),
    how_to_use: $('span#HowToUse p')?.text().trim() || "",
    technical_description: $('span#TechnicalDescription p')?.text().trim() || "",
    product_description: $('span#ProductDescription p')?.text().trim() || "",
    bought_with: [...$('a.redirectProductId')].map(item => $(item).attr('href').match(/=(\d+)$/)[1]),
    rank: $('div.productAverageMainDiv').find('i.activeStar').length,
    reviews_count: parseInt($('span#spnReviewCount').text(), 10),
    reviews: [...$('div.customerReviewsMainDiv')].map(review => {
      const $review = $(review)
      return {
        id: ++review_id,
        author: $review.find('div.customerName').text().trim(),
        posted_at: $review.find('div.starIconsForReviews span').text().trim(),
        rank: $review.find('span.productAverageMainDiv').find('i.activeStar').length,
        message: $review.find('div.customerReviewDetail span').text().trim()
      }
    })
  }

  // One promise per kept variant, each resolving to the finished variant record.
  const variantPromises = [...$('div.productDetailClassicRnd')]
    .map(variant => {
      const $field = $(variant).find('input')
      const item_code = $field.attr('item_code')
      const variant_code = item_code.split('-')[0]
      // Skip the entry whose code equals the requested product code.
      if (variant_code === String(code)) return null
      return downloadImage(
        // Encode every space, not just the first one.
        `${settings.urlVariant}${item_code.replace(/ /g, '%20')}`,
        `${settings.imgPath}${item_code}`,
        savedName,
        () => 'variant_default-VC.jpg'
      ).then(image => ({
        code: variant_code,
        title: $field.attr('item_name'),
        image
      }))
    })
    .filter(variant => variant !== null)

  const imagePromises = [...$('img#imgProduct')].map(image => {
    const source = $(image).attr('src')
    return downloadImage(
      source,
      `${settings.imgPath}${source.split('/').reverse()[0]}`,
      savedName,
      () => 'product_default.jpg'
    )
  })

  const otherImagePromises = [...$('img.productDetailOtherIMG')]
    .map(image => $(image).attr('src'))
    // Check if the other image is not a default one
    .filter(source => !/default_\d{1,2}/.test(source))
    .map(source => downloadImage(
      source,
      `${settings.imgPath}${source.split('/').reverse()[0]}`,
      savedName,
      () => null
    ))

  // The three groups are independent, so wait for them together.
  const [variants, images, otherImages] = await Promise.all([
    Promise.all(variantPromises),
    Promise.all(imagePromises),
    Promise.all(otherImagePromises)
  ])

  return {
    ...product,
    variants,
    images,
    // A failed download resolves to null; drop those entries.
    other_images: otherImages.filter(other_image => other_image !== null)
  }
}
// Log the value that scrapeProduct's promise resolves with.
scrapeProduct(code).then(product => console.log(product))
我正在尝试构建一个网络爬虫,用来抓取某些产品的信息并将其存入数据库。我使用 Nightmare 获取 HTML 源代码(因为在页面内容生成之前,JavaScript 代码必须先在服务器上运行),然后使用 Cheerio 解析该源代码。解析完成后,我还需要为产品下载一些图片。我有一个简单的下载函数,并希望根据要下载的图片在服务器上是否可用,返回一个字符串(或一个字符串数组):要么是我已下载图片的文件名,要么是我本机上默认图片的文件名。我尝试把下载函数当作 Promise 来调用,并且在已知有多张图片需要下载时尝试使用 Promise.all(),但都没有成功。我确信代码本身是能运行的(图片都按预期下载了,最终对象的几乎每个属性和值看起来都没问题),但当我把对象打印到控制台时,相关属性里保存的仍然是 [Promise] / [ Promise { } ],我不太确定该如何解决。我确信这些 Promise 最终会被解决(resolve),只是在我把结果对象输出到控制台时它们还没有解决。这是个问题,因为我必须把这个对象传出去存入数据库,而我认为到那时这些 Promise 仍然不会被解决。
代码(已隐去真实链接)如下:
const cheerio = require('cheerio')
const nightmare = require('nightmare')()
const download = require('image-downloader')
/**
 * Scraper configuration: site and CDN base URLs plus the local image folder.
 *
 * Built as a plain object literal instead of `new function() {...}`. The
 * IIFE exists only so `urlSearch` can be derived from `baseURL` without
 * adding another module-level binding.
 */
const settings = (() => {
  const baseURL = 'https://baseurl.whatever'
  return {
    baseURL,
    urlSearch: `${baseURL}/Product/Search?keyword=`,
    urlVariant: 'https://cdn.baseurl.whatever/Variant/',
    urlProduct: 'https://cdn.baseurl.whatever/Product/',
    imgPath: './img/'
  }
})()
// Running counter used to number scraped reviews; loadProduct pre-increments it
// for every review it parses. Declared with `let` because it is reassigned.
let review_id = 0
/**
 * Download a single image and route the outcome through the given callbacks.
 *
 * @param {string} url - Remote address of the image.
 * @param {string} filepath - Destination handed to the downloader as `dest`.
 * @param {Function} success - Called with the downloader's result on fulfilment.
 * @param {Function} error - Called with the rejection reason on failure.
 * @returns {Promise<*>} Settles with whatever `success` or `error` returns.
 */
function downloadImage(url, filepath, success, error) {
  const request = { url: url, dest: filepath }
  const pending = download.image(request)
  // Two-argument then(): `error` handles download failures only, not
  // exceptions thrown by `success`.
  return pending.then(success, error)
}
// Page that gets scraped.
const url = 'https://someurl.nevermind.meh/product?pid=50M3NUMB3R'
// Product code handed to scrapeProduct.
const code = '50M3C0D3'
/**
 * Render the product page with Nightmare, parse it with loadProduct, log the
 * parsed product and hand it back to the caller.
 *
 * The value returned by loadProduct is awaited: if loadProduct returns a
 * promise it is resolved before logging, and a plain object passes through
 * unchanged.
 *
 * @param {string} code - Product code forwarded to loadProduct.
 * @returns {Promise<object|undefined>} The parsed product, or `undefined`
 *   when any step failed (the error is logged, not rethrown).
 */
async function scrapeProduct(code) {
  try {
    const body = await nightmare.goto(url)
      .wait()
      .evaluate(() => document.body.innerHTML)
      .end()
    const product = await loadProduct(body, code)
    console.log(product)
    return product
  } catch (err) {
    console.log(`There was an error: [${err}]`)
    return undefined
  }
}
/**
 * Parse a rendered product page into a product record.
 *
 * NOTE: this function is synchronous. Every image-related field
 * (`variants[].image`, the entries of `images` and of `other_images`) holds
 * the Promise returned by downloadImage, not a file name. Callers that need
 * the names have to await those promises themselves.
 *
 * @param {string} body - HTML of the product page.
 * @param {string} code - Product code; the variant carrying this code is skipped.
 * @returns {object} The product record described above.
 */
function loadProduct(body, code) {
  // Local binding; a bare `$ = ...` would create an implicit global.
  const $ = cheerio.load(body)
  // Last path segment of the file name reported by the downloader.
  const savedName = result => result.filename.split('/').reverse()[0]
  const minPrice = parseFloat($('span.MinPrice').text())

  return {
    title: $('li.LongName').text().trim(),
    category: $('a#categoryTitleLink').text().trim(),
    min_price: minPrice,
    // Falls back to the minimum price when the maximum parses to NaN (or 0).
    max_price: parseFloat($('span.MaxPrice')?.text()) || minPrice,
    points: parseFloat($('div.AddtoCartUnderText span').text()),
    variants: [...$('div.productDetailClassicRnd')].map(variant => {
      const $field = $(variant).find('input')
      const item_code = $field.attr('item_code')
      const variant_code = item_code.split('-')[0]
      // Skip the entry whose code equals the requested product code.
      if (variant_code === String(code)) return null
      return {
        code: variant_code,
        title: $field.attr('item_name'),
        // Pending promise; resolves to the saved file name or the default.
        image: downloadImage(
          // Encode every space, not just the first one.
          `${settings.urlVariant}${item_code.replace(/ /g, '%20')}`,
          `${settings.imgPath}${item_code}`,
          savedName,
          () => 'variant_default-VC.jpg'
        )
      }
    }).filter(variant => variant !== null),
    images: [...$('img#imgProduct')].map(image => {
      const source = $(image).attr('src')
      return downloadImage(
        source,
        `${settings.imgPath}${source.split('/').reverse()[0]}`,
        savedName,
        () => 'product_default.jpg'
      )
    }),
    other_images: [...$('img.productDetailOtherIMG')].map(image => {
      const source = $(image).attr('src')
      // Check if the other image is not a default one
      if (/default_\d{1,2}/.test(source)) return null
      return downloadImage(
        source,
        `${settings.imgPath}${source.split('/').reverse()[0]}`,
        savedName,
        () => null
      )
      // Only the synchronous nulls are removed here; a promise that later
      // resolves to null (failed download) stays in the array.
    }).filter(other_image => other_image !== null),
    how_to_use: $('span#HowToUse p')?.text().trim() || "",
    technical_description: $('span#TechnicalDescription p')?.text().trim() || "",
    product_description: $('span#ProductDescription p')?.text().trim() || "",
    bought_with: [...$('a.redirectProductId')].map(item => $(item).attr('href').match(/=(\d+)$/)[1]),
    rank: $('div.productAverageMainDiv').find('i.activeStar').length,
    reviews_count: parseInt($('span#spnReviewCount').text(), 10),
    reviews: [...$('div.customerReviewsMainDiv')].map(review => {
      const $review = $(review)
      return {
        id: ++review_id,
        author: $review.find('div.customerName').text().trim(),
        posted_at: $review.find('div.starIconsForReviews span').text().trim(),
        rank: $review.find('span.productAverageMainDiv').find('i.activeStar').length,
        message: $review.find('div.customerReviewDetail span').text().trim()
      }
    })
  }
}
// Entry point: start the scrape for the product code defined above.
scrapeProduct(code)
我甚至无法从生成的图片名称数组中过滤掉 null 值,因为执行到 filter 函数时,这些 Promise 还没有解决。不知为何,我原本以为
images: downloadImage(
URL,
filepath,
resolve() {},
reject() {}
)
会等到 downloadImage 函数返回、给 images 属性赋好值之后,才去执行 filter 函数。另一方面,我猜测早在 downloadImage 函数有机会解决 Promise 之前,执行流程就已经走到了 filter 函数;按理说我应该在 downloadImage 后面链接一个 .then(),但我做不到,因为 downloadImage 位于 map() 回调的 return 语句里——也就是代码中后面紧跟着 .filter() 的那个 map()。
如有任何帮助,我们将不胜感激!谢谢!
P.S.:我很确定自己忽略了某些基本的(逻辑上的)东西,或者有什么地方没有理解对。很抱歉浪费你的时间,但我已经为这个问题折腾了两天,实在想不出办法了 ^_^
map() 方法返回的是一个 Promise 数组,因此你需要使用 Promise.all() 或它的某个变体。
例如,下面先得到 “images” 对应的 Promise 数组,然后用 Promise.all() 等待所有 Promise 解决,最后链接一个 then() 来使用解决后的值。
// Excerpt: each map() callback returns the promise produced by downloadImage,
// so imagesPromises is an array of promises, not of file names.
const imagesPromises = [...$('img#imgProduct')].map(image => {
const $image = $(image),
source = $image.attr('src')
return downloadImage(
source,
`${settings.imgPath}${source.split('/').reverse()[0]}`,
result => result.filename.split('/').reverse()[0],
_ => 'product_default.jpg'
)
})
// Promise.all waits for every download; the then() callback receives the
// array of resolved values and can build the final object from them.
return Promise.all(imagesPromises)
.then(images => {
return {
images,
...
}
})
这里有一个可能的实现:
/**
 * Parse a rendered product page and download its images.
 *
 * All downloads (variant images, product images, other images) are started
 * up front and awaited together, so the resolved record contains plain file
 * names and no pending promises — including `variants[].image`.
 *
 * @param {string} body - HTML of the product page.
 * @param {string} code - Product code; the variant carrying this code is skipped.
 * @returns {Promise<object>} Resolves with the fully populated product record.
 */
async function loadProduct(body, code) {
  // Local binding; a bare `$ = ...` would create an implicit global.
  const $ = cheerio.load(body)
  // Last path segment of the file name reported by the downloader.
  const savedName = result => result.filename.split('/').reverse()[0]
  const minPrice = parseFloat($('span.MinPrice').text())

  // Everything that needs no download is read synchronously, before the first
  // await, so review ids are handed out in one uninterrupted pass.
  const product = {
    title: $('li.LongName').text().trim(),
    category: $('a#categoryTitleLink').text().trim(),
    min_price: minPrice,
    // Falls back to the minimum price when the maximum parses to NaN (or 0).
    max_price: parseFloat($('span.MaxPrice')?.text()) || minPrice,
    points: parseFloat($('div.AddtoCartUnderText span').text()),
    how_to_use: $('span#HowToUse p')?.text().trim() || "",
    technical_description: $('span#TechnicalDescription p')?.text().trim() || "",
    product_description: $('span#ProductDescription p')?.text().trim() || "",
    bought_with: [...$('a.redirectProductId')].map(item => $(item).attr('href').match(/=(\d+)$/)[1]),
    rank: $('div.productAverageMainDiv').find('i.activeStar').length,
    reviews_count: parseInt($('span#spnReviewCount').text(), 10),
    reviews: [...$('div.customerReviewsMainDiv')].map(review => {
      const $review = $(review)
      return {
        id: ++review_id,
        author: $review.find('div.customerName').text().trim(),
        posted_at: $review.find('div.starIconsForReviews span').text().trim(),
        rank: $review.find('span.productAverageMainDiv').find('i.activeStar').length,
        message: $review.find('div.customerReviewDetail span').text().trim()
      }
    })
  }

  // One promise per kept variant, each resolving to the finished variant record.
  const variantPromises = [...$('div.productDetailClassicRnd')]
    .map(variant => {
      const $field = $(variant).find('input')
      const item_code = $field.attr('item_code')
      const variant_code = item_code.split('-')[0]
      // Skip the entry whose code equals the requested product code.
      if (variant_code === String(code)) return null
      return downloadImage(
        // Encode every space, not just the first one.
        `${settings.urlVariant}${item_code.replace(/ /g, '%20')}`,
        `${settings.imgPath}${item_code}`,
        savedName,
        () => 'variant_default-VC.jpg'
      ).then(image => ({
        code: variant_code,
        title: $field.attr('item_name'),
        image
      }))
    })
    .filter(variant => variant !== null)

  const imagePromises = [...$('img#imgProduct')].map(image => {
    const source = $(image).attr('src')
    return downloadImage(
      source,
      `${settings.imgPath}${source.split('/').reverse()[0]}`,
      savedName,
      () => 'product_default.jpg'
    )
  })

  const otherImagePromises = [...$('img.productDetailOtherIMG')]
    .map(image => $(image).attr('src'))
    // Check if the other image is not a default one
    .filter(source => !/default_\d{1,2}/.test(source))
    .map(source => downloadImage(
      source,
      `${settings.imgPath}${source.split('/').reverse()[0]}`,
      savedName,
      () => null
    ))

  // The three groups are independent, so wait for them together.
  const [variants, images, otherImages] = await Promise.all([
    Promise.all(variantPromises),
    Promise.all(imagePromises),
    Promise.all(otherImagePromises)
  ])

  return {
    ...product,
    variants,
    images,
    // A failed download resolves to null; drop those entries.
    other_images: otherImages.filter(other_image => other_image !== null)
  }
}
// Log the value that scrapeProduct's promise resolves with.
scrapeProduct(code).then(product => console.log(product))