如何从嵌套数组中获得承诺?
How to get promises from nested arrays?
有人可以帮我解决这个问题吗?
我正在尝试抓取网站并将收集到的数据存储在 Json 文件中。我正在使用 cheerios 和请求承诺。
Json 结构是这样的:公司 > 套餐 > 城市
"companies": [
{
"id": 0,
"name": "companyName",
"url": "https://www.url-company.net/",
"packages": [
{
"id": 0,
"name": "general",
"url": "https://www.url-company.net/package",
"cities": [
{
"id": 0,
"name": "cityName",
"url": "https://www.url-company.net/package/city",
},
...]
}
...]
}
..]
我已经从该站点提取了一系列公司。
- 每个公司都有特定的 url --> 我从每个 url 中抓取了
每个公司的包裹。
- 每个 PACKAGE 都有一个特定的 url --> 来自
每个 url 我都想 抓取 每个包裹的城市,但我不是
能做到。
我只能填充公司和 packagesByCompany,但在尝试填充 citiesByPackage 时我迷路了:
const rp = require('request-promise');
const cheerio = require('cheerio');
const jsonfile = require('jsonfile');
const baseUrl = 'https://www.base-url-example.net';
scrapeAll();
function scrapeAll() {
return scrapeCompanies().then(function (dataCompanys) {
//Map every endpoint so we can make a request with each URL
var promises = dataCompanys.map(function (company) {
return scrapePackagesByCompany(company) // Populate each company with all the array of packages from this company
});
return Promise.all(promises);
})
.then(function(promiseArray) { // Need help here!!!!
var promise4all = Promise.all(
promiseArray.map(function(company) {
return Promise.all( // This is NOT working, I do not know how to get promises from nested arrays
company.packages.map(function(package) {
return Promise.all(
scrapeCitiesByPackage(package) // Try to populate each package with all the array of cities from this package
);
})
);
})
);
return promise4all;
})
.then(function (data) {
saveScrapedDateIntoJsonFile(data);
return data;
})
.catch(function (err) {
return Promise.reject(err);
});
}
function scrapeCompanies() {
return rp(baseUrl)
.then(function(html){
const data = [];
let companysImg = '#content section .elementor-container > .elementor-row > .elementor-element.elementor-top-column .elementor-widget-wrap .elementor-widget-image >.elementor-widget-container > .elementor-image';
let $ = cheerio.load(html);
$(companysImg).each(function(index, element){
const urlCompany = $(element).find('a').attr('href');
const imgCompany = $(element).find('img').data('lazy-src');
if (urlCompany && imgCompany) {
const nameCompany = urlCompany;
const company = {
id : index,
name: nameCompany,
url : baseUrl + urlCompany,
img: imgCompany,
};
data.push(company);
}
});
return data;
})
.catch(function(err){
//handle error
console.error('errorrr2', err);
});
}
function scrapePackagesByCompany(company) {
return rp(company.url)
.then(function(html){
company.packages = [];
let packagesImg = '#content section .elementor-container > .elementor-row > .elementor-element.elementor-top-column .elementor-widget-wrap .elementor-widget-image >.elementor-widget-container > .elementor-image';
let $ = cheerio.load(html);
$(packagesImg).each(function(index, element){
const urlPackage = $(element).find('a').attr('href');
const imgPackage = $(element).find('img').data('lazy-src');
if (urlPackage && imgPackage) {
const namePackage = urlPackage.text();
const package = {
id : index,
name: namePackage,
url : urlPackage,
img: imgPackage,
};
company.packages.push(package);
}
});
return company;
})
.catch(function(err){
//handle error
console.error('errorrr2', err);
});
}
function scrapeCitiesByPackage(insurancePackage) {
return rp(insurancePackage.url)
.then(function(html){
insurancePackage.cities = [];
let citiesLinks = '#content section .elementor-container > .elementor-row > .elementor-element .elementor-widget.elementor-widget-posts .elementor-posts-container article';
let $ = cheerio.load(html);
$(citiesLinks).each(function(index, element) {
const $linkCity = $(element).find('a');
const urlCity = $linkCity.attr('href');
const nameCity = $linkCity.text();
if (urlCity && nameCity) {
const city = {
id : index,
name: nameCity,
url : urlCity,
};
insurancePackage.cities.push(city);
}
});
return insurancePackage;
})
.catch(function(err){
//handle error
console.error('errorrr2', err);
});
}
function saveScrapedDateIntoJsonFile(data) {
jsonfile.writeFile(
'./data/company.json',
{companies : data },
//data,
{spaces: 2},
function(err) {
console.error('errorrr', err);
});
}
提前致谢:)
您正在尝试的可能会起作用,但可以说 scrapePackagesByCompany()
和 scrapeCitiesByPackage()
只是传送数据并执行所有 "assembly" 工作(即捆绑在 scrapeAll()
中将数组传递到更高级别的对象中)。
你可以这样写:
scrapeAll()
.catch(function(err) {
console.log(err);
});
function scrapeAll() {
return scrapeCompanies()
.then(function(companies) {
return Promise.all(companies.map(function(company) {
return scrapePackagesByCompany(company)
.then(function(packages) {
company.packages = packages; // assembly
return Promise.all(packages.map(function(package) {
return scrapeCitiesByPackage(package)
.then(function(cities) {
package.cities = cities; // assembly
});
}));
});
}))
.then(function() {
return saveScrapedDateIntoJsonFile(companies);
});
});
}
那么简化 scrapePackagesByCompany()
和 scrapeCitiesByPackage(package)
就很简单了,它们分别提供 packages
数组和 cities
数组。
有人可以帮我解决这个问题吗?
我正在尝试抓取网站并将收集到的数据存储在 Json 文件中。我正在使用 cheerios 和请求承诺。
Json 结构是这样的:公司 > 套餐 > 城市
"companies": [
{
"id": 0,
"name": "companyName",
"url": "https://www.url-company.net/",
"packages": [
{
"id": 0,
"name": "general",
"url": "https://www.url-company.net/package",
"cities": [
{
"id": 0,
"name": "cityName",
"url": "https://www.url-company.net/package/city",
},
...]
}
...]
}
..]
我已经从该站点提取了一系列公司。
- 每个公司都有特定的 url --> 我从每个 url 中抓取了 每个公司的包裹。
- 每个 PACKAGE 都有一个特定的 url --> 来自 每个 url 我都想 抓取 每个包裹的城市,但我不是 能做到。
我只能填充公司和 packagesByCompany,但在尝试填充 citiesByPackage 时我迷路了:
const rp = require('request-promise');
const cheerio = require('cheerio');
const jsonfile = require('jsonfile');
const baseUrl = 'https://www.base-url-example.net';
scrapeAll();
function scrapeAll() {
return scrapeCompanies().then(function (dataCompanys) {
//Map every endpoint so we can make a request with each URL
var promises = dataCompanys.map(function (company) {
return scrapePackagesByCompany(company) // Populate each company with all the array of packages from this company
});
return Promise.all(promises);
})
.then(function(promiseArray) { // Need help here!!!!
var promise4all = Promise.all(
promiseArray.map(function(company) {
return Promise.all( // This is NOT working, I do not know how to get promises from nested arrays
company.packages.map(function(package) {
return Promise.all(
scrapeCitiesByPackage(package) // Try to populate each package with all the array of cities from this package
);
})
);
})
);
return promise4all;
})
.then(function (data) {
saveScrapedDateIntoJsonFile(data);
return data;
})
.catch(function (err) {
return Promise.reject(err);
});
}
function scrapeCompanies() {
return rp(baseUrl)
.then(function(html){
const data = [];
let companysImg = '#content section .elementor-container > .elementor-row > .elementor-element.elementor-top-column .elementor-widget-wrap .elementor-widget-image >.elementor-widget-container > .elementor-image';
let $ = cheerio.load(html);
$(companysImg).each(function(index, element){
const urlCompany = $(element).find('a').attr('href');
const imgCompany = $(element).find('img').data('lazy-src');
if (urlCompany && imgCompany) {
const nameCompany = urlCompany;
const company = {
id : index,
name: nameCompany,
url : baseUrl + urlCompany,
img: imgCompany,
};
data.push(company);
}
});
return data;
})
.catch(function(err){
//handle error
console.error('errorrr2', err);
});
}
function scrapePackagesByCompany(company) {
return rp(company.url)
.then(function(html){
company.packages = [];
let packagesImg = '#content section .elementor-container > .elementor-row > .elementor-element.elementor-top-column .elementor-widget-wrap .elementor-widget-image >.elementor-widget-container > .elementor-image';
let $ = cheerio.load(html);
$(packagesImg).each(function(index, element){
const urlPackage = $(element).find('a').attr('href');
const imgPackage = $(element).find('img').data('lazy-src');
if (urlPackage && imgPackage) {
const namePackage = urlPackage.text();
const package = {
id : index,
name: namePackage,
url : urlPackage,
img: imgPackage,
};
company.packages.push(package);
}
});
return company;
})
.catch(function(err){
//handle error
console.error('errorrr2', err);
});
}
function scrapeCitiesByPackage(insurancePackage) {
return rp(insurancePackage.url)
.then(function(html){
insurancePackage.cities = [];
let citiesLinks = '#content section .elementor-container > .elementor-row > .elementor-element .elementor-widget.elementor-widget-posts .elementor-posts-container article';
let $ = cheerio.load(html);
$(citiesLinks).each(function(index, element) {
const $linkCity = $(element).find('a');
const urlCity = $linkCity.attr('href');
const nameCity = $linkCity.text();
if (urlCity && nameCity) {
const city = {
id : index,
name: nameCity,
url : urlCity,
};
insurancePackage.cities.push(city);
}
});
return insurancePackage;
})
.catch(function(err){
//handle error
console.error('errorrr2', err);
});
}
function saveScrapedDateIntoJsonFile(data) {
jsonfile.writeFile(
'./data/company.json',
{companies : data },
//data,
{spaces: 2},
function(err) {
console.error('errorrr', err);
});
}
提前致谢:)
您正在尝试的可能会起作用,但可以说 scrapePackagesByCompany()
和 scrapeCitiesByPackage()
只是传送数据并执行所有 "assembly" 工作(即捆绑在 scrapeAll()
中将数组传递到更高级别的对象中)。
你可以这样写:
scrapeAll()
.catch(function(err) {
console.log(err);
});
function scrapeAll() {
return scrapeCompanies()
.then(function(companies) {
return Promise.all(companies.map(function(company) {
return scrapePackagesByCompany(company)
.then(function(packages) {
company.packages = packages; // assembly
return Promise.all(packages.map(function(package) {
return scrapeCitiesByPackage(package)
.then(function(cities) {
package.cities = cities; // assembly
});
}));
});
}))
.then(function() {
return saveScrapedDateIntoJsonFile(companies);
});
});
}
那么简化 scrapePackagesByCompany()
和 scrapeCitiesByPackage(package)
就很简单了,它们分别提供 packages
数组和 cities
数组。