How to use multiple promises in recursion?
I'm trying to write a script that goes to a website, grabs the first 10 links from it, then follows those 10 links, then follows the next 10 links found on each of those 10 pages, and so on until 1000 pages have been visited.
I tried to get this working with a for loop inside a promise plus recursion. Here is my code:
const rp = require('request-promise');
const url = 'http://somewebsite.com/';
const websites = []
const promises = []

const getOnSite = (url, count = 0) => {
  console.log(count, websites.length)
  promises.push(new Promise((resolve, reject) => {
    rp(url)
      .then(async function (html) {
        let links = html.match(/https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/g)
        if (links !== null) {
          links = links.splice(0, 10)
        }
        // NOTE: `emails` is referenced below but never defined in this snippet,
        // so this line throws a ReferenceError as posted
        websites.push({
          url,
          links,
          emails: emails === null ? [] : emails
        })
        if (links !== null) {
          for (let i = 0; i < links.length; i++) {
            if (count < 3) {
              resolve(getOnSite(links[i], count + 1))
            } else {
              resolve()
            }
          }
        } else {
          resolve()
        }
      }).catch(err => {
        resolve()
      })
  }))
}
getOnSite(url)
I think you may want a recursive function that takes three arguments:
- an array of urls to extract links from
- an accumulated array of links
- a limit at which to stop crawling
You would kick it off by calling it with just the root url, and await the result:
const allLinks = await crawl([rootUrl]);
In the initial call, the second and third arguments can take their default values:
async function crawl (urls, accumulated = [], limit = 1000) {
  ...
}
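One note on the kick-off line: `await` at the top level only works in an ES module, so in a CommonJS script like the one in the question it needs to sit inside an async function. A minimal sketch, assuming the `crawl` function below and taking the question's url as the root:
// wrap the kick-off in an async IIFE so `await` is valid in a CommonJS script
(async () => {
  const rootUrl = 'http://somewebsite.com/';  // the starting url from the question
  const allLinks = await crawl([rootUrl]);    // resolves to the accumulated list of links
  console.log(`collected ${allLinks.length} links`);
})();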
The function will fetch each url, extract its links, and recurse until the limit is reached. I haven't tested any of this, but I'm thinking something along these lines:
// limit the number of links followed per page to 10
const perPageLimit = 10;

async function crawl (urls, accumulated = [], limit = 1000) {
  // if the limit has been depleted or we have no urls left,
  // return the accumulated result
  if (limit <= 0 || urls.length === 0) {
    return accumulated;
  }

  // process this batch of urls (at most perPageLimit of them)
  const batch = urls.slice(0, perPageLimit);
  const links = (await Promise.all(
    batch.map(url => fetchHtml(url)  // fetch each url
      .then(extractUrls))            // and extract its links
  )).flat();                         // flatten the per-page arrays into one list

  // then recurse
  return crawl(
    links,                       // newly extracted links from this batch
    [...accumulated, ...links],  // appended to the accumulated list
    limit - batch.length         // reduce the limit by the number of pages visited
  );
}
async function fetchHtml (url) {
  //
}
const extractUrls = (html) => html.match( ... )
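For completeness, here is one way those two helpers might be filled in, reusing request-promise and the link regex already used in the question. Again, a sketch I haven't run, just to show the shape:
const rp = require('request-promise');

async function fetchHtml (url) {
  // request-promise resolves with the response body as a string by default
  return rp(url);
}

// the same link regex from the question; String.prototype.match returns null
// when nothing matches, so fall back to an empty array
const linkPattern = /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/g;
const extractUrls = (html) => html.match(linkPattern) || [];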