如何使用 puppeteer js 抓取多级链接?
How to scrape multi-level links using puppeteer js?
我正在使用 Puppeteer 抓取某个网站页面中的表格行。我已经有代码可以抓取每一行的内容,并把它们赋给与该行对应的对象。每个表格行中还有一个链接,我需要(用 Puppeteer)在新页面中打开它,抓取其中某个特定元素,把结果赋给同一个对象,最后把带有这个新键的完整对象返回给 Puppeteer。用 Puppeteer 该如何实现?
// Question code: opens the tokenmarket.net blockchain listing, builds one record
// per matched table link, logs the records, and passes them to UpcomingIco.insertMany.
// The intent is for each record's `assets` key to hold text taken from the page
// that the row links to.
// NOTE(review): `assets` is read from `onlink` immediately after `$.get` is called,
// not inside its callback — so it presumably never holds the fetched text.
async function run() {
const browser = await puppeteer.launch({
headless: false
})
const page = await browser.newPage()
await page.goto('https://tokenmarket.net/blockchain/', {waitUntil: 'networkidle0'})
// Presumably a fixed 5000 ms pause before reading the table — confirm against the
// Puppeteer version in use.
await page.waitFor(5000)
// Declared in run()'s scope, i.e. outside the function handed to $$eval below.
// NOTE(review): Puppeteer presumably evaluates that function inside the page, where
// this variable would not be visible — confirm against the page.$$eval documentation.
var onlink = ''
// Map every matched anchor element to a plain record object.
var result = await page.$$eval('table > tbody tr .col-actions a:first-child', (els) => Array.from(els).map(function(el) {
//running ajax requests to load the inner page links.
// Relies on a global `$` (presumably the page's own jQuery) being available.
$.get(el.children[0].href, function(response) {
// Assigns the text of the second table column of the fetched page to `onlink`;
// this only happens once the callback is invoked.
onlink = $(response).find('#page-wrapper > main > div.container > div > table > tbody > tr > td:nth-child(2)').text()
})
// The record is returned right after $.get is called, without waiting for the
// callback above.
return {
icoImgUrl: el.children[0].children[0].children[0].currentSrc,
// null when the first line of the date cell is 'To be announced', otherwise that
// line parsed with `new Date` and rendered as an ISO string.
icoDate: el.children[2].innerText.split('\n').shift() === 'To be announced' ? null : new Date( el.children[2].innerText.split('\n').shift() ).toISOString(),
icoName:el.children[1].children[0].innerText,
link:el.children[1].children[0].children[0].href,
description:el.children[3].innerText,
assets :onlink
}
}))
console.log(result)
// The callback body is empty, so both `error` and `docs` are ignored.
UpcomingIco.insertMany(result, function(error, docs) {})
// Not awaited; also never reached if an earlier await rejects.
browser.close()
}
// The promise returned by run() is neither awaited nor given a rejection handler.
run()
如果您尝试为每个 ICO 页面并行打开一个新选项卡,您最终可能会同时加载 100 多个页面。
所以最好的办法是先收集 URL,然后循环访问它们。
这样还能让代码保持简单、易读。
例如(请参阅代码中的注释):
// Scrape every ICO reachable from the tokenmarket.net blockchain listing by
// walking three levels of links sequentially in a single tab:
//   blockchain listing -> assets page -> ICO page.
// Pages are visited one at a time on purpose, so that a hundred or more tabs
// are never loaded at once.
const browser = await puppeteer.launch({ headless: false });

// One plain object per visited ICO page: { icoName, icoUrl, icoImgUrl }.
const results = [];

try {
  const page = await browser.newPage();
  await page.goto('https://tokenmarket.net/blockchain/');

  // Level 1: gather the assets-page URL of every blockchain row.
  const assetUrls = await page.$$eval(
    '.table-assets > tbody > tr .col-actions a:first-child',
    assetLinks => assetLinks.map(link => link.href)
  );

  // Level 2: visit each assets page one by one.
  for (const assetsUrl of assetUrls) {
    await page.goto(assetsUrl);

    // Collect all ICO URLs before navigating away; element handles from this
    // page would not survive the next goto, plain strings do.
    const icoUrls = await page.$$eval(
      '#page-wrapper > main > div.container > div > table > tbody > tr > td:nth-child(2) a',
      links => links.map(link => link.href)
    );

    // Level 3: visit each ICO page one by one and collect its data.
    for (const icoUrl of icoUrls) {
      await page.goto(icoUrl);
      const icoImgUrl = await page.$eval('#asset-logo-wrapper img', img => img.src);
      const icoName = await page.$eval('h1', h1 => h1.innerText.trim());
      // TODO: Gather all the needed info like description etc here.

      // Push the record itself (not wrapped in an array) so `results` stays a
      // flat array of objects, ready to be logged or bulk-inserted.
      results.push({
        icoName,
        icoUrl,
        icoImgUrl,
      });
    }
  }

  // Results are ready.
  console.log(results);
} finally {
  // Always shut the browser down, even when a navigation or selector lookup
  // throws, and wait for the shutdown to finish.
  await browser.close();
}
我正在使用 Puppeteer 抓取某个网站页面中的表格行。我已经有代码可以抓取每一行的内容,并把它们赋给与该行对应的对象。每个表格行中还有一个链接,我需要(用 Puppeteer)在新页面中打开它,抓取其中某个特定元素,把结果赋给同一个对象,最后把带有这个新键的完整对象返回给 Puppeteer。用 Puppeteer 该如何实现?
// Question code: opens the tokenmarket.net blockchain listing, builds one record
// per matched table link, logs the records, and passes them to UpcomingIco.insertMany.
// The intent is for each record's `assets` key to hold text taken from the page
// that the row links to.
// NOTE(review): `assets` is read from `onlink` immediately after `$.get` is called,
// not inside its callback — so it presumably never holds the fetched text.
async function run() {
const browser = await puppeteer.launch({
headless: false
})
const page = await browser.newPage()
await page.goto('https://tokenmarket.net/blockchain/', {waitUntil: 'networkidle0'})
// Presumably a fixed 5000 ms pause before reading the table — confirm against the
// Puppeteer version in use.
await page.waitFor(5000)
// Declared in run()'s scope, i.e. outside the function handed to $$eval below.
// NOTE(review): Puppeteer presumably evaluates that function inside the page, where
// this variable would not be visible — confirm against the page.$$eval documentation.
var onlink = ''
// Map every matched anchor element to a plain record object.
var result = await page.$$eval('table > tbody tr .col-actions a:first-child', (els) => Array.from(els).map(function(el) {
//running ajax requests to load the inner page links.
// Relies on a global `$` (presumably the page's own jQuery) being available.
$.get(el.children[0].href, function(response) {
// Assigns the text of the second table column of the fetched page to `onlink`;
// this only happens once the callback is invoked.
onlink = $(response).find('#page-wrapper > main > div.container > div > table > tbody > tr > td:nth-child(2)').text()
})
// The record is returned right after $.get is called, without waiting for the
// callback above.
return {
icoImgUrl: el.children[0].children[0].children[0].currentSrc,
// null when the first line of the date cell is 'To be announced', otherwise that
// line parsed with `new Date` and rendered as an ISO string.
icoDate: el.children[2].innerText.split('\n').shift() === 'To be announced' ? null : new Date( el.children[2].innerText.split('\n').shift() ).toISOString(),
icoName:el.children[1].children[0].innerText,
link:el.children[1].children[0].children[0].href,
description:el.children[3].innerText,
assets :onlink
}
}))
console.log(result)
// The callback body is empty, so both `error` and `docs` are ignored.
UpcomingIco.insertMany(result, function(error, docs) {})
// Not awaited; also never reached if an earlier await rejects.
browser.close()
}
// The promise returned by run() is neither awaited nor given a rejection handler.
run()
如果您尝试为每个 ICO 页面并行打开一个新选项卡,您最终可能会同时加载 100 多个页面。
所以最好的办法是先收集 URL,然后循环访问它们。
这样还能让代码保持简单、易读。
例如(请参阅代码中的注释):
// Scrape every ICO reachable from the tokenmarket.net blockchain listing by
// walking three levels of links sequentially in a single tab:
//   blockchain listing -> assets page -> ICO page.
// Pages are visited one at a time on purpose, so that a hundred or more tabs
// are never loaded at once.
const browser = await puppeteer.launch({ headless: false });

// One plain object per visited ICO page: { icoName, icoUrl, icoImgUrl }.
const results = [];

try {
  const page = await browser.newPage();
  await page.goto('https://tokenmarket.net/blockchain/');

  // Level 1: gather the assets-page URL of every blockchain row.
  const assetUrls = await page.$$eval(
    '.table-assets > tbody > tr .col-actions a:first-child',
    assetLinks => assetLinks.map(link => link.href)
  );

  // Level 2: visit each assets page one by one.
  for (const assetsUrl of assetUrls) {
    await page.goto(assetsUrl);

    // Collect all ICO URLs before navigating away; element handles from this
    // page would not survive the next goto, plain strings do.
    const icoUrls = await page.$$eval(
      '#page-wrapper > main > div.container > div > table > tbody > tr > td:nth-child(2) a',
      links => links.map(link => link.href)
    );

    // Level 3: visit each ICO page one by one and collect its data.
    for (const icoUrl of icoUrls) {
      await page.goto(icoUrl);
      const icoImgUrl = await page.$eval('#asset-logo-wrapper img', img => img.src);
      const icoName = await page.$eval('h1', h1 => h1.innerText.trim());
      // TODO: Gather all the needed info like description etc here.

      // Push the record itself (not wrapped in an array) so `results` stays a
      // flat array of objects, ready to be logged or bulk-inserted.
      results.push({
        icoName,
        icoUrl,
        icoImgUrl,
      });
    }
  }

  // Results are ready.
  console.log(results);
} finally {
  // Always shut the browser down, even when a navigation or selector lookup
  // throws, and wait for the shutdown to finish.
  await browser.close();
}