NodeJS Cheerio 库分页网页抓取
NodeJS Cheerio library pagination web scraping
我希望你在这种情况下一切顺利。我对使用 nodejs 和库 cheerio 进行网络抓取(分页)有疑问。已经完成了一些代码,但它只抓取了一页,我已经搜索了几个小时的解决方案,尝试按照一些步骤进行操作,但结果相同,它只抓取了一页。感谢您的任何答复,这是代码:
const request = require('request-promise')
const cheerio = require('cheerio')
const fs = require('fs')
const baseUrl = 'https://indotrading.com/company_hdpe_620' // the website i want to scrape
const outputFile = 'data.csv'
const parsedResults = []
var indexPage = 1
var totalPage = 0
const getWebsiteContent = async (url) => {
try {
request(url).then(function(body){
const $ = cheerio.load(body)
let page = $('.footer-page').children().children().last().children().prop("href") //get last page navigation button
page = page.split("/")
totalPage = page[page.length-1] //total page that website has
//get some data from HTML attribute
$('#products_container #catcom-container').each((key,element) => {
const linkImage = $(element).find('.swiper-wrapper').children().children().children().prop('data-src')
const companyName = $(element).find('.product_title').text().replace(/\n+/g,'')
const companyAddress = $(element).find('i.fa.fa-map-marker.fs-18.mr-5').parent().find('p.d-flex.a-center').text().replace(/\s/,'')
const splitLinkImage = linkImage.split("/")
const companyID = splitLinkImage[splitLinkImage.indexOf("webp")+1]
//calling function phone data based on company id
const getdataPhone = getPhoneData(companyID)
getdataPhone.then(function(result) { // please check this one, is the promise correct?
const listCompanyPhone = JSON.parse(result.d)
const companyPhone = listCompanyPhone.Phone+" , "+listCompanyPhone.Phone2
const Company = {
Name : companyName,
Phone: companyPhone,
Address: companyAddress
}
parsedResults.push(Company)
exportResults(parsedResults)
})
})
})
const nextPageLink = baseUrl+'/'+(++indexPage) // get next page
indexPage++
if(indexPage == totalPage){
exportResults(parsedResults) // exports to csv but not work
return false
}
getWebsiteContent(nextPageLink) //it will not recursive
} catch (error) {
console.log(error)
}
}
//function for get data by calling api and it returns json
function getPhoneData(data) {
try {
var options = {
method : 'POST',
uri : 'https://www.indotrading.com/AjaxMethod.asmx/UpdateCompanyPhoneLeads',
body : {
Token : "EAAAAKTheWTVifIaYce5HmctJuDKNQO5nbySwS3GGi14hbcy0oGq3yqxMhd5sE6349byCw==",
EncCompanyID : data,
ProductID : "undefined"
},
headers : {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
'Content-Type': 'application/json'
},
json: true
}
return request(options).then(function(body){
return body
}).catch(function(error){
console.log(error)
})
} catch (error) {
console.log("get phone data error : "+error)
}
}
//function for export to csv file
const exportResults = (parsedResults) => {
fs.writeFile(outputFile, JSON.stringify(parsedResults, null, 4), (err) => {
if (err) {
console.log(err)
}
console.log(`\n ${parsedResults.length} Results exported successfully to ${outputFile}\n`)
})
}
getWebsiteContent(baseUrl)
我该如何解决这个问题?我只想抓取所有存在的页面
我正在做类似的事情,我建议你这样做....
请求现已弃用,请改用 axios
const axios = require('axios')
const cheerio = require('cheerio')
const fs = require('fs')
const baseUrl = 'https://example.com' // the website url to start scraping from
var parsedResults = [];
const outputFile = 'data.csv'
var saved = false // Added this for monitoring if the scraped data was saved if an error is thrown
var indexPage = 1
var totalPages = 1;
const getWebsiteContent = async (url) => {
try {
axios.get(url).then( res => {
const $ = cheerio.load(res.data)
totalPages = getTotalpages($); // Get the pagination
// Now we have the total pages for the url you want to scrap
// Next we scrape all the data on the respective pages
// Add your code here that scrapes the data
});
})
.catch(err => {
throw(err);
});
indexPage++; // Increment to the next page
if (indexPage == totalPages) {
exportResults(parsedResults) // If we have surpassed the total pages we export the result to CSV
return false
}
const nextPageLink = baseUrl + '......' + indexPage; // get next page
// Add a little timeout to avoid getting banned by the server
setTimeout(() => {
getWebsiteContent(nextPageLink); // Call itself
}, 3000);
}
catch (error) {
console.log(error)
}
finally{
// If results were written successfully to file the end else write whats in memory
if(!saved){
exportResults(parsedResults) ;
}
}
}
// Get the pagination
function getTotalpages(data){
// Extract the total number of pages available and return it as an integer
}
//function for export to csv file
const exportResults = (parsedResults) => {
fs.appendFile(outputFile, JSON.stringify(parsedResults, null, 4), (err) => {
if (err) {
console.log(err)
}
console.log(`\n ${parsedResults.length} Results exported successfully to ${outputFile}\n`);
saved = true;
})
}
getWebsiteContent(baseUrl);
我希望你在这种情况下一切顺利。我对使用 nodejs 和库 cheerio 进行网络抓取(分页)有疑问。已经完成了一些代码,但它只抓取了一页,我已经搜索了几个小时的解决方案,尝试按照一些步骤进行操作,但结果相同,它只抓取了一页。感谢您的任何答复,这是代码:
const request = require('request-promise')
const cheerio = require('cheerio')
const fs = require('fs')
const baseUrl = 'https://indotrading.com/company_hdpe_620' // the website i want to scrape
const outputFile = 'data.csv'
const parsedResults = []
var indexPage = 1
var totalPage = 0
const getWebsiteContent = async (url) => {
try {
request(url).then(function(body){
const $ = cheerio.load(body)
let page = $('.footer-page').children().children().last().children().prop("href") //get last page navigation button
page = page.split("/")
totalPage = page[page.length-1] //total page that website has
//get some data from HTML attribute
$('#products_container #catcom-container').each((key,element) => {
const linkImage = $(element).find('.swiper-wrapper').children().children().children().prop('data-src')
const companyName = $(element).find('.product_title').text().replace(/\n+/g,'')
const companyAddress = $(element).find('i.fa.fa-map-marker.fs-18.mr-5').parent().find('p.d-flex.a-center').text().replace(/\s/,'')
const splitLinkImage = linkImage.split("/")
const companyID = splitLinkImage[splitLinkImage.indexOf("webp")+1]
//calling function phone data based on company id
const getdataPhone = getPhoneData(companyID)
getdataPhone.then(function(result) { // please check this one, is the promise correct?
const listCompanyPhone = JSON.parse(result.d)
const companyPhone = listCompanyPhone.Phone+" , "+listCompanyPhone.Phone2
const Company = {
Name : companyName,
Phone: companyPhone,
Address: companyAddress
}
parsedResults.push(Company)
exportResults(parsedResults)
})
})
})
const nextPageLink = baseUrl+'/'+(++indexPage) // get next page
indexPage++
if(indexPage == totalPage){
exportResults(parsedResults) // exports to csv but not work
return false
}
getWebsiteContent(nextPageLink) //it will not recursive
} catch (error) {
console.log(error)
}
}
//function for get data by calling api and it returns json
function getPhoneData(data) {
try {
var options = {
method : 'POST',
uri : 'https://www.indotrading.com/AjaxMethod.asmx/UpdateCompanyPhoneLeads',
body : {
Token : "EAAAAKTheWTVifIaYce5HmctJuDKNQO5nbySwS3GGi14hbcy0oGq3yqxMhd5sE6349byCw==",
EncCompanyID : data,
ProductID : "undefined"
},
headers : {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
'Content-Type': 'application/json'
},
json: true
}
return request(options).then(function(body){
return body
}).catch(function(error){
console.log(error)
})
} catch (error) {
console.log("get phone data error : "+error)
}
}
//function for export to csv file
const exportResults = (parsedResults) => {
fs.writeFile(outputFile, JSON.stringify(parsedResults, null, 4), (err) => {
if (err) {
console.log(err)
}
console.log(`\n ${parsedResults.length} Results exported successfully to ${outputFile}\n`)
})
}
getWebsiteContent(baseUrl)
我该如何解决这个问题?我只想抓取所有存在的页面
我正在做类似的事情,我建议你这样做....
请求现已弃用,请改用 axios
const axios = require('axios')
const cheerio = require('cheerio')
const fs = require('fs')
const baseUrl = 'https://example.com' // the website url to start scraping from
var parsedResults = [];
const outputFile = 'data.csv'
var saved = false // Added this for monitoring if the scraped data was saved if an error is thrown
var indexPage = 1
var totalPages = 1;
const getWebsiteContent = async (url) => {
try {
axios.get(url).then( res => {
const $ = cheerio.load(res.data)
totalPages = getTotalpages($); // Get the pagination
// Now we have the total pages for the url you want to scrap
// Next we scrape all the data on the respective pages
// Add your code here that scrapes the data
});
})
.catch(err => {
throw(err);
});
indexPage++; // Increment to the next page
if (indexPage == totalPages) {
exportResults(parsedResults) // If we have surpassed the total pages we export the result to CSV
return false
}
const nextPageLink = baseUrl + '......' + indexPage; // get next page
// Add a little timeout to avoid getting banned by the server
setTimeout(() => {
getWebsiteContent(nextPageLink); // Call itself
}, 3000);
}
catch (error) {
console.log(error)
}
finally{
// If results were written successfully to file the end else write whats in memory
if(!saved){
exportResults(parsedResults) ;
}
}
}
// Get the pagination
function getTotalpages(data){
// Extract the total number of pages available and return it as an integer
}
//function for export to csv file
const exportResults = (parsedResults) => {
fs.appendFile(outputFile, JSON.stringify(parsedResults, null, 4), (err) => {
if (err) {
console.log(err)
}
console.log(`\n ${parsedResults.length} Results exported successfully to ${outputFile}\n`);
saved = true;
})
}
getWebsiteContent(baseUrl);