Node.js/Axios/Cheerio Web 抓取 - Promises 问题
Node.js/Axios/Cheerio Web Scraping - issue with Promises
我的网络抓取程序有一部分出了问题。
index.js 中的 return res.send(statsArray) 这一行,
在首次运行(使用 npm start)时总是返回一个空数组,只有在至少刷新一次之后才会返回完整的数组。
这里是相关的index.html(如果需要的话):
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<title>Web Scraping App</title>
<meta name="description" content="">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="src/styles.css">
</head>
<body>
<script src="src/app.js" async defer></script>
</body>
</html>
这是我的 app.js,由 index.html 链接的文件:
// Ask the local server for the scraped player names, links, and stats,
// then print whatever comes back (or the error, if anything fails).
(async () => {
  try {
    const reply = await fetch('http://localhost:8000/players')
    const payload = await reply.json()
    console.log(payload)
  } catch (problem) {
    console.log(problem)
  }
})()
这是我的 index.js 的相关部分:
// Port the Express server listens on.
const PORT = 8000
const axios = require('axios')
const cheerio = require('cheerio')
const express = require('express')
const cors = require('cors')
const app = express()
// Register the cors() middleware on the app (no path given).
app.use(cors())
app.listen(PORT , () => console.log(`server running on PORT ${PORT}`))
// Roster page that the /players route requests.
const players = 'https://www.trinethunder.com/sports/sball/2021-22/teams/trine?view=roster'
// Site origin; the route prepends it to each scraped href.
const playerStats = 'https://www.trinethunder.com'
// Declared at module level, so both arrays keep their contents between
// requests for as long as the server process runs.
const playerName = []
const playerLink = []
// GET /players: scrape the roster page for player links, then request each
// player's page and send the collected stat arrays back to the client.
// NOTE(review): the response is sent before any player-page request has
// completed — see the notes at res.send and at the getPlayers() call below.
app.get('/players', (req, res) => {
// Requests the roster page and fills the module-level playerName and
// playerLink arrays, skipping values that are already present.
// Returns a Promise, but nothing here ever calls resolve or reject (the only
// calls are inside the commented-out block), so that Promise never settles.
function getPlayers(){
return new Promise((resolve, reject) => {
axios(players)
.then(response => {
const html = response.data
const $ = cheerio.load(html)
//const error = false;
// First pass: collect the link text, with runs of whitespace collapsed.
$('td.text.pinned-col > a', html).each(function(){
var player = $(this).text()
player = player.replace(/\s\s+/g, ' ').trim();
//if name not yet in array, push to array
if(playerName.indexOf(player) === -1){
playerName.push(player)
}
})
// Second pass over the same anchors: collect href values prefixed with playerStats.
$('td.text.pinned-col > a', html).each(function(){
var link = $(this).attr('href')
//if link not yet in array, push to array
if(playerLink.indexOf(playerStats+link) === -1){
playerLink.push(playerStats+link)
}
})
console.log(playerLink)
/*if (!error){
resolve()
} else {
reject('Error: something went wrong')
}*/
})
})
}
// After a 3000 ms delay, starts one request per entry in playerLink; each
// response handler pushes that page's three arrays onto statsArray.
// Takes no arguments and returns undefined.
// NOTE(review): statsArray is not declared in this excerpt — presumably a
// module-level array like playerLink; confirm.
function getPlayerStats(){
setTimeout(()=>{
for(let i=0; i<playerLink.length; i++){
axios.get(playerLink[i])
.then(response => {
const html = response.data
const $ = cheerio.load(html)
const statName = []
const statDesc = []
const statNum = []
$('h2 > span:nth-child(1)', html).each(function(){
var name = $(this).text()
statName.push(name)
})
$('.stat-title', html).each(function(){
var stat1 = $(this).text()
statDesc.push(stat1)
})
$('.stat-value', html).each(function(){
var stat2 = $(this).text()
statNum.push(stat2)
})
//Conditional is here because sometimes statsArray
//gets filled multiple times
if(statsArray.length <63){
statsArray.push(statName, statDesc, statNum)
}
}).catch(err => console.log(err))
}
// The axios.get calls above are not awaited, so this line runs as soon as the
// loop has started the requests — before any of their .then handlers have
// pushed into statsArray.
// NOTE(review): if statsArray lives at module level, entries pushed by an
// earlier request's late responses would still be present on the next
// request, which would explain data appearing only after a refresh.
return res.send(statsArray)
}, 3000)
}
// getPlayerStats() is invoked right here and its return value (undefined) is
// what .then receives; the function itself is never registered as a callback.
// Its 3000 ms timer is therefore the only thing separating the roster request
// from the per-player requests.
getPlayers()
.then(getPlayerStats())
.catch(err => console.log(err))
})
我一直在想办法遍历每个 URL,尝试过 Promise.all、返回新的 Promise、async/await 关键字等。目前这种写法最接近我想要的结果,但如果有更好的方法,请告诉我。
我只需要在第一次请求时就能得到结果。我使用 Promise 的方式肯定有问题;我抓取的其他所有数据都无需刷新就能正常返回,而且它们都没有使用 Promise。
感谢您的帮助!
我无法从“https://www.trinethunder.com”站点提取数据,因为我的 IP 收到了 403 错误,但理论上下面代码中的修正应该会有所帮助。我唯一不明白的是:既然你并没有用到 playerName 数组,为什么还要获取它。
// Port the Express server listens on.
const PORT = 8000;
const axios = require("axios");
const cheerio = require("cheerio");
const express = require("express");
const cors = require("cors");
const app = express();
// Register the cors() middleware on the app (no path given).
app.use(cors());
app.listen(PORT, () => console.log(`server running on PORT ${PORT}`));
// Roster page that the /players route requests.
const players = "https://www.trinethunder.com/sports/sball/2021-22/teams/trine?view=roster";
// Site origin; the route prepends it to each scraped href.
const playerStats = "https://www.trinethunder.com";
/**
 * GET /players — scrape the roster page, then every player's page, and reply
 * with the collected stats as JSON.
 *
 * The response body is a flat array holding three arrays per player, in
 * roster order: [statName, statDesc, statNum, statName, statDesc, statNum, …].
 * If any upstream request fails, the error is logged and a 500 JSON error is
 * sent, so the client is never left waiting on a request that cannot finish.
 */
app.get("/players", async (req, res) => {
  /**
   * Fetch the roster page and collect the unique player-page URLs.
   * Rejects if the roster request fails.
   * @returns {Promise<string[]>} URLs (playerStats + href) in roster order
   */
  async function getPlayers() {
    const response = await axios(players);
    const html = response.data;
    const $ = cheerio.load(html);
    // A Set drops repeated links while keeping first-seen order.
    const uniqueLinks = new Set();
    $("td.text.pinned-col > a", html).each(function () {
      const href = $(this).attr("href");
      // Skip anchors without an href instead of building "...undefined".
      if (href) {
        uniqueLinks.add(playerStats + href);
      }
    });
    return [...uniqueLinks];
  }

  /** Collect the text of every element matching `selector`, in document order. */
  function collectText($, html, selector) {
    const texts = [];
    $(selector, html).each(function () {
      texts.push($(this).text());
    });
    return texts;
  }

  /**
   * Fetch every player page and extract its stat arrays.
   * Rejects as soon as any player request fails.
   * @param {string[]} playerLink - player-page URLs
   * @returns {Promise<string[][]>} flat list: three arrays per player
   */
  async function getPlayerStats(playerLink) {
    // Promise.all resolves only after every page has been fetched and keeps
    // the results in the same order as playerLink, so no timer is needed.
    const perPlayer = await Promise.all(
      playerLink.map(async (link) => {
        const response = await axios.get(link);
        const html = response.data;
        const $ = cheerio.load(html);
        return [
          collectText($, html, "h2 > span:nth-child(1)"),
          collectText($, html, ".stat-title"),
          collectText($, html, ".stat-value"),
        ];
      })
    );
    // The array is built fresh for each request, so nothing carries over
    // between requests and no length cap is required.
    const statsArray = [];
    for (const [statName, statDesc, statNum] of perPlayer) {
      statsArray.push(statName, statDesc, statNum);
    }
    return statsArray;
  }

  try {
    const playerLink = await getPlayers();
    const statsArray = await getPlayerStats(playerLink);
    // res.json serialises the array and sets a JSON content type.
    res.json(statsArray);
  } catch (err) {
    console.log(err);
    res.status(500).json({ error: "Failed to scrape player stats" });
  }
});
我的网络抓取程序有一部分出了问题。
index.js 中的 return res.send(statsArray) 这一行,
在首次运行(使用 npm start)时总是返回一个空数组,只有在至少刷新一次之后才会返回完整的数组。
这里是相关的index.html(如果需要的话):
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<title>Web Scraping App</title>
<meta name="description" content="">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="src/styles.css">
</head>
<body>
<script src="src/app.js" async defer></script>
</body>
</html>
这是我的 app.js,由 index.html 链接的文件:
// Ask the local server for the scraped player names, links, and stats,
// then print whatever comes back (or the error, if anything fails).
(async () => {
  try {
    const reply = await fetch('http://localhost:8000/players')
    const payload = await reply.json()
    console.log(payload)
  } catch (problem) {
    console.log(problem)
  }
})()
这是我的 index.js 的相关部分:
// Port the Express server listens on.
const PORT = 8000
const axios = require('axios')
const cheerio = require('cheerio')
const express = require('express')
const cors = require('cors')
const app = express()
// Register the cors() middleware on the app (no path given).
app.use(cors())
app.listen(PORT , () => console.log(`server running on PORT ${PORT}`))
// Roster page that the /players route requests.
const players = 'https://www.trinethunder.com/sports/sball/2021-22/teams/trine?view=roster'
// Site origin; the route prepends it to each scraped href.
const playerStats = 'https://www.trinethunder.com'
// Declared at module level, so both arrays keep their contents between
// requests for as long as the server process runs.
const playerName = []
const playerLink = []
// GET /players: scrape the roster page for player links, then request each
// player's page and send the collected stat arrays back to the client.
// NOTE(review): the response is sent before any player-page request has
// completed — see the notes at res.send and at the getPlayers() call below.
app.get('/players', (req, res) => {
// Requests the roster page and fills the module-level playerName and
// playerLink arrays, skipping values that are already present.
// Returns a Promise, but nothing here ever calls resolve or reject (the only
// calls are inside the commented-out block), so that Promise never settles.
function getPlayers(){
return new Promise((resolve, reject) => {
axios(players)
.then(response => {
const html = response.data
const $ = cheerio.load(html)
//const error = false;
// First pass: collect the link text, with runs of whitespace collapsed.
$('td.text.pinned-col > a', html).each(function(){
var player = $(this).text()
player = player.replace(/\s\s+/g, ' ').trim();
//if name not yet in array, push to array
if(playerName.indexOf(player) === -1){
playerName.push(player)
}
})
// Second pass over the same anchors: collect href values prefixed with playerStats.
$('td.text.pinned-col > a', html).each(function(){
var link = $(this).attr('href')
//if link not yet in array, push to array
if(playerLink.indexOf(playerStats+link) === -1){
playerLink.push(playerStats+link)
}
})
console.log(playerLink)
/*if (!error){
resolve()
} else {
reject('Error: something went wrong')
}*/
})
})
}
// After a 3000 ms delay, starts one request per entry in playerLink; each
// response handler pushes that page's three arrays onto statsArray.
// Takes no arguments and returns undefined.
// NOTE(review): statsArray is not declared in this excerpt — presumably a
// module-level array like playerLink; confirm.
function getPlayerStats(){
setTimeout(()=>{
for(let i=0; i<playerLink.length; i++){
axios.get(playerLink[i])
.then(response => {
const html = response.data
const $ = cheerio.load(html)
const statName = []
const statDesc = []
const statNum = []
$('h2 > span:nth-child(1)', html).each(function(){
var name = $(this).text()
statName.push(name)
})
$('.stat-title', html).each(function(){
var stat1 = $(this).text()
statDesc.push(stat1)
})
$('.stat-value', html).each(function(){
var stat2 = $(this).text()
statNum.push(stat2)
})
//Conditional is here because sometimes statsArray
//gets filled multiple times
if(statsArray.length <63){
statsArray.push(statName, statDesc, statNum)
}
}).catch(err => console.log(err))
}
// The axios.get calls above are not awaited, so this line runs as soon as the
// loop has started the requests — before any of their .then handlers have
// pushed into statsArray.
// NOTE(review): if statsArray lives at module level, entries pushed by an
// earlier request's late responses would still be present on the next
// request, which would explain data appearing only after a refresh.
return res.send(statsArray)
}, 3000)
}
// getPlayerStats() is invoked right here and its return value (undefined) is
// what .then receives; the function itself is never registered as a callback.
// Its 3000 ms timer is therefore the only thing separating the roster request
// from the per-player requests.
getPlayers()
.then(getPlayerStats())
.catch(err => console.log(err))
})
我一直在想办法遍历每个 URL,尝试过 Promise.all、返回新的 Promise、async/await 关键字等。目前这种写法最接近我想要的结果,但如果有更好的方法,请告诉我。
我只需要在第一次请求时就能得到结果。我使用 Promise 的方式肯定有问题;我抓取的其他所有数据都无需刷新就能正常返回,而且它们都没有使用 Promise。
感谢您的帮助!
我无法从“https://www.trinethunder.com”站点提取数据,因为我的 IP 收到了 403 错误,但理论上下面代码中的修正应该会有所帮助。我唯一不明白的是:既然你并没有用到 playerName 数组,为什么还要获取它。
// Port the Express server listens on.
const PORT = 8000;
const axios = require("axios");
const cheerio = require("cheerio");
const express = require("express");
const cors = require("cors");
const app = express();
// Register the cors() middleware on the app (no path given).
app.use(cors());
app.listen(PORT, () => console.log(`server running on PORT ${PORT}`));
// Roster page that the /players route requests.
const players = "https://www.trinethunder.com/sports/sball/2021-22/teams/trine?view=roster";
// Site origin; the route prepends it to each scraped href.
const playerStats = "https://www.trinethunder.com";
/**
 * GET /players — scrape the roster page, then every player's page, and reply
 * with the collected stats as JSON.
 *
 * The response body is a flat array holding three arrays per player, in
 * roster order: [statName, statDesc, statNum, statName, statDesc, statNum, …].
 * If any upstream request fails, the error is logged and a 500 JSON error is
 * sent, so the client is never left waiting on a request that cannot finish.
 */
app.get("/players", async (req, res) => {
  /**
   * Fetch the roster page and collect the unique player-page URLs.
   * Rejects if the roster request fails.
   * @returns {Promise<string[]>} URLs (playerStats + href) in roster order
   */
  async function getPlayers() {
    const response = await axios(players);
    const html = response.data;
    const $ = cheerio.load(html);
    // A Set drops repeated links while keeping first-seen order.
    const uniqueLinks = new Set();
    $("td.text.pinned-col > a", html).each(function () {
      const href = $(this).attr("href");
      // Skip anchors without an href instead of building "...undefined".
      if (href) {
        uniqueLinks.add(playerStats + href);
      }
    });
    return [...uniqueLinks];
  }

  /** Collect the text of every element matching `selector`, in document order. */
  function collectText($, html, selector) {
    const texts = [];
    $(selector, html).each(function () {
      texts.push($(this).text());
    });
    return texts;
  }

  /**
   * Fetch every player page and extract its stat arrays.
   * Rejects as soon as any player request fails.
   * @param {string[]} playerLink - player-page URLs
   * @returns {Promise<string[][]>} flat list: three arrays per player
   */
  async function getPlayerStats(playerLink) {
    // Promise.all resolves only after every page has been fetched and keeps
    // the results in the same order as playerLink, so no timer is needed.
    const perPlayer = await Promise.all(
      playerLink.map(async (link) => {
        const response = await axios.get(link);
        const html = response.data;
        const $ = cheerio.load(html);
        return [
          collectText($, html, "h2 > span:nth-child(1)"),
          collectText($, html, ".stat-title"),
          collectText($, html, ".stat-value"),
        ];
      })
    );
    // The array is built fresh for each request, so nothing carries over
    // between requests and no length cap is required.
    const statsArray = [];
    for (const [statName, statDesc, statNum] of perPlayer) {
      statsArray.push(statName, statDesc, statNum);
    }
    return statsArray;
  }

  try {
    const playerLink = await getPlayers();
    const statsArray = await getPlayerStats(playerLink);
    // res.json serialises the array and sets a JSON content type.
    res.json(statsArray);
  } catch (err) {
    console.log(err);
    res.status(500).json({ error: "Failed to scrape player stats" });
  }
});