我正在尝试在 nodejs 中抓取维基百科页面
I am trying to scrape a wikipedia page in nodejs
我正在尝试抓取维基百科页面 https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue。我的目标是将 table 中的所有 50 行存储到 csv 文件。
但是我做不到。请找到附件中的代码。
// Importing necessary modules
const request = require("request-promise")
const cheerio = require("cheerio")
const fs = require("fs")
const json2csv = require("json2csv").Parser
// page which i want to scrape
const wiki = "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue";
(async () => {
// this will store the data
let data = [];
// making request
const response = await request({
uri: wiki,
headers: {
accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept-encoding": "gzip, deflate, br",
"accept-language": "en-US,en;q=0.9"
},
gzip: true,
});
let $ = cheerio.load(response);
// Extracting rank and name of the company, it will be only for 1st row
// I will run a loop 50 times to get 50 rows. But below two lines are not working
// on testing this line on chrome console, it is giving correct output , but it is not working
// here, same thing for name
let rank = $('table[class="wikitable sortable jquery-tablesorter"] tbody tr:nth-child(1) th').text();
let name = $('table[class="wikitable sortable jquery-tablesorter"] tbody tr:nth-child(1) td:nth-child(2)').text();
// saving data
data.push({
rank,
name,
});
// exporting to csv
const j2cp = new json2csv()
const csv = j2cp.parse(data);
fs.writeFileSync("./imdb1.csv", csv, "utf-8");
}
)();
请告诉我错误在哪里。如果您想了解有关代码的一些信息,我将遵循本教程。 https://www.youtube.com/watch?v=BqGq9MTSt7g
任何帮助表示赞赏。提前致谢。
我尝试了你的代码,似乎使用的 jquery 选择器没问题,但不知何故 cheerio returns null。这是我修改后的代码,已更新以获取其他列:
var req = require("request-promise"),
cheerio = require("cheerio"),
fs = require("fs"),
json2csv = require("json2csv").Parser;
// page which i want to scrape
const wiki = "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue";
(async () => {
const response = await req({
uri: wiki,
headers: {
accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept-encoding": "gzip, deflate, br",
"accept-language": "en-US,en;q=0.9"
},
gzip: true,
}).then(function (html) {
let $ = cheerio.load(html);
let data = [];
let data2 = [];
let name, rank, cols, col;
let rows = $('table.wikitable tbody tr').each((idx, elem) => {
rank =$(elem).find('th').text().replace(/[\n\r]+/g,'');
//name = $(elem).find('td a').html();
data2 = [];
cols = $(elem).find('td').each((colidx, colelem) => {
col = $(colelem).text().replace(/[\n\r]+/g,'');
data2.push(col,);
});
data.push({
rank,
...data2,
});
});
// exporting to csv
const j2cp = new json2csv()
const csv = j2cp.parse(data);
fs.writeFileSync("./imdb1.csv", csv, "utf-8");
}).catch(function (err) {
console.log(err);
});
}
)();
我正在尝试抓取维基百科页面 https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue。我的目标是将 table 中的所有 50 行存储到 csv 文件。
但是我做不到。请找到附件中的代码。
// Importing necessary modules
const request = require("request-promise")
const cheerio = require("cheerio")
const fs = require("fs")
const json2csv = require("json2csv").Parser
// page which i want to scrape
const wiki = "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue";
(async () => {
// this will store the data
let data = [];
// making request
const response = await request({
uri: wiki,
headers: {
accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept-encoding": "gzip, deflate, br",
"accept-language": "en-US,en;q=0.9"
},
gzip: true,
});
let $ = cheerio.load(response);
// Extracting rank and name of the company, it will be only for 1st row
// I will run a loop 50 times to get 50 rows. But below two lines are not working
// on testing this line on chrome console, it is giving correct output , but it is not working
// here, same thing for name
let rank = $('table[class="wikitable sortable jquery-tablesorter"] tbody tr:nth-child(1) th').text();
let name = $('table[class="wikitable sortable jquery-tablesorter"] tbody tr:nth-child(1) td:nth-child(2)').text();
// saving data
data.push({
rank,
name,
});
// exporting to csv
const j2cp = new json2csv()
const csv = j2cp.parse(data);
fs.writeFileSync("./imdb1.csv", csv, "utf-8");
}
)();
请告诉我错误在哪里。如果您想了解有关代码的一些信息,我将遵循本教程。 https://www.youtube.com/watch?v=BqGq9MTSt7g 任何帮助表示赞赏。提前致谢。
我尝试了你的代码,似乎使用的 jquery 选择器没问题,但不知何故 cheerio returns null。这是我修改后的代码,已更新以获取其他列:
var req = require("request-promise"), cheerio = require("cheerio"), fs = require("fs"), json2csv = require("json2csv").Parser; // page which i want to scrape const wiki = "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue"; (async () => { const response = await req({ uri: wiki, headers: { accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "accept-encoding": "gzip, deflate, br", "accept-language": "en-US,en;q=0.9" }, gzip: true, }).then(function (html) { let $ = cheerio.load(html); let data = []; let data2 = []; let name, rank, cols, col; let rows = $('table.wikitable tbody tr').each((idx, elem) => { rank =$(elem).find('th').text().replace(/[\n\r]+/g,''); //name = $(elem).find('td a').html(); data2 = []; cols = $(elem).find('td').each((colidx, colelem) => { col = $(colelem).text().replace(/[\n\r]+/g,''); data2.push(col,); }); data.push({ rank, ...data2, }); }); // exporting to csv const j2cp = new json2csv() const csv = j2cp.parse(data); fs.writeFileSync("./imdb1.csv", csv, "utf-8"); }).catch(function (err) { console.log(err); }); } )();