我正在尝试在 nodejs 中抓取维基百科页面

Question

我正在尝试抓取维基百科页面 https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue。我的目标是将 table 中的所有 50 行存储到 csv 文件。

但是我没能做到。我的代码如下。

// Importing necessary modules

const request = require("request-promise")
const cheerio = require("cheerio")
const fs      = require("fs")
const json2csv = require("json2csv").Parser

// Page to scrape.
const wiki = "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue";

// Script entry point: fetch the page, read the rank and name from the first
// table row, and write them to a CSV file. Wrapped in an async IIFE so that
// `await` can be used for the HTTP request.
(async () => {

    // Rows collected for the CSV export.
    let data = [];

    // Fetch the page HTML, sending browser-like request headers.
    const response = await request({
        uri: wiki,
        headers: { 
                        accept:
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            // NOTE(review): "br" is advertised here — confirm that the HTTP
            // client can actually decode brotli-compressed responses.
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "en-US,en;q=0.9"
        },
        gzip: true,
    });

    // Load the downloaded HTML so it can be queried with CSS selectors.
    let $ = cheerio.load(response);


    // Extract the rank and name of the company for the first row only.
    // The plan is to run a loop 50 times to collect 50 rows, but the two
    // selectors below are reported not to work here, even though the same
    // selectors give the correct output in the Chrome console.
    // NOTE(review): the class list includes `jquery-tablesorter`, which looks
    // like a class added by client-side script in the browser — it is likely
    // absent from the raw HTML parsed here; confirm against `response`.
        let rank = $('table[class="wikitable sortable jquery-tablesorter"] tbody tr:nth-child(1) th').text();

        let name = $('table[class="wikitable sortable jquery-tablesorter"] tbody tr:nth-child(1) td:nth-child(2)').text();

    // Store the extracted values as a single row.
        data.push({
            rank,
            name,
        });

    // Serialize the collected rows to CSV.
    const j2cp = new json2csv()
    const csv = j2cp.parse(data);

    // NOTE(review): the file name "imdb1.csv" is presumably carried over from
    // the tutorial this script was adapted from — confirm the intended name.
    fs.writeFileSync("./imdb1.csv", csv, "utf-8");
}
)();

请告诉我错误在哪里。补充一点背景：这段代码是我参照这个教程写的：https://www.youtube.com/watch?v=BqGq9MTSt7g 。非常感谢任何帮助，提前致谢。

Answer 1

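我尝试了你的代码，你使用的 jQuery 选择器在浏览器中确实没问题，但在 cheerio 中却匹配不到任何内容。原因很可能是：`jquery-tablesorter` 这个 class 是浏览器端脚本在页面加载之后才加到表格上的，服务器返回的原始 HTML 中并没有它，而 cheerio 只解析原始 HTML、不会执行脚本。下面是我修改后的代码，改用 `table.wikitable` 选择器，并已更新以获取其他列：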


    // Third-party modules: HTTP client, HTML parser and CSV serializer.
    const req = require("request-promise");
    const cheerio = require("cheerio");
    const fs = require("fs");
    const json2csv = require("json2csv").Parser;

    // Page to scrape.
    const wiki = "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue";

    // Runs of line breaks that are stripped from the extracted cell text.
    const LINE_BREAKS = /[\n\r]+/g;

    /**
     * Script entry point.
     *
     * Downloads the page at `wiki`, walks every row of every `table.wikitable`
     * on it, and writes one CSV record per data row to ./imdb1.csv. Each record
     * has a `rank` column (text of the row's <th> cell) followed by columns
     * "0", "1", ... holding the text of the row's <td> cells in order.
     *
     * Any failure (network, parsing, CSV generation, file write) is caught and
     * logged; nothing is rethrown.
     */
    (async () => {
        try {
            const html = await req({
                uri: wiki,
                headers: {
                    accept:
                        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
                    // Only advertise encodings the HTTP client can decode; "br"
                    // was dropped because a brotli response would reach cheerio
                    // as undecoded bytes.
                    "accept-encoding": "gzip, deflate",
                    "accept-language": "en-US,en;q=0.9",
                },
                gzip: true,
            });

            const $ = cheerio.load(html);
            const data = [];

            // Select on the `wikitable` class only: classes added by browser
            // scripts are not present in the HTML that cheerio parses.
            $("table.wikitable tbody tr").each((idx, elem) => {
                const cells = $(elem)
                    .find("td")
                    .toArray()
                    .map((colElem) => $(colElem).text().replace(LINE_BREAKS, ""));

                // Header rows consist solely of <th> cells; without this guard
                // they would be exported as a record whose `rank` is all the
                // column titles glued together.
                if (cells.length === 0) {
                    return;
                }

                const rank = $(elem).find("th").text().replace(LINE_BREAKS, "");

                // Spreading the array yields the keys "0", "1", ... which
                // become the CSV column names after `rank`.
                data.push({
                    rank,
                    ...cells,
                });
            });

            // exporting to csv
            const j2cp = new json2csv();
            const csv = j2cp.parse(data);

            fs.writeFileSync("./imdb1.csv", csv, "utf-8");
        } catch (err) {
            // Report failures on stderr so they are not mixed with normal output.
            console.error(err);
        }
    })();

我正在尝试在 nodejs 中抓取维基百科页面

I am trying to scrape a wikipedia page in nodejs

javascript

wikipedia

node.js

web-scraping

cheerio