使用 Node 和 cheerio.js 抓取带有 rowspan 属性的 HTML 表格

Scraping HTML Tables with rowspan attributes using node and cheerio.js

我正在尝试从维基百科抓取 HTML 表格数据,但由于 rowspan 属性的存在,我在遍历表格时遇到了很多麻烦。我使用的是 cheerio 包,但在 cheerio 返回的 jQuery 对象里迷失了方向。几天来我一直被这个问题困扰,任何帮助都将不胜感激。下面的代码只是我尝试过的代码中的一小部分。谢谢

Table URL = "https://en.wikipedia.org/wiki/Stephen_King_bibliography"


const axios = require("axios");
const $ = require("cheerio");

const WIKI_URL = "https://en.wikipedia.org/wiki/Stephen_King_bibliography";

/**
 * Downloads a page and inspects the rows of the table selected by
 * "#mw-content-text > div.mw-parser-output > table:nth-child(6)".
 *
 * For every row it records either the row's td elements (when at least one
 * of them carries a rowspan attribute) or the string "NOT FOUND". The
 * collected list is logged to the console.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<{headers: string[], data: Array}|undefined>} The column
 *   titles taken from the first row plus the per-row results, or undefined
 *   when the request or parsing failed (the error is logged, not rethrown).
 */
const getBooks = async (url) => {
  try {
    const res = await axios.get(url);
    const htmlParse = $(
      "#mw-content-text > div.mw-parser-output > table:nth-child(6) > tbody > tr",
      res.data
    );

    // Only the first row is used for the titles, so read just that row
    // instead of collecting the th text of every row and discarding the rest.
    const headers = htmlParse
      .first()
      .find("th")
      .text()
      .split("\n")
      .filter((item) => item.length);

    const data = [];
    htmlParse.each((index, el) => {
      const cells = $(el).find("td");
      // .attr() reads the attribute of the first matched element only, so a
      // rowspan on any later cell would be missed; filter the whole set.
      const hasRowspan = cells.filter("[rowspan]").length > 0;
      data.push(hasRowspan ? cells.get() : "NOT FOUND");
    });

    console.log(data);

    return { headers, data };
  } catch (err) {
    console.log(err);
  }
};

// Start the scrape of WIKI_URL. getBooks catches and logs its own errors,
// so the promise it returns is intentionally not awaited here.
getBooks(WIKI_URL);


我之前从未尝试过网页抓取(web scraping),但一直想入门,所以就拿这个问题试了试手。

据我了解,您的问题是 rowspan 打乱了表格中每行的单元格数量,导致某些行的数据少于应有的 6 项。

由于我采用了一种全新的方法,因此代码有所不同,但它确实适用于您的 URL,使用的是相同的程序包,并解决了您的具体问题。可运行的版本请参阅下面的代码。我添加了自己的注释,以帮助您了解代码的执行过程。

const cheerio = require('cheerio');
// Parse the downloaded HTML; `res` is the HTTP response obtained earlier and
// `res.data` holds the page markup.
const $ = cheerio.load(res.data);
// Select the target table by its position inside the article body.
const html_table = $('#mw-content-text > div.mw-parser-output > table:nth-child(6)');
// Header titles: the trimmed text of every th inside the table.
const table_header = html_table.find('th').map(function () { return $(this).text().trim(); }).toArray();

// Every td that carries a non-zero rowspan, described by the row it starts
// on (tr_index), the column it occupies (td_index), how many rows it covers
// (count) and its text (value).
const rowspans = [];
// carried[col] describes the spanning cell currently occupying column `col`:
// `remaining` is the number of following rows that still have to receive a
// copy of `value`.
const carried = [];
// One array of cell texts per tr; rows without any td (for example a row made
// only of th cells) yield an empty array.
const table_data = [];

// Walk the rows top to bottom and rebuild each one column by column. A column
// that is still covered by a spanning cell from an earlier row receives that
// cell's text; every other column consumes the next td actually present in
// the row. Tracking coverage per column keeps the order correct even when a
// spanning cell starts inside the range of another one, which inserting by
// the td's position within its own row does not.
// Limitation: colspan is not taken into account.
html_table.find('tbody tr').each(function (tr_index) {
  const tds = $(this).find('td').toArray();
  const cells = [];
  let next_td = 0;

  for (let col = 0; next_td < tds.length || col < carried.length; col++) {
    const pending = carried[col];
    if (pending && pending.remaining > 0) {
      // This column is filled by a cell declared in an earlier row.
      cells.push(pending.value);
      pending.remaining -= 1;
      continue;
    }
    if (next_td >= tds.length) {
      // No td left for this column; keep scanning for carried cells in the
      // columns further to the right.
      continue;
    }

    const td = $(tds[next_td]);
    next_td += 1;
    const value = td.text().trim();
    // A missing or non-numeric rowspan counts as "does not span".
    const count = Number(td.attr('rowspan')) || 0;
    cells.push(value);
    if (count) {
      rowspans.push({ 'tr_index': tr_index, 'td_index': col, 'count': count, 'value': value });
      // The current row already holds the cell itself, so only count - 1
      // further rows need a copy.
      carried[col] = { 'remaining': count - 1, 'value': value };
    }
  }

  table_data.push(cells);
});
// output the table headers
console.log('table_header', table_header);
// output the table data
console.log('table_data', table_data);