使用 Node 和 cheerio.js 抓取带有 rowspan 属性的 HTML 表格
Scraping HTML Tables with rowspan attributes using node and cheerio.js
我正在尝试从维基百科抓取 HTML 表格数据,但由于 rowspan 属性,我在遍历表格时遇到了很多麻烦。我使用的是 cheerio 包,但被 cheerio 返回的 jQuery 对象弄得晕头转向。这个问题我已经苦苦折腾了好几天,任何帮助都将不胜感激。下面只是我尝试过的代码的一小部分。谢谢
Table URL = "https://en.wikipedia.org/wiki/Stephen_King_bibliography"
const axios = require("axios");
const $ = require("cheerio");
const WIKI_URL = "https://en.wikipedia.org/wiki/Stephen_King_bibliography";
// Fetches `url` with axios, selects the rows of one table on the page, and logs an
// array holding, per row, either the result of item.get() for the row's <td> cells
// or the string "NOT FOUND". Nothing is returned (the async function resolves to
// undefined); any error thrown inside the try block is caught and logged.
const getBooks = async (url) => {
// NOTE(review): scrapedBooks is declared but never read or written afterwards.
const scrapedBooks = [];
try {
const res = await axios.get(url);
// Select the table's body rows, passing the fetched HTML (res.data) as the context.
const htmlParse = $(
"#mw-content-text > div.mw-parser-output > table:nth-child(6) > tbody > tr",
res.data
);
// let filtData = [];
// for (let i = 0; i < htmlParse.length; i++) {
// filtData.push(
// htmlParse[i].children.filter((child) => child.data !== "\n")
// );
// }
// htmlParse[i].children[5].attribs["rowspan"]
//Getting the Row headers
// For every row, collect the non-empty lines of its <th> text; rows without <th>
// text contribute an empty array.
let headers = [];
htmlParse.each((index, el) => {
headers.push(
$(el)
.find("th")
.text()
.split("\n")
.filter((item) => item.length)
);
});
// Keep only the first row's list. NOTE(review): headers is not used after this line.
headers = headers[0];
// For every row, push item.get() when item.attr("rowspan") is truthy, otherwise
// push the placeholder string "NOT FOUND".
let data = [];
htmlParse.each((index, el) => {
let item = $(el).find("td");
// NOTE(review): in jQuery-style APIs .attr(name) on a multi-element selection reads
// only the first matched element - confirm against the cheerio docs. If so, a
// rowspan on any later cell of the row is not detected here.
item.attr("rowspan") ? data.push(item.get()) : data.push("NOT FOUND");
});
console.log(data);
} catch (err) {
// Errors from the request or the parsing above are only logged, not rethrown.
console.log(err);
}
};
// Run the scrape against the bibliography page; the returned promise is not awaited.
getBooks(WIKI_URL);
我之前从未尝试过网页抓取(web scraping),但一直想入门,所以就拿这个问题练了练手。
据我了解,您的问题在于 rowspan 打乱了表格中每行的单元格数量,导致某些行的数据少于应有的 6 项。
由于我采用了一种全新的方法,代码会有所不同,但它确实适用于您的 URL,使用的是同一个包,并解决了您的具体问题。请参阅下面可以正常运行的代码。我添加了注释,以帮助您了解每一步在做什么。
const cheerio = require('cheerio');
// Extracts the header titles and the cell texts of one table, re-inserting the
// cells that are absent from a row because a cell in an earlier row spans down
// into it (rowspan), so rows affected by a rowspan get the spanned values back.
//
// Expects to be in scope: `cheerio` (the cheerio package) and `res`, a response
// object whose `data` property is the page HTML.
// Produces: `table_header` (array of strings) and `table_data` (one array of
// cell strings per <tr>, in document order), and logs both.
// Not handled: colspan, and <th> cells inside body rows (only <td> is read).

// parses HTML
const $ = cheerio.load(res.data);

// targets the specific table with selectors
const html_table = $('#mw-content-text > div.mw-parser-output > table:nth-child(6)');

// gets table header titles: the trimmed text of every <th> in the table
const table_header = html_table
  .find('th')
  .map(function () {
    return $(this).text().trim();
  })
  .toArray();

// Cells still spanning down from earlier rows, indexed by the REAL column they
// occupy: carried[col] = { remaining: rows left to fill, value: cell text }.
// Tracking the real column (instead of the cell's position among the <td>s that
// happen to be present in its own row) keeps values aligned when a spanning
// cell starts in a row that is itself missing cells from an earlier span.
const carried = [];

// one entry per <tr>; a row without <td> cells (e.g. the header row) yields []
const table_data = [];

html_table.find('tbody tr').each(function () {
  const own_cells = $(this).find('td').toArray();
  const cells = [];
  let next_own = 0;

  // Walk the columns left to right until this row's own cells are used up and
  // every column that may still carry a span has been visited.
  for (let col = 0; next_own < own_cells.length || col < carried.length; col++) {
    const span = carried[col];

    if (span && span.remaining > 0) {
      // this column is covered by a cell from an earlier row
      cells.push(span.value);
      span.remaining -= 1;
      continue;
    }

    if (next_own >= own_cells.length) {
      // no cell of this row is left and nothing is carried into this column
      continue;
    }

    const td = $(own_cells[next_own]);
    next_own += 1;

    const value = td.text().trim();
    cells.push(value);

    // a missing or non-numeric rowspan parses to NaN, which fails the comparison
    const rowspan = Number.parseInt(td.attr('rowspan'), 10);
    if (rowspan > 1) {
      // the current row is already filled, so rowspan - 1 rows remain
      carried[col] = { remaining: rowspan - 1, value: value };
    }
  }

  table_data.push(cells);
});

// output the table headers
console.log('table_header', table_header);
// output the table data
console.log('table_data', table_data);
我正在尝试从维基百科抓取 HTML 表格数据,但由于 rowspan 属性,我在遍历表格时遇到了很多麻烦。我使用的是 cheerio 包,但被 cheerio 返回的 jQuery 对象弄得晕头转向。这个问题我已经苦苦折腾了好几天,任何帮助都将不胜感激。下面只是我尝试过的代码的一小部分。谢谢
Table URL = "https://en.wikipedia.org/wiki/Stephen_King_bibliography"
const axios = require("axios");
const $ = require("cheerio");
const WIKI_URL = "https://en.wikipedia.org/wiki/Stephen_King_bibliography";
// Fetches `url` with axios, selects the rows of one table on the page, and logs an
// array holding, per row, either the result of item.get() for the row's <td> cells
// or the string "NOT FOUND". Nothing is returned (the async function resolves to
// undefined); any error thrown inside the try block is caught and logged.
const getBooks = async (url) => {
// NOTE(review): scrapedBooks is declared but never read or written afterwards.
const scrapedBooks = [];
try {
const res = await axios.get(url);
// Select the table's body rows, passing the fetched HTML (res.data) as the context.
const htmlParse = $(
"#mw-content-text > div.mw-parser-output > table:nth-child(6) > tbody > tr",
res.data
);
// let filtData = [];
// for (let i = 0; i < htmlParse.length; i++) {
// filtData.push(
// htmlParse[i].children.filter((child) => child.data !== "\n")
// );
// }
// htmlParse[i].children[5].attribs["rowspan"]
//Getting the Row headers
// For every row, collect the non-empty lines of its <th> text; rows without <th>
// text contribute an empty array.
let headers = [];
htmlParse.each((index, el) => {
headers.push(
$(el)
.find("th")
.text()
.split("\n")
.filter((item) => item.length)
);
});
// Keep only the first row's list. NOTE(review): headers is not used after this line.
headers = headers[0];
// For every row, push item.get() when item.attr("rowspan") is truthy, otherwise
// push the placeholder string "NOT FOUND".
let data = [];
htmlParse.each((index, el) => {
let item = $(el).find("td");
// NOTE(review): in jQuery-style APIs .attr(name) on a multi-element selection reads
// only the first matched element - confirm against the cheerio docs. If so, a
// rowspan on any later cell of the row is not detected here.
item.attr("rowspan") ? data.push(item.get()) : data.push("NOT FOUND");
});
console.log(data);
} catch (err) {
// Errors from the request or the parsing above are only logged, not rethrown.
console.log(err);
}
};
// Run the scrape against the bibliography page; the returned promise is not awaited.
getBooks(WIKI_URL);
我之前从未尝试过网页抓取(web scraping),但一直想入门,所以就拿这个问题练了练手。
据我了解,您的问题在于 rowspan 打乱了表格中每行的单元格数量,导致某些行的数据少于应有的 6 项。
由于我采用了一种全新的方法,代码会有所不同,但它确实适用于您的 URL,使用的是同一个包,并解决了您的具体问题。请参阅下面可以正常运行的代码。我添加了注释,以帮助您了解每一步在做什么。
const cheerio = require('cheerio');
// Extracts the header titles and the cell texts of one table, re-inserting the
// cells that are absent from a row because a cell in an earlier row spans down
// into it (rowspan), so rows affected by a rowspan get the spanned values back.
//
// Expects to be in scope: `cheerio` (the cheerio package) and `res`, a response
// object whose `data` property is the page HTML.
// Produces: `table_header` (array of strings) and `table_data` (one array of
// cell strings per <tr>, in document order), and logs both.
// Not handled: colspan, and <th> cells inside body rows (only <td> is read).

// parses HTML
const $ = cheerio.load(res.data);

// targets the specific table with selectors
const html_table = $('#mw-content-text > div.mw-parser-output > table:nth-child(6)');

// gets table header titles: the trimmed text of every <th> in the table
const table_header = html_table
  .find('th')
  .map(function () {
    return $(this).text().trim();
  })
  .toArray();

// Cells still spanning down from earlier rows, indexed by the REAL column they
// occupy: carried[col] = { remaining: rows left to fill, value: cell text }.
// Tracking the real column (instead of the cell's position among the <td>s that
// happen to be present in its own row) keeps values aligned when a spanning
// cell starts in a row that is itself missing cells from an earlier span.
const carried = [];

// one entry per <tr>; a row without <td> cells (e.g. the header row) yields []
const table_data = [];

html_table.find('tbody tr').each(function () {
  const own_cells = $(this).find('td').toArray();
  const cells = [];
  let next_own = 0;

  // Walk the columns left to right until this row's own cells are used up and
  // every column that may still carry a span has been visited.
  for (let col = 0; next_own < own_cells.length || col < carried.length; col++) {
    const span = carried[col];

    if (span && span.remaining > 0) {
      // this column is covered by a cell from an earlier row
      cells.push(span.value);
      span.remaining -= 1;
      continue;
    }

    if (next_own >= own_cells.length) {
      // no cell of this row is left and nothing is carried into this column
      continue;
    }

    const td = $(own_cells[next_own]);
    next_own += 1;

    const value = td.text().trim();
    cells.push(value);

    // a missing or non-numeric rowspan parses to NaN, which fails the comparison
    const rowspan = Number.parseInt(td.attr('rowspan'), 10);
    if (rowspan > 1) {
      // the current row is already filled, so rowspan - 1 rows remain
      carried[col] = { remaining: rowspan - 1, value: value };
    }
  }

  table_data.push(cells);
});

// output the table headers
console.log('table_header', table_header);
// output the table data
console.log('table_data', table_data);