有没有办法 capture/scrape 整个 table 而不是使用 Cheerio 库逐个单元格?
Is there a way to capture/scrape an entire table instead of cell by cell with the Cheerio library?
我的 scraper 函数在 O(n^2) 中运行,执行需要 6 秒,我正在寻找优化方法。
我正在 抓取 的源站点是 www.rate.am/en。下面的屏幕截图
抓取函数
const rp = require("request-promise");
const $ = require("cheerio");
const url = "http://rate.am/en/armenian-dram-exchange-rates/banks/non-cash";
const TABLE_ROW_IDS = [
"69460818-02ec-456e-8d09-8eeff6494bce",
"0fffdcc4-8e36-49f3-9863-93ad02ce6541",
"65351947-217c-4593-9011-941b88ee7baf",
"8e9bd4c8-6f4a-4663-ae86-b8fbaf295030",
"ebd241ce-4a38-45a4-9bcd-c6e607079706",
"466fe84c-197f-4174-bc97-e1dc7960edc7",
"5ee70183-87fe-4799-802e-ef7f5e7323db",
"f3ffb6cf-dbb6-4d43-b49c-f6d71350d7fb",
"b5bb13d2-8a79-43a8-a538-ffd1e2e21009",
"db08ff22-add9-45ea-a450-1fe5b1993704",
"2119a3f1-b233-4254-a450-304a2a5bff19",
"989ba942-a5cf-4fc2-b62e-3248c4edfbbc",
"e1a68c2e-bc47-4f58-afd2-3b80a8465b14",
"332c7078-97ad-4bf7-b8ee-44d85a9c88d1",
"133240fd-5910-421d-b417-5a9cedd5f5f7"
];
const rateScraper = () => {
return (
rp(url)
.then(url => {
let resultsArr = [];
for (let i = 0; i < TABLE_ROW_IDS.length; i++) {
let currencyArr = [];
let bankName = $(
`tbody > tr[id=${TABLE_ROW_IDS[i]}] > td:nth-child(2) > a`,
url
).text();
for (let j = 6; j <= 13; j++) {
currencyArr.push(
$(
`tbody > tr[id=${TABLE_ROW_IDS[i]}] > td:nth-child(${j})`,
url
).text()
);
}
resultsArr.push({
bankName,
usd: { buy: currencyArr[0], sell: currencyArr[1] },
eur: { buy: currencyArr[2], sell: currencyArr[3] },
rub: { buy: currencyArr[4], sell: currencyArr[5] },
gbp: { buy: currencyArr[6], sell: currencyArr[7] }
});
}
return resultsArr;
})
.catch(error => {
console.log(error);
})
);
};
module.exports = rateScraper;
TABLE_ROW_IDS
是每行唯一的常量。第一个循环遍历每一行,第二个循环提取每个单元格中的各个比率。
我怀疑几十个 Cheerio 调用很昂贵,我正在寻找一种方法来一次提取整个 table 来解析。
编辑:优化代码
const rateScraper = () => {
let start = Date.now();
return got(url)
.then((response) => {
let resultsArr = [];
const $ = cheerio.load(response.body);
$("#rb > tbody > tr")
.filter((i, el) => TABLE_ROW_IDS.includes(el.attribs.id))
.each((i, elem) => {
const cells = $(elem).find("td");
resultsArr.push({
bankName: cells.eq(1).text(),
usd: { buy: cells.eq(5).text(), sell: cells.eq(6).text() },
eur: { buy: cells.eq(7).text(), sell: cells.eq(8).text() },
rub: { buy: cells.eq(9).text(), sell: cells.eq(10).text() },
gbp: { buy: cells.eq(11).text(), sell: cells.eq(12).text() },
});
});
let end = Date.now();
console.log("time", end - start);
return resultsArr;
})
.catch((error) => {
console.log(error);
});
};
module.exports = rateScraper;
所以在我看来,最主要的是你不想让它遍历整个文档来找到每一个东西,你想要找到行,然后只遍历行来获取每个单元格。目前每次你这样做
$(
`tbody > tr[id=${TABLE_ROW_IDS[i]}] > td:nth-child(${j})`,
url
).text()
你必须遍历整个事情。
在页面上的纯 JS 中,您可以这样做:
Array.from(
document.getElementById('rb').querySelectorAll('tr')
).filter((a) => TABLE_ROW_IDS.includes(a.id))
.map((row) => {
const cells = row.querySelectorAll('td');
return {
bankName: cells[1].innerText,
usd: { buy: cells[5].innerText, sell: cells[6].innerText },
eur: { buy: cells[7].innerText, sell: cells[8].innerText },
rub: { buy: cells[9].innerText, sell: cells[10].innerText },
gbp: { buy: cells[11].innerText, sell: cells[12].innerText }
};
});
所以翻译成cheerio?我猜它会是这样的:
.then(url => {
const results = [];
$('#rd > tr', url).filter(() => TABLE_ROW_IDS.includes(this.id)).each((i, elem) => {
const cells = $(this).find('td');
results.push({
bankName: cells.eq(1).text(),
usd: { buy: cells.eq(5).text(), sell: cells.eq(6).text() },
eur: { buy: cells.eq(7).text(), sell: cells.eq(8).text() },
rub: { buy: cells.eq(9).text(), sell: cells.eq(10).text() },
gbp: { buy: cells.eq(11).text(), sell: cells.eq(12).text() }
});
});
我的 scraper 函数在 O(n^2) 中运行,执行需要 6 秒,我正在寻找优化方法。
我正在 抓取 的源站点是 www.rate.am/en。下面的屏幕截图
抓取函数
const rp = require("request-promise");
const $ = require("cheerio");
const url = "http://rate.am/en/armenian-dram-exchange-rates/banks/non-cash";
const TABLE_ROW_IDS = [
"69460818-02ec-456e-8d09-8eeff6494bce",
"0fffdcc4-8e36-49f3-9863-93ad02ce6541",
"65351947-217c-4593-9011-941b88ee7baf",
"8e9bd4c8-6f4a-4663-ae86-b8fbaf295030",
"ebd241ce-4a38-45a4-9bcd-c6e607079706",
"466fe84c-197f-4174-bc97-e1dc7960edc7",
"5ee70183-87fe-4799-802e-ef7f5e7323db",
"f3ffb6cf-dbb6-4d43-b49c-f6d71350d7fb",
"b5bb13d2-8a79-43a8-a538-ffd1e2e21009",
"db08ff22-add9-45ea-a450-1fe5b1993704",
"2119a3f1-b233-4254-a450-304a2a5bff19",
"989ba942-a5cf-4fc2-b62e-3248c4edfbbc",
"e1a68c2e-bc47-4f58-afd2-3b80a8465b14",
"332c7078-97ad-4bf7-b8ee-44d85a9c88d1",
"133240fd-5910-421d-b417-5a9cedd5f5f7"
];
const rateScraper = () => {
return (
rp(url)
.then(url => {
let resultsArr = [];
for (let i = 0; i < TABLE_ROW_IDS.length; i++) {
let currencyArr = [];
let bankName = $(
`tbody > tr[id=${TABLE_ROW_IDS[i]}] > td:nth-child(2) > a`,
url
).text();
for (let j = 6; j <= 13; j++) {
currencyArr.push(
$(
`tbody > tr[id=${TABLE_ROW_IDS[i]}] > td:nth-child(${j})`,
url
).text()
);
}
resultsArr.push({
bankName,
usd: { buy: currencyArr[0], sell: currencyArr[1] },
eur: { buy: currencyArr[2], sell: currencyArr[3] },
rub: { buy: currencyArr[4], sell: currencyArr[5] },
gbp: { buy: currencyArr[6], sell: currencyArr[7] }
});
}
return resultsArr;
})
.catch(error => {
console.log(error);
})
);
};
module.exports = rateScraper;
TABLE_ROW_IDS
是每行唯一的常量。第一个循环遍历每一行,第二个循环提取每个单元格中的各个比率。
我怀疑几十个 Cheerio 调用很昂贵,我正在寻找一种方法来一次提取整个 table 来解析。
编辑:优化代码
const rateScraper = () => {
let start = Date.now();
return got(url)
.then((response) => {
let resultsArr = [];
const $ = cheerio.load(response.body);
$("#rb > tbody > tr")
.filter((i, el) => TABLE_ROW_IDS.includes(el.attribs.id))
.each((i, elem) => {
const cells = $(elem).find("td");
resultsArr.push({
bankName: cells.eq(1).text(),
usd: { buy: cells.eq(5).text(), sell: cells.eq(6).text() },
eur: { buy: cells.eq(7).text(), sell: cells.eq(8).text() },
rub: { buy: cells.eq(9).text(), sell: cells.eq(10).text() },
gbp: { buy: cells.eq(11).text(), sell: cells.eq(12).text() },
});
});
let end = Date.now();
console.log("time", end - start);
return resultsArr;
})
.catch((error) => {
console.log(error);
});
};
module.exports = rateScraper;
所以在我看来,最主要的是你不想让它遍历整个文档来找到每一个东西,你想要找到行,然后只遍历行来获取每个单元格。目前每次你这样做
$(
`tbody > tr[id=${TABLE_ROW_IDS[i]}] > td:nth-child(${j})`,
url
).text()
你必须遍历整个事情。
在页面上的纯 JS 中,您可以这样做:
Array.from(
document.getElementById('rb').querySelectorAll('tr')
).filter((a) => TABLE_ROW_IDS.includes(a.id))
.map((row) => {
const cells = row.querySelectorAll('td');
return {
bankName: cells[1].innerText,
usd: { buy: cells[5].innerText, sell: cells[6].innerText },
eur: { buy: cells[7].innerText, sell: cells[8].innerText },
rub: { buy: cells[9].innerText, sell: cells[10].innerText },
gbp: { buy: cells[11].innerText, sell: cells[12].innerText }
};
});
所以翻译成cheerio?我猜它会是这样的:
.then(url => {
const results = [];
$('#rd > tr', url).filter(() => TABLE_ROW_IDS.includes(this.id)).each((i, elem) => {
const cells = $(this).find('td');
results.push({
bankName: cells.eq(1).text(),
usd: { buy: cells.eq(5).text(), sell: cells.eq(6).text() },
eur: { buy: cells.eq(7).text(), sell: cells.eq(8).text() },
rub: { buy: cells.eq(9).text(), sell: cells.eq(10).text() },
gbp: { buy: cells.eq(11).text(), sell: cells.eq(12).text() }
});
});