有没有办法 capture/scrape 整个 table 而不是使用 Cheerio 库逐个单元格？

Question

我的 scraper 函数在 O(n^2) 中运行，执行需要 6 秒，我正在寻找优化方法。

我正在抓取的源站点是 www.rate.am/en。下面的屏幕截图

抓取函数

const rp = require("request-promise");
const $ = require("cheerio");
const url = "http://rate.am/en/armenian-dram-exchange-rates/banks/non-cash";

const TABLE_ROW_IDS = [
  "69460818-02ec-456e-8d09-8eeff6494bce",
  "0fffdcc4-8e36-49f3-9863-93ad02ce6541",
  "65351947-217c-4593-9011-941b88ee7baf",
  "8e9bd4c8-6f4a-4663-ae86-b8fbaf295030",
  "ebd241ce-4a38-45a4-9bcd-c6e607079706",
  "466fe84c-197f-4174-bc97-e1dc7960edc7",
  "5ee70183-87fe-4799-802e-ef7f5e7323db",
  "f3ffb6cf-dbb6-4d43-b49c-f6d71350d7fb",
  "b5bb13d2-8a79-43a8-a538-ffd1e2e21009",
  "db08ff22-add9-45ea-a450-1fe5b1993704",
  "2119a3f1-b233-4254-a450-304a2a5bff19",
  "989ba942-a5cf-4fc2-b62e-3248c4edfbbc",
  "e1a68c2e-bc47-4f58-afd2-3b80a8465b14",
  "332c7078-97ad-4bf7-b8ee-44d85a9c88d1",
  "133240fd-5910-421d-b417-5a9cedd5f5f7"
];

const rateScraper = () => {
  return (
    rp(url)
      .then(url => {
        let resultsArr = [];
        for (let i = 0; i < TABLE_ROW_IDS.length; i++) {
          let currencyArr = [];
          let bankName = $(
            `tbody > tr[id=${TABLE_ROW_IDS[i]}] > td:nth-child(2) > a`,
            url
          ).text();

          for (let j = 6; j <= 13; j++) {
            currencyArr.push(
              $(
                `tbody > tr[id=${TABLE_ROW_IDS[i]}] > td:nth-child(${j})`,
                url
              ).text()
            );
          }
          resultsArr.push({
            bankName,
            usd: { buy: currencyArr[0], sell: currencyArr[1] },
            eur: { buy: currencyArr[2], sell: currencyArr[3] },
            rub: { buy: currencyArr[4], sell: currencyArr[5] },
            gbp: { buy: currencyArr[6], sell: currencyArr[7] }
          });
        }
        return resultsArr;
      })
      .catch(error => {
        console.log(error);
      })
  );
};
module.exports = rateScraper;

TABLE_ROW_IDS 是每行唯一的常量。第一个循环遍历每一行，第二个循环提取每个单元格中的各个比率。

我怀疑几十个 Cheerio 调用很昂贵，我正在寻找一种方法来一次提取整个 table 来解析。

编辑：优化代码

const rateScraper = () => {
  let start = Date.now();

  return got(url)
    .then((response) => {
      let resultsArr = [];
      const $ = cheerio.load(response.body);
      $("#rb > tbody > tr")
        .filter((i, el) => TABLE_ROW_IDS.includes(el.attribs.id))
        .each((i, elem) => {
          const cells = $(elem).find("td");
          resultsArr.push({
            bankName: cells.eq(1).text(),
            usd: { buy: cells.eq(5).text(), sell: cells.eq(6).text() },
            eur: { buy: cells.eq(7).text(), sell: cells.eq(8).text() },
            rub: { buy: cells.eq(9).text(), sell: cells.eq(10).text() },
            gbp: { buy: cells.eq(11).text(), sell: cells.eq(12).text() },
          });
        });
      let end = Date.now();
      console.log("time", end - start);
      return resultsArr;
    })
    .catch((error) => {
      console.log(error);
    });
};
module.exports = rateScraper;

Answer 1

所以在我看来，最主要的是你不想让它遍历整个文档来找到每一个东西，你想要找到行，然后只遍历行来获取每个单元格。目前每次你这样做

$(
    `tbody > tr[id=${TABLE_ROW_IDS[i]}] > td:nth-child(${j})`,
    url
).text()

你必须遍历整个事情。

在页面上的纯 JS 中，您可以这样做：

Array.from(
    document.getElementById('rb').querySelectorAll('tr')
).filter((a) => TABLE_ROW_IDS.includes(a.id))
.map((row) => {
    const cells = row.querySelectorAll('td');
    return { 
        bankName: cells[1].innerText, 
        usd: { buy: cells[5].innerText, sell: cells[6].innerText },
        eur: { buy: cells[7].innerText, sell: cells[8].innerText },
        rub: { buy: cells[9].innerText, sell: cells[10].innerText },
        gbp: { buy: cells[11].innerText, sell: cells[12].innerText }
    };
});

所以翻译成cheerio？我猜它会是这样的：

.then(url => {
    const results = [];
    $('#rd > tr', url).filter(() => TABLE_ROW_IDS.includes(this.id)).each((i, elem) => {
        const cells = $(this).find('td');
        results.push({ 
            bankName: cells.eq(1).text(), 
            usd: { buy: cells.eq(5).text(), sell: cells.eq(6).text() },
            eur: { buy: cells.eq(7).text(), sell: cells.eq(8).text() },
            rub: { buy: cells.eq(9).text(), sell: cells.eq(10).text() },
            gbp: { buy: cells.eq(11).text(), sell: cells.eq(12).text() }
        });
    });

有没有办法 capture/scrape 整个 table 而不是使用 Cheerio 库逐个单元格？

Is there a way to capture/scrape an entire table instead of cell by cell with the Cheerio library?

node.js

web-scraping

cheerio