Error: The resource in the URL content has exceeded the maximum size. - Is it possible to reproduce IMPORTXML by script to avoid problems like this?

Question

=IF(ARRAYFORMULA(JOIN("-",TRIM(IMPORTXML("http://old.statarea.com/","//tr[3]/th[2]/b | //tr[3]/th[3]/b | //tr[3]/th[7]/b | //tr[3]/th[8]/b | //tr[3]/th[9]/b | //tr[3]/th[16]/b"))))="Host-Guest-1-X-2-2.5",
{IMPORTXML(IMPORTXML("http://old.statarea.com/","//tr/td/a[4]/@href"),"//tr/td[2]/a"),
IMPORTXML(IMPORTXML("http://old.statarea.com/","//tr/td/a[4]/@href"),"//tr/td[3]/a"),
ARRAYFORMULA(VALUE(TEXT(1/QUERY(SUBSTITUTE(IMPORTXML(IMPORTXML("http://old.statarea.com/","//tr/td/a[4]/@href"),"//tr/td[7]"),"HX",""),"Where Col1 is not null"),"0.00"))),
ARRAYFORMULA(VALUE(TEXT(1/QUERY(SUBSTITUTE(IMPORTXML(IMPORTXML("http://old.statarea.com/","//tr/td/a[4]/@href"),"//tr/td[8]"),"H2",""),"Where Col1 is not null"),"0.00"))),
ARRAYFORMULA(VALUE(TEXT(1/QUERY(SUBSTITUTE(IMPORTXML(IMPORTXML("http://old.statarea.com/","//tr/td/a[4]/@href"),"//tr/td[9]"),"HX",""),"Where Col1 is not null"),"0.00"))),
ARRAYFORMULA(VALUE(TEXT(1/QUERY(SUBSTITUTE(IMPORTXML(IMPORTXML("http://old.statarea.com/","//tr/td/a[4]/@href"),"//tr/td[16]"),"hc2",""),"Where Col1 is not null"),"0.00")))},
"Off")

直到今天早些时候，它还能完美运行并显示所有数据。但从现在开始，总是出现同样的错误，即使我只尝试导入一些非常简单的内容，比如：

=IMPORTXML("http://old.statarea.com/","//tr/td/a[4]/@href")

我想知道是否可以通过 Google Apps Script (GAS) 重现这个导入（即我放在问题开头的完整公式）。如果有人能告诉我脚本应该怎么写，就不会再遇到大小限制的问题了。

Answer 1

您想将问题中的公式转换为 Google Apps 脚本。
基础 URL 是 http://old.statarea.com/。
您想通过将数据 URL 更改为 IMPORTXML("http://old.statarea.com/","//tr/td/a[4]/@href") 和 IMPORTXML("http://old.statarea.com/","//tr/td/a[5]/@href") 来检索值。
作为值，您想要检索 td[7]、td[8]、td[9] 和 td[16]。
例如，您要将 50% 的检索值计算为 1 / 0.5，并希望将其用作放入电子表格的值。

如果我的理解是正确的，这个答案怎么样？请将此视为几个可能的答案之一。

流程：

在这种情况下，我通过 3 步检索您想要的值。

使用 Google Apps 脚本库解析器从 HTML 中检索部分值。
通过删除不必要的值，使用 XmlService 解析检索到的 HTML。
使用 XmlService 检索结果值。

用法：

1。安装 "Parser"

请安装 "Parser" 的 Google Apps 脚本库。

2。示例脚本 1：

这是一个示例脚本。在此脚本中，您可以将其用作自定义函数。因此，请将 =sample(5) 的公式放入单元格。

/**
 * Custom function that reproduces the question's IMPORTXML formula in Apps Script.
 *
 * Fetches http://old.statarea.com/, follows the `placeOfUrl`-th link found in the
 * centered navigation cell, downloads that page and builds one row per table row
 * whose first cell is `<td width="35" align="center">`. Each output row holds, in
 * column order, the text of every cell that contains an `<a>` element and, for the
 * styled cells at zero-based positions 6, 7, 8 and 15, the value 1 / (percentage / 100)
 * rounded to two decimals (e.g. "50%" becomes 2).
 *
 * Requires the "Parser" Apps Script library to be installed in the project.
 *
 * @param {number|string} placeOfUrl 1-based position of the link to follow.
 * @return {Array<Array<string|number>>|undefined} The parsed rows, or undefined
 *     when there is no usable link at that position.
 * @throws {Error} When either page does not answer with HTTP 200.
 * @customfunction
 */
function sample(placeOfUrl) {
  // Without muteHttpExceptions, fetch() itself throws on non-2xx responses and the
  // status checks below would never run.
  const fetchOptions = { muteHttpExceptions: true };
  // Zero-based cell positions whose percentage is converted (td[7], td[8], td[9], td[16]).
  const percentColumns = [6, 7, 8, 15];

  // Retrieve URL.
  const baseUrl = "http://old.statarea.com/";
  const res1 = UrlFetchApp.fetch(baseUrl, fetchOptions);
  if (res1.getResponseCode() !== 200) throw new Error("URL cannot be used.");
  const from = '<td style="padding-top: 10px; text-align: center;">';
  const to = '&nbsp;&nbsp;</td>';
  // Cut out only the navigation cell and drop the &nbsp; entities so that the
  // fragment can be handed to XmlService.
  const htmlData1 = (from + Parser.data(res1.getContentText()).from(from).to(to).build() + to).replace(/\&nbsp;/g, "");
  const xmlRoot = XmlService.parse(htmlData1).getRootElement();
  const c = xmlRoot.getChildren()[placeOfUrl - 1];
  if (!c) return;
  const href = c.getAttribute("href");
  if (!href) return; // The element at this position is not a link.
  const url = href.getValue();

  // Parse HTML data.
  const res2 = UrlFetchApp.fetch(url, fetchOptions);
  if (res2.getResponseCode() !== 200) throw new Error("URL for retrieving data cannot be used.");
  const htmlData2 = res2.getContentText();
  const parsedData1 = Parser.data(htmlData2).from('<table class="style_1" cellspacing="0" cellpadding="0" width="918" border="0">').to('</table>').build();
  const parsedData2 = Parser.data(parsedData1).from("<tr>").to("</tr>").iterate();
  const data = parsedData2
    .filter((e) => /^<td width="35" align="center">/.test(e))
    .map((e) => {
      // match() returns null when no complete <td>...</td> sits on one line.
      const cells = e.match(/<td.+?\/td>/g) || [];
      // Strip markup and entities that would make the fragment invalid XML.
      const cleaned = cells.map((f) => f.replace(/\&nbsp\;|<div.+?>|<\/div>|<img.+?>|<input.+?>|\&team_guest|<\/h.+?>|\&/g, ""));
      return "<content>" + cleaned.join("") + "</content>";
    })
    .join("");
  const xmlRootContent = XmlService.parse("<root>" + data + "</root>").getRootElement();

  // Retrieve result values.
  const values = xmlRootContent.getChildren().map((row) => {
    const out = [];
    row.getChildren().forEach((cell, j) => {
      if (!cell) return;
      const anchor = cell.getChild("a");
      if (anchor) {
        const text = anchor.getValue();
        if (text) out.push(text);
        return;
      }
      if (!cell.getAttribute("style")) return;
      const v = cell.getValue();
      if (v && percentColumns.includes(j)) {
        out.push(Math.round((1 / (parseInt(v, 10) / 100)) * 100) / 100);
      }
    });
    return out;
  });

  return values;
}

设置=sample(4)时，数据URL与IMPORTXML("http://old.statarea.com/","//tr/td/a[4]/@href")相同。
设置=sample(5)时，数据URL与IMPORTXML("http://old.statarea.com/","//tr/td/a[5]/@href")相同。

结果：

3。示例脚本 2：

请将以下脚本复制并粘贴到脚本编辑器中。在这种情况下，使用容器绑定脚本。当您在脚本编辑器中运行脚本时，值将被放入电子表格。

/**
 * Scrapes the page linked from http://old.statarea.com/ and appends the parsed
 * rows to the sheet named "Sheet5" of the active spreadsheet.
 *
 * The link followed is the `placeOfUrl`-th one found in the centered navigation
 * cell of the home page. One row is built per table row whose first cell is
 * `<td width="35" align="center">`. Each row holds, in column order, the text of
 * every cell that contains an `<a>` element and, for the styled cells at zero-based
 * positions 6, 7, 8 and 15, the value 1 / (percentage / 100) rounded to two
 * decimals. Rows are written below the last used row, starting at column 1.
 *
 * Requires the "Parser" Apps Script library and a container-bound script.
 *
 * @throws {Error} When either page does not answer with HTTP 200, or when the
 *     target sheet does not exist.
 */
function myFunction() {
  var placeOfUrl = "5";  // Here, you can change the URL for retrieving values.

  // Without muteHttpExceptions, fetch() itself throws on non-2xx responses and the
  // status checks below would never run.
  const fetchOptions = { muteHttpExceptions: true };
  // Zero-based cell positions whose percentage is converted (td[7], td[8], td[9], td[16]).
  const percentColumns = [6, 7, 8, 15];

  // Retrieve URL.
  const baseUrl = "http://old.statarea.com/";
  const res1 = UrlFetchApp.fetch(baseUrl, fetchOptions);
  if (res1.getResponseCode() !== 200) throw new Error("URL cannot be used.");
  const from = '<td style="padding-top: 10px; text-align: center;">';
  const to = '&nbsp;&nbsp;</td>';
  // Cut out only the navigation cell and drop the &nbsp; entities so that the
  // fragment can be handed to XmlService.
  const htmlData1 = (from + Parser.data(res1.getContentText()).from(from).to(to).build() + to).replace(/\&nbsp;/g, "");
  const xmlRoot = XmlService.parse(htmlData1).getRootElement();
  const c = xmlRoot.getChildren()[Number(placeOfUrl) - 1];
  if (!c) return;
  const href = c.getAttribute("href");
  if (!href) return; // The element at this position is not a link.
  const url = href.getValue();

  // Parse HTML data.
  const res2 = UrlFetchApp.fetch(url, fetchOptions);
  if (res2.getResponseCode() !== 200) throw new Error("URL for retrieving data cannot be used.");
  const htmlData2 = res2.getContentText();
  const parsedData1 = Parser.data(htmlData2).from('<table class="style_1" cellspacing="0" cellpadding="0" width="918" border="0">').to('</table>').build();
  const parsedData2 = Parser.data(parsedData1).from("<tr>").to("</tr>").iterate();
  const data = parsedData2
    .filter((e) => /^<td width="35" align="center">/.test(e))
    .map((e) => {
      // match() returns null when no complete <td>...</td> sits on one line.
      const cells = e.match(/<td.+?\/td>/g) || [];
      // Strip markup and entities that would make the fragment invalid XML.
      const cleaned = cells.map((f) => f.replace(/\&nbsp\;|<div.+?>|<\/div>|<img.+?>|<input.+?>|\&team_guest|<\/h.+?>|\&/g, ""));
      return "<content>" + cleaned.join("") + "</content>";
    })
    .join("");
  const xmlRootContent = XmlService.parse("<root>" + data + "</root>").getRootElement();

  // Retrieve result values.
  const values = xmlRootContent.getChildren().map((row) => {
    const out = [];
    row.getChildren().forEach((cell, j) => {
      if (!cell) return;
      const anchor = cell.getChild("a");
      if (anchor) {
        const text = anchor.getValue();
        if (text) out.push(text);
        return;
      }
      if (!cell.getAttribute("style")) return;
      const v = cell.getValue();
      if (v && percentColumns.includes(j)) {
        out.push(Math.round((1 / (parseInt(v, 10) / 100)) * 100) / 100);
      }
    });
    return out;
  });

  // Nothing parsed: there is no range to write.
  if (values.length === 0) return;
  const width = Math.max(...values.map((row) => row.length));
  if (width === 0) return;
  // setValues() needs a rectangular array; rows are shorter when a percentage
  // cell was empty, so pad them with blanks on the right.
  const table = values.map((row) => row.concat(Array.from({ length: width - row.length }, () => "")));

  // Put values to Spreadsheet.
  const sheetname = "Sheet5";
  const sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName(sheetname);
  if (!sheet) throw new Error('Sheet "' + sheetname + '" was not found.');
  sheet.getRange(sheet.getLastRow() + 1, 1, table.length, width).setValues(table);
}

设置var placeOfUrl = "4"时，数据URL与IMPORTXML("http://old.statarea.com/","//tr/td/a[4]/@href")相同。
设置var placeOfUrl = "5"时，数据URL与IMPORTXML("http://old.statarea.com/","//tr/td/a[5]/@href")相同。

注：

我确认在您共享的电子表格中的 GAS 项目中使用了 V8。所以上面的脚本也使用了 V8。请注意这一点。
当来自 "http://old.statarea.com/" 的 HTML 数据大小接近 1 MB 时，您的公式可以正常使用；但当其大小接近 2 MB 时，就会出现错误。这一点您在问题中已经提到了。
- 在这种情况下，URL 的位置似乎也发生了变化。当来自 "http://old.statarea.com/" 的 HTML 数据大小接近 1 MB 时，var placeOfUrl = "4" 对应的 URL 与 IMPORTXML("http://old.statarea.com/","//tr/td/a[4]/@href") 返回的 URL 相同；但当 HTML 数据大小接近 2 MB 时，var placeOfUrl = "5" 才与 IMPORTXML("http://old.statarea.com/","//tr/td/a[4]/@href") 返回的 URL 相同。不过我不确定情况是否总是如此，对此我深表歉意。
当URL的页面规格改变时，脚本无法使用。所以请注意这一点。

参考文献：

如果我误解了您的问题并且这不是您想要的方向，我深表歉意。

Error: The resource in the URL content has exceeded the maximum size. - Is it possible to reproduce IMPORTXML by script to avoid problems like this?

Error: The resource in the URL content has exceeded the maximum size. - Is it possible to reproduce IMPORTXML by script to avoid problems like this?

google-sheets

google-apps-script

google-sheets-formula

google-sheets-importxml

流程：

用法：

1。安装 "Parser"

2。示例脚本 1：

3。示例脚本 2：

注：

参考文献：