如何使用 node.js 和 cloudflare worker 在现有 HTML 响应中注入 javascript

How to inject javascript in existing HTML response with node.js and cloudflare workers

我有一个指向 GitBook 的自定义域名(vanity URL)。GitBook 不支持插入任意 JavaScript 片段。目前 GitBook 只有 4 个“集成”(integrations)。

我可以通过自己的 VM 服务器转发请求来实现这一点,但我有 CloudFlare,所以想试用 Workers(在 CDN 边缘运行 JavaScript)。

CloudFlare Worker 环境使注入 header 非常容易,但没有明显的方法可以向 HTML 响应正文中注入内容。

使用 TransformStream 进行处理很重要,这样处理是异步的并且不需要内存缓冲(为了可扩展性和最小化 GC)- 只有 5 毫秒 CPU 时间预算。

概览:

  • 要自己使用,请更改字符串 forHeadStart、forHeadEnd、forBodyEnd
  • 推荐使用这种 deferredInjection 方法,它可以最大限度地减少 Worker 的 CPU 时间。它更高效,因为只需要解析 HTML 的开头。另一种方法需要为 headInjection 解析整个 head 部分;如果您使用 bodyInjection,则实际上需要解析整个 HTML 响应。
  • deferredInjection 方法的工作原理是将内容注入到 head 标记的开头,然后在 client-side 运行时将您的 HTML 内容部署到所需位置。
  • 如果需要,您可以使用 headInjection 和/或 bodyInjection 直接注入。取消注释相关代码(包括 injectScripts 中的代码),并把 tagBytes 设置为要编码的标记字符串。
  • 此解决方案将仅解析 HTML 内容类型
  • 此解决方案直接处理字节(而非字符串)以提高效率。正在搜索 end-tag 字符串的字节。
  • 您可以定位更多 end-tag,但通常您不需要定位超过这两个
  • 使用流处理数据(整个 HTML 字符串未缓存在内存中)。这降低了峰值内存使用并加快了第一个字节的时间。
  • 处理结束标记位于文本读取边界上的罕见边缘情况。我相信每 ~1000 字节可能会出现一个边界(每个 TCP 数据包 1000-1500 字节),这可能会因 gzip 压缩而有所不同。
  • 将注入解析代码与转发代码分开,这样注入完成后只需简单地转发其余内容。
  • 如果不需要,您可以通过注释掉第二个 body-tag 注入器来禁用它 - 这将加快处理速度。
  • 我已经亲自测试过这段代码,它可以正常工作。可能仍有残留的错误(取决于结束标记的位置,以及您的服务器是否返回部分 HTML 模板(仅 body))。我今天(2019-06-28)可能修好了其中一个。

代码

// Worker entry point: every 'fetch' event is answered with whatever
// handleRequest resolves to.
addEventListener('fetch', (fetchEvent) => {
  // Per the Cloudflare Workers docs this lets the request fall through to the
  // origin if the Worker throws; it is called before respondWith.
  fetchEvent.passThroughOnException();
  const pendingResponse = handleRequest(fetchEvent.request);
  fetchEvent.respondWith(pendingResponse);
});

/**
 * Fetch the requested resource and, for HTML responses only, stream the body
 * through injectScripts so the configured snippets are added on the fly.
 * Responses that are not HTML, or that carry no body, are returned untouched.
 *
 * @param {Request} request - The incoming request to forward.
 * @returns {Promise<Response>} The upstream response, or a new response whose
 *   body is the rewritten stream (status and headers copied from upstream).
 */
async function handleRequest(request) {
  const response = await fetch(request);

  // headers.get() returns null when the header is absent; media types are
  // case-insensitive, so normalise before comparing.
  const contentType = response.headers.get('content-type') || '';
  if (!contentType.toLowerCase().startsWith('text/html')) {
    return response; // Only parse html body
  }

  if (!response.body) {
    return response; // Nothing to stream, so there is nothing to inject into
  }

  const { readable, writable } = new TransformStream();

  // Intentionally not awaited: the response must be returned while the body is
  // still being pumped. The rejection handler keeps a failed injection from
  // surfacing as an unhandled promise rejection.
  injectScripts(response.body, writable).catch((err) => {
    console.log(err);
  });

  return new Response(readable, response);
}

// TextEncoder always produces UTF-8. Declared with `let` because the script
// clears this reference once every injection job has been encoded.
let encoder = new TextEncoder();

/**
 * Injection job for the "deferred" strategy: a single blob is written right
 * after the opening <head> tag. It contains `forHeadStart` verbatim plus a
 * small client-side helper that later places `forHeadEnd` into document.head
 * and `forBodyEnd` into document.body.
 *
 * Shape: { forInjection: Uint8Array, tagBytes: Uint8Array, insertAfterTag: boolean }
 *
 * Notes:
 *  - The two deferred snippets are base64-encoded with btoa so that their
 *    "</script>" text cannot terminate the helper script early. btoa only
 *    accepts Latin-1 characters, so keep the snippets within that range.
 *  - The helper builds nodes with Range.createContextualFragment rather than
 *    innerHTML: script elements created through innerHTML do not execute when
 *    they are inserted, which would silently disable injected scripts.
 *  - The helper is wrapped in an IIFE and uses addEventListener('load') so it
 *    neither leaks names into the page's global scope nor replaces (or gets
 *    replaced by) the page's own window.onload handler.
 *  - tagBytes is matched byte-for-byte, so only a literal lower-case "<head>"
 *    without attributes is recognised.
 *  - NOTE(review): the head snippet is appended as soon as the helper runs,
 *    i.e. while <head> is presumably still being parsed — confirm the
 *    resulting position relative to the page's own head elements is acceptable.
 */
let deferredInjection = function() {
    let forHeadStart = `<script>var test = 1; //Start of head section</script>`;
    let forHeadEnd = `<script>var test = 2; //End of head section</script>`;
    let forBodyEnd = `<script>var test = 3; //End of body section</script><button>click</button>`;

    let helper = `
    ${forHeadStart}
    <script>
    (function() {
        function appendHtmlTo(element, htmlContent) {
            var fragment = document.createRange().createContextualFragment(htmlContent);
            element.appendChild(fragment);
        }

        var forHeadEnd = "${ btoa(forHeadEnd) }";
        var forBodyEnd = "${ btoa(forBodyEnd) }";

        if (forHeadEnd.length > 0) appendHtmlTo(document.head, atob(forHeadEnd));
        if (forBodyEnd.length > 0) window.addEventListener('load', function() {
            appendHtmlTo(document.body, atob(forBodyEnd));
        });
    })();
    </script>
    `;
    return {
        forInjection: encoder.encode(helper),
        tagBytes: encoder.encode("<head>"),
        insertAfterTag: true
    };

}();

// Optional direct-injection jobs, disabled by default. Each job has the same
// shape as deferredInjection: the bytes to add (forInjection), the tag bytes
// to search for (tagBytes) and the insertAfterTag flag. To use one, uncomment
// it here together with the matching parseForInjection call in injectScripts.
// let headInjection = {
    // forInjection: encoder.encode("<script>var test = 1;</script>"),
    // tagBytes: encoder.encode("</head>"), //case sensitive
    // insertAfterTag: false
// };
// let bodyInjection = {
    // forInjection: encoder.encode("<script>var test = 1;</script>"),
    // tagBytes: encoder.encode("</body>"), //case sensitive
    // insertAfterTag: false
// }

//console.log(bodyTagBytes);
// Every job above has already been encoded, so the encoder reference is
// dropped here (presumably so the instance can be garbage-collected).
encoder = null;

/**
 * Pump `readable` into `writable`, running each enabled injection job on the
 * way and then forwarding whatever is left of the stream unchanged.
 *
 * @param {ReadableStream} readable - Source body (the upstream HTML).
 * @param {WritableStream} writable - Destination; the writable side of the
 *   TransformStream whose readable side is returned to the client.
 * @returns {Promise<void>} Resolves when the whole body has been forwarded.
 *   Rejects with the original error if an injection job fails; in that case
 *   the writable side is aborted first so the client response ends instead of
 *   hanging on a stream that would never be closed.
 */
async function injectScripts(readable, writable) {
  const processingState = {
    readStream: readable,
    writeStream: writable,
    reader: readable.getReader(),
    writer: writable.getWriter(),
    leftOvers: null, //data left over after a closing tag is found
    inputDone: false,
    result: {charactersFound: 0, foundIndex: -1, afterHeadTag: -1} //Reused object for the duration of the request
  };

  try {
    await parseForInjection(processingState, deferredInjection);

    //await parseForInjection(processingState, headInjection);

    //await parseForInjection(processingState, bodyInjection);
  } catch (err) {
    // The writer lock is still held here, so abort through the writer.
    try {
      await processingState.writer.abort(err);
    } catch (abortErr) {
      console.log(abortErr);
    }
    throw err;
  }

  await forwardTheRest(processingState);
}



/**
 * Scan `chunk` from its start for the byte sequence `tagBytes` and report the
 * outcome through the caller-supplied `result` object (reused by the caller to
 * avoid allocating a new object per chunk). Nothing is returned.
 *
 * Outcomes written to `result`:
 *  - no match and no partial match at the end of the chunk:
 *      charactersFound = 0, foundIndex = -1, afterHeadTag = -1
 *  - complete match:
 *      charactersFound = tagBytes.length, foundIndex = index of the first tag
 *      byte, afterHeadTag = index just past the last tag byte
 *  - partial match cut off by the end of the chunk:
 *      0 < charactersFound < tagBytes.length, foundIndex = index where the
 *      partial match starts, afterHeadTag = chunk.length
 */
function searchByteArrayChunkForClosingTag(chunk, tagBytes, result)
{
    const firstTagByte = tagBytes[0];
    const tagLength = tagBytes.length;
    let nextCandidateStart = 0;

    while (true) {
        // Start every attempt from a clean "nothing found" state.
        result.charactersFound = 0;
        result.foundIndex = -1;
        result.afterHeadTag = -1;

        const candidate = chunk.indexOf(firstTagByte, nextCandidateStart);
        if (candidate === -1) {
            return; // The first tag byte does not occur again: definitely not found
        }

        // If this candidate fails, the next attempt resumes one byte later.
        nextCandidateStart = candidate + 1;

        result.foundIndex = candidate;
        result.charactersFound = 1;
        result.afterHeadTag = candidate + 1;

        let mismatch = false;
        for (let offset = 1; offset < tagLength; offset += 1) {
            const position = candidate + offset;
            if (position === chunk.length) {
                return; // Ran out of chunk mid-tag: report the partial match
            }
            if (chunk[position] !== tagBytes[offset]) {
                mismatch = true;
                break;
            }
            result.charactersFound = offset + 1;
            result.afterHeadTag = position + 1;
        }

        if (!mismatch) {
            return; // Every tag byte matched
        }
        // Otherwise loop again; the reset at the top discards this candidate.
    }
}

/**
 * Resume a tag match that was cut off at the end of the previous chunk.
 * `lastSplitResult.charactersFound` says how many leading tag bytes were
 * already matched; the remaining tag bytes are compared against the start of
 * `chunk`. The outcome is written into `result`:
 *  - all remaining bytes match: charactersFound = tagBytes.length,
 *    afterHeadTag = index in `chunk` just past the tag
 *  - a byte differs: charactersFound = 0, afterHeadTag = -1
 *  - `chunk` ends before the tag does: charactersFound holds the running
 *    total, afterHeadTag = chunk.length, and `result` is returned
 * foundIndex is set to minus the carried-over count; a negative value marks a
 * match that started in an earlier chunk.
 *
 * Returns `result` only in the "chunk ended first" case, otherwise undefined.
 */
function continueSearchByteArrayChunkForClosingTag(chunk, tagBytes, lastSplitResult, result)
{
    const carriedOver = lastSplitResult.charactersFound;

    result.charactersFound = carriedOver; // Build on the progress made in the previous chunk
    result.foundIndex = (-1 * carriedOver);
    result.afterHeadTag = 0;

    let chunkPosition = 0;
    let tagPosition = carriedOver;
    while (tagPosition < tagBytes.length) {
        if (chunkPosition === chunk.length) {
            return result; // Chunk is shorter than what is left of the tag
        }
        if (chunk[chunkPosition] !== tagBytes[tagPosition]) {
            result.charactersFound = 0;
            result.afterHeadTag = -1;
            return;
        }
        chunkPosition += 1;
        tagPosition += 1;
        result.charactersFound += 1;
        result.afterHeadTag = chunkPosition;
    }
}

/**
 * Search `chunk` for `tagBytes`, taking a partial match from the previous
 * chunk into account. The outcome is always written into `result`.
 *
 *  - No pending partial match (`lastSplitResult` is null/undefined): run a
 *    fresh search over the chunk; nothing is returned.
 *  - Pending partial match: first try to complete it at the start of the
 *    chunk. If that completes the tag, `result` is returned. Otherwise fall
 *    back to a fresh search over the chunk and return whatever that search
 *    returns.
 */
function continueOrNewSearch(chunk, tagBytes, lastSplitResult, result)
{
    if (lastSplitResult == null) {
        searchByteArrayChunkForClosingTag(chunk, tagBytes, result);
        return;
    }

    continueSearchByteArrayChunkForClosingTag(chunk, tagBytes, lastSplitResult, result);

    const tagCompleted = result.charactersFound === tagBytes.length;
    return tagCompleted
        ? result
        : searchByteArrayChunkForClosingTag(chunk, tagBytes, result); //Keep searching onward
}

/**
 * Copy chunks from the reader to the writer until `injectionJob.tagBytes` is
 * found, then write the tag together with `injectionJob.forInjection` and
 * stash the unread rest of that chunk in `processingState.leftOvers` for the
 * next stage.
 *
 * A tag may be split across two chunks. When a chunk ends in the first few
 * bytes of the tag, those bytes are held back (not written) until the next
 * chunk shows whether the tag completes. If it does not, the held-back bytes
 * are written out unchanged before anything else.
 *
 * @param {object} processingState - Shared per-request state; this function
 *   uses `reader`, `writer`, `result`, `leftOvers` and `inputDone`.
 * @param {object} injectionJob - `{ forInjection, tagBytes, insertAfterTag }`.
 *   When `insertAfterTag` is truthy the output is tag + injection, otherwise
 *   injection + tag.
 * @returns {Promise<true|undefined>} `true` when the input ended before the
 *   tag was found (the writer has then been closed); `undefined` after a
 *   successful injection or when there was nothing to do.
 */
async function parseForInjection(processingState, injectionJob)
{
  if (processingState.inputDone) return; //A previous stage already consumed (and closed) the stream
  if (!injectionJob) return;
  if (!injectionJob.tagBytes) return;
  if (!injectionJob.forInjection) return;

  const reader = processingState.reader;
  const writer = processingState.writer;
  const result = processingState.result;
  const tagBytes = injectionJob.tagBytes;

  // Snapshot of a partial match at the end of the previous chunk. It must be a
  // copy: `result` is reused and overwritten by every search.
  let lastSplitResult = null;

  for (;;) {
    let chunk;
    if (processingState.leftOvers)
    {
      chunk = processingState.leftOvers;
      processingState.leftOvers = null;
    }
    else
    {
      const readerResult = await reader.read();
      chunk = readerResult.value;
      processingState.inputDone = readerResult.done;
    }

    if (processingState.inputDone) {
      if (lastSplitResult !== null) {
        // The document ended in the middle of something that merely looked
        // like the tag: emit the held-back bytes so no content is lost.
        await writer.write(tagBytes.slice(0, lastSplitResult.charactersFound));
      }
      await writer.close();
      return true;
    }

    continueOrNewSearch(chunk, tagBytes, lastSplitResult, result);

    const heldBack = lastSplitResult === null ? 0 : lastSplitResult.charactersFound;
    // A negative foundIndex together with matched characters means the match
    // reported in `result` started in the previous chunk.
    const continued = heldBack > 0 && result.foundIndex < 0 && result.charactersFound > 0;
    lastSplitResult = null;

    if (heldBack > 0 && !continued)
    {
      // The held-back bytes turned out not to start the tag: write them first
      // so they stay ahead of everything in the current chunk.
      await writer.write(tagBytes.slice(0, heldBack));
    }

    if (result.charactersFound === tagBytes.length) //Complete match
    {
      if (!continued && result.foundIndex > 0)
      {
        await writer.write(chunk.slice(0, result.foundIndex));
      }
      console.log('injected');
      if (injectionJob.insertAfterTag)
      {
        await writer.write(injectionJob.tagBytes);
        await writer.write(injectionJob.forInjection);
      }
      else
      {
        await writer.write(injectionJob.forInjection);
        await writer.write(injectionJob.tagBytes);
      }
      // slice's end is exclusive, so omit it to keep the chunk's last byte.
      const remainder = chunk.slice(result.afterHeadTag);
      processingState.leftOvers = remainder.length > 0 ? remainder : null;
      return;
    }

    if (result.charactersFound === 0)
    {
      await writer.write(chunk);
      continue;
    }

    // Partial match at the end of the chunk: write what precedes it and hold
    // the matched bytes back until the next chunk decides.
    lastSplitResult = { charactersFound: result.charactersFound };
    if (!continued && result.foundIndex > 0)
    {
      await writer.write(chunk.slice(0, result.foundIndex));
    }
  }
}

/**
 * Final stage: write any bytes left over from the last injection, then hand
 * the remainder of the source stream to the destination unchanged.
 *
 * Does nothing when the input was already fully consumed by an earlier stage
 * (which closes the writer itself). Errors are logged rather than rethrown,
 * matching the original best-effort behaviour.
 *
 * @param {object} processingState - Shared per-request state; this function
 *   uses `inputDone`, `leftOvers`, `reader`, `writer`, `readStream` and
 *   `writeStream`.
 * @returns {Promise<void>}
 */
async function forwardTheRest(processingState)
{
  try
  {
    if (processingState.inputDone) return; //Input ended during parsing; nothing left to forward

    if (processingState.leftOvers)
    {
      // Block-scoped on purpose: an undeclared `chunk` is an implicit global
      // in sloppy mode and a ReferenceError in strict mode / ES modules.
      const chunk = processingState.leftOvers;
      processingState.leftOvers = null;
      await processingState.writer.write(chunk);
    }

    // pipeTo requires both streams to be unlocked.
    processingState.reader.releaseLock();
    processingState.writer.releaseLock();

    // No explicit close is needed afterwards: per the Streams standard pipeTo
    // closes the destination once the source ends (unless preventClose is set).
    await processingState.readStream.pipeTo(processingState.writeStream);
  }
  catch (e)
  {
    console.log(e);
  }
}

直接使用 (utf-8) 字节的进一步说明:

  • 仅使用字节值。这至少可以通过搜索字符的第一个特征性 utf-8 字节(< 128 或 > 192)来实现。不过在本例中,我们搜索的 </head> 完全由小于 128 的字节组成,处理起来非常容易。
  • 鉴于搜索 utf-8 的性质(这是最棘手的),这应该适用于 ['utf-8'、'utf8'、'iso-8859-1'、'us-ascii'] .您将需要更改代码段编码器以匹配。
  • 这还没有经过全面测试。边界情况在我的测试中没有被触发。理想情况下,我们应该为核心函数准备一个测试平台
  • 感谢 Kenton Varda 挑战我
  • 如果有办法让 CloudFlare Worker 在 forwardTheRest 函数中执行 pipeTo,请告诉我
  • 您可能会发现 continueOrNewSearch 及其两个子函数是一种跨块边界查找多字节序列的有趣方法。到达边界时,我们只记录已经匹配了多少字节,没有必要保留这些字节(我们知道它们是什么)。然后在下一个块中,我们从中断的地方继续。我们总是在标记(tag)周围切分数组缓冲区,并确保使用 tagBytes 写入标记字节