如何将输入对象传递给 webworker 以便它可以从文件中读取切片 Javascript

How to pass input object to webworker so it can read slices from a file Javascript

所以我使用下面的代码创建了一个输入对象:
var s_curFile;

// Remember the picked File object (module-level) when the user selects a .txt file.
// Only the File wrapper is stored — no file bytes are read here.
function JSprocessFilePicker( input )
{
    // Derive the lowercase extension from the picker's path string.
    const path = input.value;
    const ext = path.substring( path.lastIndexOf( '.' ) + 1 ).toLowerCase();
    const haveFile = !!( input.files && input.files[0] );
    if ( haveFile && ext === "txt" )
    {
        s_curFile = input.files[0];

        //TODO send s_curFile to workers
    }
}

// Build a hidden file picker for .txt files and open it immediately.
var input = document.createElement( "input" );
const pickerAttributes = {
    id: "file_picker",
    type: "file",
    accept: ".txt",
    // inline handler string, same as the original setAttribute call
    onchange: "JSprocessFilePicker(this)",
};
for ( const [ name, value ] of Object.entries( pickerAttributes ) )
{
    input.setAttribute( name, value );
}
input.click();

我想将 s_curFile 发送给 Web Worker,这样我就可以在主线程和 Worker 上同时使用 XMLHttpRequest 从它读取切片,例如:

//on both worker and main thread
const xhrReq = new XMLHttpRequest();
xhrReq.overrideMimeType('text/plain; charset=x-user-defined');
//qwOffset and hSize are determined on the thread
const blobUri = URL.createObjectURL(s_curFile.slice(qwOffset, qwOffset + hSize));
xhrReq.open('GET', blobUri, false); //can i make it async on workers?
xhrReq.send();
URL.revokeObjectURL(blobUri);
const sz = xhrReq.response.length;
for (let Idx = 0; Idx < sz; ++Idx) {
    //do stuff with response
}

我只是在读取文件。那么,我该如何把 s_curFile 发送给 Worker,才能做到这一点?我认为必须从主线程用 .postMessage(...) 把数据发给使用 SharedArrayBuffer 的 Worker,但我要如何填充这个缓冲区?或者有没有别的方法可以做到,因为我相当确定 XMLHttpRequest 也可以在 Worker 中使用。(我需要这个功能,因为用户的本地文件可能超过 30 GB,受每个标签页内存上限的限制,我无法把它全部放进内存,所以我希望用 Worker 来帮助处理这些海量数据)

您可以简单地 postMessage() 您的 File 对象。不会复制底层数据,只会复制包装器对象。

但是请注意,您不应使用 XMLHttpRequest 来读取文件。在较旧的浏览器中应使用 FileReader(在 Web Worker 中甚至可以用 FileReaderSync)及其 .readAsText() 方法;在较新的浏览器中则可以使用 File 的 .text() 方法,它返回一个 Promise,并把内容解析为 UTF-8 文本。

然而,要按块(chunk)读取文本文件,您需要处理多字节字符。在这种字符的中间切开会把它破坏掉:

(async () => {
  // "😱" is a single 4-byte UTF-8 character; slicing the file in the
  // middle of it corrupts both halves when each chunk is decoded alone.
  const file = new File(["😱"], "file.txt");
  const chunk1 = file.slice(0, file.size/2);
  const chunk2 = file.slice(file.size/2);
  const txt1 = await chunk1.text();
  const txt2 = await chunk2.text();
  const all  = await file.text();
  // txt1 and txt2 contain replacement characters; only 'all' is intact
  console.log({txt1, txt2, all});
})();

要避免这种情况,您需要使用 TextDecoder:借助其 .decode() 方法提供的 stream 选项,它能够只在内存中保留上一块末尾未完成的那几个字节,从而重建出正确的字符。

(async () => {
  // Same 4-byte character as before, but now decoded with a stateful TextDecoder.
  const file = new File(["😱"], "file.txt");
  const decoder = new TextDecoder();
  const chunk1 = file.slice(0, file.size/2);
  const chunk2 = file.slice(file.size/2);
  // With { stream: true } the decoder buffers the incomplete trailing
  // bytes of chunk1 and prepends them when decoding chunk2.
  const txt1 = decoder.decode(await chunk1.arrayBuffer(), { stream: true});
  const txt2 = decoder.decode(await chunk2.arrayBuffer(), { stream: true});
  const all  = await file.text();
  // now txt1 is empty and txt2 contains the whole glyph
  console.log({txt1, txt2, all});
})();

但是 TextDecoder 无法在多个 Worker 之间共享,因此当我们把文件拆分给不同的 Worker 时,它并不能真正解决分块边界处的问题。不幸的是,我不知道针对这种情况的简单解决方案。如果速度提升值得冒损坏少数字符的风险,那由你来决定;就我所在的地区而言,我不能冒这个险,因为多字节字符太常见了,受影响的字符会很多。

无论如何,下面是一个接受这种风险的解决方案:它会把你的文件分给尽可能多的可用 CPU 核心,每个核心把自己的块作为流进行处理,并返回它找到的 "A" 的数量。

const inp = document.querySelector("input");
// limit our number of parallel Workers to the number of cores - 1 (for UI),
// but always keep at least one — on a single-core machine the original
// expression yields 0 workers and the file would never be processed.
// (navigator.hardwareConcurrency may also be undefined in old browsers.)
const availableThreads = Math.max(1, (navigator.hardwareConcurrency || 2) - 1);
const workerUrl = buildWorkerURL();
const workers = Array.from({length: availableThreads}, () => new Worker(workerUrl));

// On file selection: split the file into one contiguous byte range per Worker,
// ask each Worker for its count of "A"s, then sum and log the results.
inp.addEventListener("change", async (evt) => {
  const file = inp.files[0];
  if (!file.name.endsWith(".txt")) {
    console.log("not a .txt file");
    return;
  }
  const chunkSize = Math.ceil(file.size / workers.length);
  const numberOfAs = (await Promise.all(workers.map((worker, i) => {
    return new Promise((res, rej) => {
      // we use a MessageChannel to be able to promisify the request to the Worker
      // this way we can handle different parallel requests
      const { port1, port2 } = new MessageChannel();
      worker.onerror = rej;
      port2.onmessage = ({data}) => {
        if (isNaN(data)) {
          // You could handle progress events here if you wish
          rej(data);
          return; // don't also resolve once we've rejected
        }
        res(data);
      };
      // we send only a chunk for convenience
      // the actual data never moves anyway (a Blob slice is just a view)
      const chunk = file.slice(chunkSize * i, chunkSize * (i + 1));
      worker.postMessage(chunk, [port1]);
    });
  })))
    // each worker sent its own count, we have to do the sum here
    .reduce((a, b) => a + b, 0);
  console.log(`The file ${file.name} contains ${numberOfAs} "A"s`);
});


// Grab the inline worker source from the page and expose it as a
// same-origin Blob URL that the Worker constructor can load.
function buildWorkerURL() {
  const source = document.querySelector("script[type=worker]").textContent;
  const workerBlob = new Blob([source], {type: "text/javascript"});
  return URL.createObjectURL(workerBlob);
}
<input type=file>
<!-- our worker script -->
<script type=worker>
  // Receives a Blob chunk + a MessagePort; streams the chunk, counts "a"/"A",
  // and posts the total back on the port when the stream is exhausted.
  onmessage = ({data, ports}) => {
    let found = 0;
    const stream = data.stream();
    const reader = stream.getReader();
    const decoder = new TextDecoder();
    reader.read().then(processChunk);

    function processChunk({done, value}) {
      // 'value' is an Uint8Array (undefined on the final 'done' read)
      // we decode it as UTF-8 text with the 'stream' option, so a multi-byte
      // character split across reads is buffered rather than corrupted
      const chunk = done
        ? decoder.decode() // final call flushes any bytes still buffered
        : decoder.decode(value, { stream: true });
      // do some processing over the chunk of text
      // be careful to NOT leak the data here
      found += (chunk.match(/a/gi) || []).length;
      if (done) {
        // use the sent MessagePort to be able to "promisify"
        // the whole process
        ports[0].postMessage(found);
      }
      else {
        // do it again
        reader.read().then(processChunk);
      }
    }
  };
</script>