逐行读取 gzip 流

Read gzip stream line by line

我有一个 gzip 压缩文件,我想逐行读取它。

var fs = require('fs')
var zlib = require('zlib')
// Gunzip transform stream. It is created here but never referenced again in this snippet.
var gunzip = zlib.createGunzip()
// Stream over the raw (still compressed) bytes of test.gz.
var inp = fs.createReadStream('test.gz')
// Counts successful lineProcessing invocations; printed as the "line" number.
var n = 0

/**
 * Callback handed to zlib.gunzip for every chunk read from the file.
 *
 * On success it increments the counter and prints the counter followed by the
 * decompressed data converted to a string. When `err` is set nothing happens,
 * so decompression failures are dropped silently.
 *
 * @param {Error|null} err - Error reported by zlib.gunzip, if any.
 * @param {Buffer} data - Decompressed bytes for the chunk that was passed in.
 */
var lineProcessing = function (err, data) {
    if (!err) {
        n += 1
        console.log ("line: " + n)
        console.log (data.toString())
    }
}

// Each chunk emitted by the file stream is passed to zlib.gunzip on its own,
// so lineProcessing runs once per chunk, not once per text line.
// NOTE(review): a chunk boundary presumably does not line up with a complete
// gzip stream or with a newline - this appears to be the problem being asked about.
inp
  .on('data', function (chunk) {
      zlib.gunzip (chunk, lineProcessing)
  })
  // Printed once the file stream has emitted 'end'.
  .on('end', function () {
    console.log ('ende');
  });

我想我需要为 zlib.createGunzip 设置一个块大小,让它每次只读到下一个 \n 为止。但是该如何动态确定这个大小呢?

为此使用 readline 可能更容易:

const fs       = require('fs');
const zlib     = require('zlib');
const readline = require('readline');

// Decompress test.gz on the fly and let readline split the decompressed
// output into lines.
const gunzipped = fs.createReadStream('test.gz').pipe(zlib.createGunzip());
const lineReader = readline.createInterface({ input: gunzipped });

// Running count of the lines seen so far.
let n = 0;

// For every line: print its 1-based number, then the line itself.
lineReader.on('line', (line) => {
  n += 1;
  console.log(`line: ${n}`);
  console.log(line);
});

如果有人在多年后仍在研究如何做到这一点,并且想要一个适用于 async/await 的解决方案,下面是我的做法(用 TypeScript 编写,但你可以直接去掉类型注解)。

import fs from "fs";
import zlib from "zlib";
import readline from "readline";

/**
 * Opens the gzip file at `path` and returns a readline interface over its
 * decompressed contents. The interface can be consumed with `for await`.
 *
 * @param path - Location of the gzip-compressed file.
 * @returns A readline interface whose input is the gunzipped file stream.
 */
const line$ = (path: string) => {
    // Raw file bytes -> gunzip -> readline.
    const decompressed = fs.createReadStream(path).pipe(zlib.createGunzip());

    // crlfDelay: Infinity makes readline treat "\r\n" as one line break
    // (see the Node.js readline documentation).
    return readline.createInterface({ input: decompressed, crlfDelay: Infinity });
};

/**
 * Example consumer: walks the decompressed lines of a gzip file one at a time
 * using async iteration. Replace the loop body with the real per-line work.
 */
const yourFunction = async () => {
    const lines = line$("/path/to/file.txt.gz");

    for await (const line of lines) {
        // do stuff with line
    }
}

在 TypeScript 中逐行读取纯文本或 gzip 文件:

import * as fs from 'fs';
import * as zlib from 'zlib'
import * as readline from 'readline'

/**
 * Returns a readline interface over the file at `path`.
 *
 * When the path ends in ".gz" (case-insensitive) the file stream is piped
 * through gunzip first; otherwise the file is read as-is.
 *
 * @param path - Location of the plain-text or gzip-compressed file.
 * @returns A readline interface that can be consumed with `for await`.
 */
function readFile(path: string) {
    const raw = fs.createReadStream(path)
    const isGzipped = /\.gz$/i.test(path)

    // Only add the decompression stage when the file name says it is gzipped.
    const input: NodeJS.ReadableStream = isGzipped
        ? raw.pipe(zlib.createGunzip())
        : raw

    return readline.createInterface({ input, crlfDelay: Infinity })
}

/**
 * Prints every line of the gzip-compressed `less` man page to stdout.
 */
async function main() {
    const manPage = '/usr/share/man/man1/less.1.gz'

    for await (const text of readFile(manPage)) {
        console.log(text)
    }
}

// Entry point: run main and, if it rejects, report the failure and exit
// with a non-zero status.
main().catch((failure) => {
    console.error(failure);
    process.exit(1)
})