RLE 实施偶尔会失败

Question

我为二进制文件的 RLE 压缩实现了以下两个函数。

char* RLEcompress(char* data, size_t origSize, size_t* compressedSize) {
    char* ret = calloc(2 * origSize, 1);
    size_t retIdx = 0, inIdx = 0;
    size_t retSize = 0;
    while (inIdx < origSize) {
        size_t count = 1;
        size_t contIdx = inIdx;
        while (contIdx < origSize - 1 && data[inIdx] == data[++contIdx]) {
            count++;
        }
        size_t tmpCount = count;

        // break down counts with 2 or more digits into counts ≤ 9
        while (tmpCount > 9) {
            tmpCount -= 9;
            ret[retIdx++] = data[inIdx];
            ret[retIdx++] = data[inIdx];
            ret[retIdx++] = '9';
            retSize += 3;
        }

        ret[retIdx++] = data[inIdx];
        retSize += 1;
        if (tmpCount > 1) {
            // repeat character (this tells the decompressor that the next digit
            // is in fact the # of consecutive occurrences of this char)
            ret[retIdx++] = data[inIdx];
            // convert single-digit count to dataing
            ret[retIdx++] = '0' + tmpCount;
            retSize += 2;
        }

        inIdx += count;
    }
    *compressedSize = retSize;
    return ret;
}

char* RLEdecompress(char* data, size_t compressedSize, size_t uncompressedSize, size_t extraAllocation) {
    char* ret = calloc(uncompressedSize + extraAllocation, 1);
    size_t retIdx = 0, inIdx = 0;
    while (inIdx < compressedSize) {
        ret[retIdx++] = data[inIdx];
        if (data[inIdx] == data[inIdx + 1]) { // next digit is the # of occurrences
            size_t occ = ((data[inIdx + 2]) - '0');
            for (size_t i = 1; i < occ && retIdx < compressedSize; i++) {
                ret[retIdx++] = data[inIdx];
            }
            inIdx += 2;
        }
        inIdx += 1;
    }
    return ret;
}

它们似乎工作正常，即 diff 在将原始文件与压缩后未压缩的版本进行比较时不会产生任何输出。

但是，每隔一段时间，文件会有所不同，表明某处存在错误。我无法在展示此内容的文件中找到模式，但我将举例说明两者之间的区别。

下图为原图

如您所见，字节 21 在先压缩后未压缩的版本中重复了两次。我无法确定问题所在。不幸的是，这个错误发生在很少的文件上：到目前为止，我只在两个 pdf 文件中观察到它，包括上面显示的那个，但我不能分享它们，因为它是受版权保护的内容，但我正在努力想出另一个文件失败，所以我可以为您提供示例。

我感觉上面的代码有一些“明显”的错误，我只是想念它。非常感谢帮助。

编辑：

这是我用来读取有问题的文件、压缩它然后解压缩它的测试程序。我还在中间步骤将压缩后的文件保存到磁盘以获得更多调试数据。

int main(int argc, char** argv) {
    size_t compsz;
    FILE* fp = fopen(argv[1], "r");
    if (!fp) {
        perror("fp");
        return 1;
    }

    if (fseek(fp, 0L, SEEK_END) == -1) {
        return -1;
    }
    // get file size
    size_t filecontentLen = ftell(fp);
    if (filecontentLen < 0) {
        return -1;
    }
    rewind(fp);

    char* filecontentBuf = calloc(filecontentLen, 1);
    if (!filecontentBuf) {
        fclose(fp);
        errno = ENOMEM;
        return -1;
    }
    // read original
    if (fread(filecontentBuf, sizeof(char), filecontentLen, fp) <= 0) {
        int errnosave = errno;
        if (ferror(fp)) {
            fclose(fp);
            free(filecontentBuf);
            errno = errnosave;
            return -1;
        }
    }
    // write compressed
    char* compressed = RLEcompress(filecontentBuf, filecontentLen, &compsz);
    FILE* fpcompWrite = fopen("compressed", "w+");
    if (fwrite(compressed, compsz, 1, fpcompWrite) == -1) {
        perror("fwrite");
    }
    fclose(fpcompWrite);

    // read compressed
    FILE* fpcompRead = fopen("compressed", "r");
    if (!fpcompRead) {
        perror("fpcompRead");
        return 1;
    }
    char* compBuf = calloc(compsz * 2, 1);
    fread(compBuf, compsz, 1, fpcompRead);
    fclose(fpcompRead);

    // decompress and write file
    char* uncompBuf = RLEdecompress(compBuf, compsz, filecontentLen, 0);
    FILE* funcomp = fopen("uncompressed", "w+");
    fwrite(uncompBuf, filecontentLen, 1, funcomp);
    fclose(funcomp);
}

Answer 1

我认为问题在于

for (size_t i = 1; i < occ && retIdx < compressedSize; i++) {
    ret[retIdx++] = data[inIdx];
}

应在

中更改

for (size_t i = 1; i < occ && retIdx < uncompressedSize; i++) {
    ret[retIdx++] = data[inIdx];
}

在解压缩算法中，由于 redIdx 受到 uncompressedSize 的限制，并且可能在某些极少数情况下它复制的字节数少于应有的字节数。

RLE 实施偶尔会失败

RLE implementation failing every once in a while

c

run-length-encoding