使用 zlib 和 Crypto++ 缩小文件的大小差异

Size discrepancy in file deflated using zlib and Crypto++

我正在学习如何在 C++ 中原始放气(没有 header 或预告片信息)和膨胀数据,所以我决定尝试 zlibCrypto++ 库。 我发现,压缩同一个文件时,Crypto++ 有时会额外增加 4 个字节(取决于使用的方法)。

例如,对于包含以下序列的文件,包含空格:1 2 3 4 5 6,使用 zlib 压缩生成大小为 14 字节的文件。

这适用于 Crypto++ deflate_method1,但对于 Crypto++ deflate_method2,文件大小为 18 字节。

此外,当尝试扩充使用 Crypto++ deflate_method2Crypto++ inflate_method1 缩小的文件时,会引发异常:

terminate called after throwing an instance of 'CryptoPP::Inflator::UnexpectedEndErr'
  what():  Inflator: unexpected end of compressed block
Aborted (core dumped)

为了比较,我做了另一个测试 deflating/inflating 和 Python:

至此,我想明白两件事:

  1. 为什么压缩后的文件大小不一致?

  2. 为什么 Python 能够膨胀任何文件,但 Crypto++ 很挑剔?


信息和代码:

输入和输出文件为 base64:

Zlib:

#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <iterator>
#include <fstream>
#include <iostream>
#include <sstream>
#include <vector>
#include "zlib.h"

constexpr uint32_t BUFFER_READ_SIZE  = 128;
constexpr uint32_t BUFFER_WRITE_SIZE = 128;

bool mydeflate(std::vector<unsigned char> & input)
{
    const std::string inputStream{ input.begin(), input.end() };
    uint64_t inputSize = input.size();

    // Create a string stream where output will be created.
    std::stringstream outputStringStream(std::ios::in | std::ios::out | std::ios::binary);

    // Initialize zlib structures.
    std::vector<char *> readBuffer(BUFFER_READ_SIZE);
    std::vector<char *> writeBuffer(BUFFER_WRITE_SIZE);

    z_stream zipStream;
    zipStream.avail_in = 0;
    zipStream.avail_out = BUFFER_WRITE_SIZE;
    zipStream.next_out = reinterpret_cast<Bytef *>(writeBuffer.data());
    zipStream.total_in = 0;
    zipStream.total_out = 0;
    zipStream.data_type = Z_BINARY;
    zipStream.zalloc = nullptr;
    zipStream.zfree =  nullptr;
    zipStream.opaque = nullptr;

    // Window bits is passed < 0 to tell that there is no zlib header.
    if (deflateInit2_(&zipStream, Z_DEFAULT_COMPRESSION, Z_DEFLATED, -MAX_WBITS, 8, Z_DEFAULT_STRATEGY, ZLIB_VERSION, sizeof(zipStream)) != Z_OK)
    {
        return false;
    }

    // Deflate the input stream
    uint32_t readSize = 0;
    uint64_t dataPendingToCompress = inputSize;
    uint64_t dataPendingToWrite = 0;
    bool     isEndOfInput = false;

    while (dataPendingToCompress > 0)
    {
        if (dataPendingToCompress > BUFFER_READ_SIZE)
        {
            readSize = BUFFER_READ_SIZE;
        }
        else
        {
            readSize = dataPendingToCompress;
            isEndOfInput = true;
        }

        // Copy the piece of input stream to the read buffer.
        std::memcpy(readBuffer.data(), &inputStream[inputSize - dataPendingToCompress], readSize);
        dataPendingToCompress -= readSize;

        zipStream.next_in = reinterpret_cast<Bytef *>(readBuffer.data());
        zipStream.avail_in = readSize;

        // While there is input data to compress.
        while (zipStream.avail_in > 0)
        {
            // Output buffer is full.
            if (zipStream.avail_out == 0)
            {
                outputStringStream.write(reinterpret_cast<const char*>(writeBuffer.data()), dataPendingToWrite);

                zipStream.total_in = 0;
                zipStream.avail_out = BUFFER_WRITE_SIZE;
                zipStream.next_out = reinterpret_cast<Bytef *>(writeBuffer.data());
                dataPendingToWrite = 0;
            }

            uint64_t totalOutBefore = zipStream.total_out;

            int zlibError = deflate(&zipStream, isEndOfInput ? Z_FINISH : Z_NO_FLUSH);

            if ((zlibError != Z_OK) && (zlibError != Z_STREAM_END))
            {
                deflateEnd(&zipStream);

                return false;
            }

            dataPendingToWrite += static_cast<uint64_t>(zipStream.total_out - totalOutBefore);
        }
    }

    // Flush last compressed data.
    while (dataPendingToWrite > 0)
    {
        if (dataPendingToWrite > BUFFER_WRITE_SIZE)
        {
            outputStringStream.write(reinterpret_cast<const char*>(writeBuffer.data()), BUFFER_WRITE_SIZE);
        }
        else
        {
            outputStringStream.write(reinterpret_cast<const char*>(writeBuffer.data()), dataPendingToWrite);
        }

        zipStream.total_in = 0;
        zipStream.avail_out = BUFFER_WRITE_SIZE;
        zipStream.next_out = reinterpret_cast<Bytef *>(writeBuffer.data());

        uint64_t totalOutBefore = zipStream.total_out;
        int zlibError = deflate(&zipStream, Z_FINISH);

        if ((zlibError != Z_OK) && (zlibError != Z_STREAM_END))
        {
            deflateEnd(&zipStream);

            return false;
        }

        dataPendingToWrite = static_cast<uint64_t>(zipStream.total_out - totalOutBefore);
    }

    deflateEnd(&zipStream);

    const std::string & outputString = outputStringStream.str();
    std::vector<unsigned char> deflated{outputString.begin(), outputString.end()};
    
    std::cout << "Output String size: " << outputString.size() << std::endl;

    input.swap(deflated);

    return true;
}

int main(int argc, char * argv[])
{
    std::ifstream input_file{"/tmp/test.txt"};
    std::vector<unsigned char> data((std::istreambuf_iterator<char>(input_file)), std::istreambuf_iterator<char>());
    std::cout << "Deflated: " << mydeflate(data) << '\n';
    
    std::ofstream output_file{"/tmp/deflated.txt"};
    output_file.write(reinterpret_cast<char *>(data.data()), data.size());
    
    return 0;
}

加密++:

#include "cryptopp/files.h"
#include "cryptopp/zdeflate.h"
#include "cryptopp/zinflate.h"

void deflate_method1(const std::string & input_file_path, const std::string & output_file_path)
{
    CryptoPP::Deflator deflator(new CryptoPP::FileSink(output_file_path.c_str(), true), CryptoPP::Deflator::DEFAULT_DEFLATE_LEVEL, CryptoPP::Deflator::MAX_LOG2_WINDOW_SIZE);
    CryptoPP::FileSource fs(input_file_path.c_str(), true);
    fs.TransferAllTo(deflator);
}

void inflate_method1(const std::string & input_file_path, const std::string & output_file_path)
{
    CryptoPP::FileSource fs(input_file_path.c_str(), true);
    CryptoPP::Inflator inflator(new CryptoPP::FileSink(output_file_path.c_str(), true));
    fs.TransferAllTo(inflator);
}

void deflate_method2(const std::string& input_file_path, const std::string& output_file_path)
{
  CryptoPP::Deflator deflator(new CryptoPP::FileSink(output_file_path.c_str(), true), CryptoPP::Deflator::DEFAULT_DEFLATE_LEVEL, 15);

  std::ifstream file_in;
  file_in.open(input_file_path, std::ios::binary);

  std::string buffer;
  size_t num_read = 0;

  const size_t buffer_size(1024 * 1024);
  buffer.resize(buffer_size);
  file_in.read(const_cast<char*>(buffer.data()), buffer_size);
  num_read = file_in.gcount();

  while (num_read) {
    deflator.ChannelPut(CryptoPP::DEFAULT_CHANNEL, reinterpret_cast<unsigned char*>(const_cast<char *>(buffer.data())), num_read);
    file_in.read(const_cast<char*>(buffer.data()), buffer_size);
    num_read = file_in.gcount();
  }

  file_in.close();
  deflator.Flush(true);
}

void inflate_method2(const std::string& input_file_path, const std::string& output_file_path)
{
  CryptoPP::Inflator inflator(new CryptoPP::FileSink(output_file_path.c_str(), true));

  std::ifstream file_in;
  file_in.open(input_file_path, std::ios::binary);

  std::string buffer;
  size_t num_read = 0;

  const size_t buffer_size(1024 * 1024);
  buffer.resize(buffer_size);
  file_in.read(const_cast<char*>(buffer.data()), buffer_size);
  num_read = file_in.gcount();

  while (num_read) {
    inflator.ChannelPut(CryptoPP::DEFAULT_CHANNEL, reinterpret_cast<unsigned char*>(const_cast<char *>(buffer.data())), num_read);
    file_in.read(const_cast<char*>(buffer.data()), buffer_size);
    num_read = file_in.gcount();
  }

  file_in.close();
  inflator.Flush(true);
}

int main(int argc, char * argv[])
{
    deflate_method1("/tmp/test.txt", "/tmp/deflated_method1.bin");
    inflate_method1("/tmp/deflated_method1.bin", "/tmp/inflated_method1.txt");

    deflate_method2("/tmp/test.txt", "/tmp/deflated_method2.bin");
    inflate_method2("/tmp/deflated_method2.bin", "/tmp/inflated_method2.txt");
    // This throws:    Inflator: unexpected end of compressed block 
    inflate_method1("/tmp/deflated_method2.bin", "/tmp/inflated_with_method1_file_deflated_with_method2.txt");

    return 0;
}

Python:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import zlib


def CHUNKSIZE():
    return 128


def deflate(file_path, compression_level, method, wbits):
    plain_data = None
    deflated_data = bytearray()
    deflator = zlib.compressobj(compression_level, method, wbits)

    with open(file_path, 'rb') as input_file:
            while True:
                plain_data = input_file.read(CHUNKSIZE())

                if not plain_data:
                    break

                deflated_data += deflator.compress(plain_data)

    deflated_data += deflator.flush()

    return deflated_data


def inflate(file_path, wbits):
    inflated_data = bytearray()
    inflator = zlib.decompressobj(wbits)

    with open(file_path, 'rb') as deflated_file:
        buffer = deflated_file.read(CHUNKSIZE())

        while buffer:
            inflated_data += inflator.decompress(buffer)
            buffer = deflated_file.read(CHUNKSIZE())

        inflated_data += inflator.flush()

    return inflated_data


def write_file(file_path, data):
    with open(file_path, 'wb') as output_file:
        output_file.write(data)


if __name__ == "__main__":
    deflated_data = deflate("/tmp/test.txt", zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED, -zlib.MAX_WBITS)
    write_file("/tmp/deflated_python.bin", deflated_data)

前三个工作正常,使用最后一个 deflate 块生成有效的 deflate 压缩流。

您的“Crypto++ method2”正在生成两个 deflate 块,其中第二个是一个空的存储块,未标记为最后一个块。这不是有效的放气流,因为它不会终止。您没有正确完成压缩。

您的 deflator.Flush(true) 正在刷新第一个块并发出空的存储块,而不会结束压缩流。

我没有看到太多文档,或者根本没有看到任何文档,但是查看源代码,我会尝试 deflator.EndBlock(true)

更新:

根据下面的评论,EndBlock 不是 public。相反,MessageEnd 是终止放气流所需要的。