使用 zlib 和 Crypto++ 缩小文件的大小差异
Size discrepancy in file deflated using zlib and Crypto++
我正在学习如何在 C++ 中原始放气(没有 header 或预告片信息)和膨胀数据,所以我决定尝试 zlib
和 Crypto++
库。
我发现,压缩同一个文件时,Crypto++
有时会额外增加 4 个字节(取决于使用的方法)。
例如,对于包含以下序列的文件,包含空格:1 2 3 4 5 6
,使用 zlib
压缩生成大小为 14 字节的文件。
这适用于 Crypto++ deflate_method1
,但对于 Crypto++ deflate_method2
,文件大小为 18 字节。
此外,当尝试扩充使用 Crypto++ deflate_method2
和 Crypto++ inflate_method1
缩小的文件时,会引发异常:
terminate called after throwing an instance of 'CryptoPP::Inflator::UnexpectedEndErr'
what(): Inflator: unexpected end of compressed block
Aborted (core dumped)
为了比较,我做了另一个测试 deflating/inflating 和 Python:
- 放气也会产生一个大小为 14 字节的文件。
- 我能够正确地扩充所有缩小的文件,无论使用何种方法缩小它们。
至此,我想明白两件事:
为什么压缩后的文件大小不一致?
为什么 Python 能够膨胀任何文件,但 Crypto++ 很挑剔?
信息和代码:
- OS: Ubuntu 16.04 Xenial
- Zlib 版本: 1.0.1 来自 Ubuntu repos.
- Crypto++ 版本: 8.0.2 来自 GitHub 版本。
- Python版本:3.5.2
- zlib 版本:1.2.8 / 运行时版本:1.2.8
输入和输出文件为 base64:
- 输入:
MSAyIDMgNCA1IDYK
- 泄气:
- Python:
M1QwUjBWMFEwVTDjAgA=
- Zlib:
M1QwUjBWMFEwVTDjAgA=
- 加密++方法1:
M1QwUjBWMFEwVTDjAgA=
- 加密++方法2:
MlQwUjBWMFEwVTDjAgAAAP//
Zlib:
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <iterator>
#include <fstream>
#include <iostream>
#include <sstream>
#include <vector>
#include "zlib.h"
constexpr uint32_t BUFFER_READ_SIZE = 128;
constexpr uint32_t BUFFER_WRITE_SIZE = 128;
bool mydeflate(std::vector<unsigned char> & input)
{
const std::string inputStream{ input.begin(), input.end() };
uint64_t inputSize = input.size();
// Create a string stream where output will be created.
std::stringstream outputStringStream(std::ios::in | std::ios::out | std::ios::binary);
// Initialize zlib structures.
std::vector<char *> readBuffer(BUFFER_READ_SIZE);
std::vector<char *> writeBuffer(BUFFER_WRITE_SIZE);
z_stream zipStream;
zipStream.avail_in = 0;
zipStream.avail_out = BUFFER_WRITE_SIZE;
zipStream.next_out = reinterpret_cast<Bytef *>(writeBuffer.data());
zipStream.total_in = 0;
zipStream.total_out = 0;
zipStream.data_type = Z_BINARY;
zipStream.zalloc = nullptr;
zipStream.zfree = nullptr;
zipStream.opaque = nullptr;
// Window bits is passed < 0 to tell that there is no zlib header.
if (deflateInit2_(&zipStream, Z_DEFAULT_COMPRESSION, Z_DEFLATED, -MAX_WBITS, 8, Z_DEFAULT_STRATEGY, ZLIB_VERSION, sizeof(zipStream)) != Z_OK)
{
return false;
}
// Deflate the input stream
uint32_t readSize = 0;
uint64_t dataPendingToCompress = inputSize;
uint64_t dataPendingToWrite = 0;
bool isEndOfInput = false;
while (dataPendingToCompress > 0)
{
if (dataPendingToCompress > BUFFER_READ_SIZE)
{
readSize = BUFFER_READ_SIZE;
}
else
{
readSize = dataPendingToCompress;
isEndOfInput = true;
}
// Copy the piece of input stream to the read buffer.
std::memcpy(readBuffer.data(), &inputStream[inputSize - dataPendingToCompress], readSize);
dataPendingToCompress -= readSize;
zipStream.next_in = reinterpret_cast<Bytef *>(readBuffer.data());
zipStream.avail_in = readSize;
// While there is input data to compress.
while (zipStream.avail_in > 0)
{
// Output buffer is full.
if (zipStream.avail_out == 0)
{
outputStringStream.write(reinterpret_cast<const char*>(writeBuffer.data()), dataPendingToWrite);
zipStream.total_in = 0;
zipStream.avail_out = BUFFER_WRITE_SIZE;
zipStream.next_out = reinterpret_cast<Bytef *>(writeBuffer.data());
dataPendingToWrite = 0;
}
uint64_t totalOutBefore = zipStream.total_out;
int zlibError = deflate(&zipStream, isEndOfInput ? Z_FINISH : Z_NO_FLUSH);
if ((zlibError != Z_OK) && (zlibError != Z_STREAM_END))
{
deflateEnd(&zipStream);
return false;
}
dataPendingToWrite += static_cast<uint64_t>(zipStream.total_out - totalOutBefore);
}
}
// Flush last compressed data.
while (dataPendingToWrite > 0)
{
if (dataPendingToWrite > BUFFER_WRITE_SIZE)
{
outputStringStream.write(reinterpret_cast<const char*>(writeBuffer.data()), BUFFER_WRITE_SIZE);
}
else
{
outputStringStream.write(reinterpret_cast<const char*>(writeBuffer.data()), dataPendingToWrite);
}
zipStream.total_in = 0;
zipStream.avail_out = BUFFER_WRITE_SIZE;
zipStream.next_out = reinterpret_cast<Bytef *>(writeBuffer.data());
uint64_t totalOutBefore = zipStream.total_out;
int zlibError = deflate(&zipStream, Z_FINISH);
if ((zlibError != Z_OK) && (zlibError != Z_STREAM_END))
{
deflateEnd(&zipStream);
return false;
}
dataPendingToWrite = static_cast<uint64_t>(zipStream.total_out - totalOutBefore);
}
deflateEnd(&zipStream);
const std::string & outputString = outputStringStream.str();
std::vector<unsigned char> deflated{outputString.begin(), outputString.end()};
std::cout << "Output String size: " << outputString.size() << std::endl;
input.swap(deflated);
return true;
}
int main(int argc, char * argv[])
{
std::ifstream input_file{"/tmp/test.txt"};
std::vector<unsigned char> data((std::istreambuf_iterator<char>(input_file)), std::istreambuf_iterator<char>());
std::cout << "Deflated: " << mydeflate(data) << '\n';
std::ofstream output_file{"/tmp/deflated.txt"};
output_file.write(reinterpret_cast<char *>(data.data()), data.size());
return 0;
}
加密++:
#include "cryptopp/files.h"
#include "cryptopp/zdeflate.h"
#include "cryptopp/zinflate.h"
void deflate_method1(const std::string & input_file_path, const std::string & output_file_path)
{
CryptoPP::Deflator deflator(new CryptoPP::FileSink(output_file_path.c_str(), true), CryptoPP::Deflator::DEFAULT_DEFLATE_LEVEL, CryptoPP::Deflator::MAX_LOG2_WINDOW_SIZE);
CryptoPP::FileSource fs(input_file_path.c_str(), true);
fs.TransferAllTo(deflator);
}
void inflate_method1(const std::string & input_file_path, const std::string & output_file_path)
{
CryptoPP::FileSource fs(input_file_path.c_str(), true);
CryptoPP::Inflator inflator(new CryptoPP::FileSink(output_file_path.c_str(), true));
fs.TransferAllTo(inflator);
}
void deflate_method2(const std::string& input_file_path, const std::string& output_file_path)
{
CryptoPP::Deflator deflator(new CryptoPP::FileSink(output_file_path.c_str(), true), CryptoPP::Deflator::DEFAULT_DEFLATE_LEVEL, 15);
std::ifstream file_in;
file_in.open(input_file_path, std::ios::binary);
std::string buffer;
size_t num_read = 0;
const size_t buffer_size(1024 * 1024);
buffer.resize(buffer_size);
file_in.read(const_cast<char*>(buffer.data()), buffer_size);
num_read = file_in.gcount();
while (num_read) {
deflator.ChannelPut(CryptoPP::DEFAULT_CHANNEL, reinterpret_cast<unsigned char*>(const_cast<char *>(buffer.data())), num_read);
file_in.read(const_cast<char*>(buffer.data()), buffer_size);
num_read = file_in.gcount();
}
file_in.close();
deflator.Flush(true);
}
void inflate_method2(const std::string& input_file_path, const std::string& output_file_path)
{
CryptoPP::Inflator inflator(new CryptoPP::FileSink(output_file_path.c_str(), true));
std::ifstream file_in;
file_in.open(input_file_path, std::ios::binary);
std::string buffer;
size_t num_read = 0;
const size_t buffer_size(1024 * 1024);
buffer.resize(buffer_size);
file_in.read(const_cast<char*>(buffer.data()), buffer_size);
num_read = file_in.gcount();
while (num_read) {
inflator.ChannelPut(CryptoPP::DEFAULT_CHANNEL, reinterpret_cast<unsigned char*>(const_cast<char *>(buffer.data())), num_read);
file_in.read(const_cast<char*>(buffer.data()), buffer_size);
num_read = file_in.gcount();
}
file_in.close();
inflator.Flush(true);
}
int main(int argc, char * argv[])
{
deflate_method1("/tmp/test.txt", "/tmp/deflated_method1.bin");
inflate_method1("/tmp/deflated_method1.bin", "/tmp/inflated_method1.txt");
deflate_method2("/tmp/test.txt", "/tmp/deflated_method2.bin");
inflate_method2("/tmp/deflated_method2.bin", "/tmp/inflated_method2.txt");
// This throws: Inflator: unexpected end of compressed block
inflate_method1("/tmp/deflated_method2.bin", "/tmp/inflated_with_method1_file_deflated_with_method2.txt");
return 0;
}
Python:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import zlib
def CHUNKSIZE():
return 128
def deflate(file_path, compression_level, method, wbits):
plain_data = None
deflated_data = bytearray()
deflator = zlib.compressobj(compression_level, method, wbits)
with open(file_path, 'rb') as input_file:
while True:
plain_data = input_file.read(CHUNKSIZE())
if not plain_data:
break
deflated_data += deflator.compress(plain_data)
deflated_data += deflator.flush()
return deflated_data
def inflate(file_path, wbits):
inflated_data = bytearray()
inflator = zlib.decompressobj(wbits)
with open(file_path, 'rb') as deflated_file:
buffer = deflated_file.read(CHUNKSIZE())
while buffer:
inflated_data += inflator.decompress(buffer)
buffer = deflated_file.read(CHUNKSIZE())
inflated_data += inflator.flush()
return inflated_data
def write_file(file_path, data):
with open(file_path, 'wb') as output_file:
output_file.write(data)
if __name__ == "__main__":
deflated_data = deflate("/tmp/test.txt", zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED, -zlib.MAX_WBITS)
write_file("/tmp/deflated_python.bin", deflated_data)
前三个工作正常,使用最后一个 deflate 块生成有效的 deflate 压缩流。
您的“Crypto++ method2”正在生成两个 deflate 块,其中第二个是一个空的存储块,未标记为最后一个块。这不是有效的放气流,因为它不会终止。您没有正确完成压缩。
您的 deflator.Flush(true)
正在刷新第一个块并发出空的存储块,而不会结束压缩流。
我没有看到太多文档,或者根本没有看到任何文档,但是查看源代码,我会尝试 deflator.EndBlock(true)
。
更新:
根据下面的评论,EndBlock
不是 public。相反,MessageEnd
是终止放气流所需要的。
我正在学习如何在 C++ 中原始放气(没有 header 或预告片信息)和膨胀数据,所以我决定尝试 zlib
和 Crypto++
库。
我发现,压缩同一个文件时,Crypto++
有时会额外增加 4 个字节(取决于使用的方法)。
例如,对于包含以下序列的文件,包含空格:1 2 3 4 5 6
,使用 zlib
压缩生成大小为 14 字节的文件。
这适用于 Crypto++ deflate_method1
,但对于 Crypto++ deflate_method2
,文件大小为 18 字节。
此外,当尝试扩充使用 Crypto++ deflate_method2
和 Crypto++ inflate_method1
缩小的文件时,会引发异常:
terminate called after throwing an instance of 'CryptoPP::Inflator::UnexpectedEndErr'
what(): Inflator: unexpected end of compressed block
Aborted (core dumped)
为了比较,我做了另一个测试 deflating/inflating 和 Python:
- 放气也会产生一个大小为 14 字节的文件。
- 我能够正确地扩充所有缩小的文件,无论使用何种方法缩小它们。
至此,我想明白两件事:
为什么压缩后的文件大小不一致?
为什么 Python 能够膨胀任何文件,但 Crypto++ 很挑剔?
信息和代码:
- OS: Ubuntu 16.04 Xenial
- Zlib 版本: 1.0.1 来自 Ubuntu repos.
- Crypto++ 版本: 8.0.2 来自 GitHub 版本。
- Python版本:3.5.2
- zlib 版本:1.2.8 / 运行时版本:1.2.8
输入和输出文件为 base64:
- 输入:
MSAyIDMgNCA1IDYK
- 泄气:
- Python:
M1QwUjBWMFEwVTDjAgA=
- Zlib:
M1QwUjBWMFEwVTDjAgA=
- 加密++方法1:
M1QwUjBWMFEwVTDjAgA=
- 加密++方法2:
MlQwUjBWMFEwVTDjAgAAAP//
- Python:
Zlib:
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <iterator>
#include <fstream>
#include <iostream>
#include <sstream>
#include <vector>
#include "zlib.h"
constexpr uint32_t BUFFER_READ_SIZE = 128;
constexpr uint32_t BUFFER_WRITE_SIZE = 128;
bool mydeflate(std::vector<unsigned char> & input)
{
const std::string inputStream{ input.begin(), input.end() };
uint64_t inputSize = input.size();
// Create a string stream where output will be created.
std::stringstream outputStringStream(std::ios::in | std::ios::out | std::ios::binary);
// Initialize zlib structures.
std::vector<char *> readBuffer(BUFFER_READ_SIZE);
std::vector<char *> writeBuffer(BUFFER_WRITE_SIZE);
z_stream zipStream;
zipStream.avail_in = 0;
zipStream.avail_out = BUFFER_WRITE_SIZE;
zipStream.next_out = reinterpret_cast<Bytef *>(writeBuffer.data());
zipStream.total_in = 0;
zipStream.total_out = 0;
zipStream.data_type = Z_BINARY;
zipStream.zalloc = nullptr;
zipStream.zfree = nullptr;
zipStream.opaque = nullptr;
// Window bits is passed < 0 to tell that there is no zlib header.
if (deflateInit2_(&zipStream, Z_DEFAULT_COMPRESSION, Z_DEFLATED, -MAX_WBITS, 8, Z_DEFAULT_STRATEGY, ZLIB_VERSION, sizeof(zipStream)) != Z_OK)
{
return false;
}
// Deflate the input stream
uint32_t readSize = 0;
uint64_t dataPendingToCompress = inputSize;
uint64_t dataPendingToWrite = 0;
bool isEndOfInput = false;
while (dataPendingToCompress > 0)
{
if (dataPendingToCompress > BUFFER_READ_SIZE)
{
readSize = BUFFER_READ_SIZE;
}
else
{
readSize = dataPendingToCompress;
isEndOfInput = true;
}
// Copy the piece of input stream to the read buffer.
std::memcpy(readBuffer.data(), &inputStream[inputSize - dataPendingToCompress], readSize);
dataPendingToCompress -= readSize;
zipStream.next_in = reinterpret_cast<Bytef *>(readBuffer.data());
zipStream.avail_in = readSize;
// While there is input data to compress.
while (zipStream.avail_in > 0)
{
// Output buffer is full.
if (zipStream.avail_out == 0)
{
outputStringStream.write(reinterpret_cast<const char*>(writeBuffer.data()), dataPendingToWrite);
zipStream.total_in = 0;
zipStream.avail_out = BUFFER_WRITE_SIZE;
zipStream.next_out = reinterpret_cast<Bytef *>(writeBuffer.data());
dataPendingToWrite = 0;
}
uint64_t totalOutBefore = zipStream.total_out;
int zlibError = deflate(&zipStream, isEndOfInput ? Z_FINISH : Z_NO_FLUSH);
if ((zlibError != Z_OK) && (zlibError != Z_STREAM_END))
{
deflateEnd(&zipStream);
return false;
}
dataPendingToWrite += static_cast<uint64_t>(zipStream.total_out - totalOutBefore);
}
}
// Flush last compressed data.
while (dataPendingToWrite > 0)
{
if (dataPendingToWrite > BUFFER_WRITE_SIZE)
{
outputStringStream.write(reinterpret_cast<const char*>(writeBuffer.data()), BUFFER_WRITE_SIZE);
}
else
{
outputStringStream.write(reinterpret_cast<const char*>(writeBuffer.data()), dataPendingToWrite);
}
zipStream.total_in = 0;
zipStream.avail_out = BUFFER_WRITE_SIZE;
zipStream.next_out = reinterpret_cast<Bytef *>(writeBuffer.data());
uint64_t totalOutBefore = zipStream.total_out;
int zlibError = deflate(&zipStream, Z_FINISH);
if ((zlibError != Z_OK) && (zlibError != Z_STREAM_END))
{
deflateEnd(&zipStream);
return false;
}
dataPendingToWrite = static_cast<uint64_t>(zipStream.total_out - totalOutBefore);
}
deflateEnd(&zipStream);
const std::string & outputString = outputStringStream.str();
std::vector<unsigned char> deflated{outputString.begin(), outputString.end()};
std::cout << "Output String size: " << outputString.size() << std::endl;
input.swap(deflated);
return true;
}
int main(int argc, char * argv[])
{
std::ifstream input_file{"/tmp/test.txt"};
std::vector<unsigned char> data((std::istreambuf_iterator<char>(input_file)), std::istreambuf_iterator<char>());
std::cout << "Deflated: " << mydeflate(data) << '\n';
std::ofstream output_file{"/tmp/deflated.txt"};
output_file.write(reinterpret_cast<char *>(data.data()), data.size());
return 0;
}
加密++:
#include "cryptopp/files.h"
#include "cryptopp/zdeflate.h"
#include "cryptopp/zinflate.h"
void deflate_method1(const std::string & input_file_path, const std::string & output_file_path)
{
CryptoPP::Deflator deflator(new CryptoPP::FileSink(output_file_path.c_str(), true), CryptoPP::Deflator::DEFAULT_DEFLATE_LEVEL, CryptoPP::Deflator::MAX_LOG2_WINDOW_SIZE);
CryptoPP::FileSource fs(input_file_path.c_str(), true);
fs.TransferAllTo(deflator);
}
void inflate_method1(const std::string & input_file_path, const std::string & output_file_path)
{
CryptoPP::FileSource fs(input_file_path.c_str(), true);
CryptoPP::Inflator inflator(new CryptoPP::FileSink(output_file_path.c_str(), true));
fs.TransferAllTo(inflator);
}
void deflate_method2(const std::string& input_file_path, const std::string& output_file_path)
{
CryptoPP::Deflator deflator(new CryptoPP::FileSink(output_file_path.c_str(), true), CryptoPP::Deflator::DEFAULT_DEFLATE_LEVEL, 15);
std::ifstream file_in;
file_in.open(input_file_path, std::ios::binary);
std::string buffer;
size_t num_read = 0;
const size_t buffer_size(1024 * 1024);
buffer.resize(buffer_size);
file_in.read(const_cast<char*>(buffer.data()), buffer_size);
num_read = file_in.gcount();
while (num_read) {
deflator.ChannelPut(CryptoPP::DEFAULT_CHANNEL, reinterpret_cast<unsigned char*>(const_cast<char *>(buffer.data())), num_read);
file_in.read(const_cast<char*>(buffer.data()), buffer_size);
num_read = file_in.gcount();
}
file_in.close();
deflator.Flush(true);
}
void inflate_method2(const std::string& input_file_path, const std::string& output_file_path)
{
CryptoPP::Inflator inflator(new CryptoPP::FileSink(output_file_path.c_str(), true));
std::ifstream file_in;
file_in.open(input_file_path, std::ios::binary);
std::string buffer;
size_t num_read = 0;
const size_t buffer_size(1024 * 1024);
buffer.resize(buffer_size);
file_in.read(const_cast<char*>(buffer.data()), buffer_size);
num_read = file_in.gcount();
while (num_read) {
inflator.ChannelPut(CryptoPP::DEFAULT_CHANNEL, reinterpret_cast<unsigned char*>(const_cast<char *>(buffer.data())), num_read);
file_in.read(const_cast<char*>(buffer.data()), buffer_size);
num_read = file_in.gcount();
}
file_in.close();
inflator.Flush(true);
}
int main(int argc, char * argv[])
{
deflate_method1("/tmp/test.txt", "/tmp/deflated_method1.bin");
inflate_method1("/tmp/deflated_method1.bin", "/tmp/inflated_method1.txt");
deflate_method2("/tmp/test.txt", "/tmp/deflated_method2.bin");
inflate_method2("/tmp/deflated_method2.bin", "/tmp/inflated_method2.txt");
// This throws: Inflator: unexpected end of compressed block
inflate_method1("/tmp/deflated_method2.bin", "/tmp/inflated_with_method1_file_deflated_with_method2.txt");
return 0;
}
Python:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import zlib
def CHUNKSIZE():
return 128
def deflate(file_path, compression_level, method, wbits):
plain_data = None
deflated_data = bytearray()
deflator = zlib.compressobj(compression_level, method, wbits)
with open(file_path, 'rb') as input_file:
while True:
plain_data = input_file.read(CHUNKSIZE())
if not plain_data:
break
deflated_data += deflator.compress(plain_data)
deflated_data += deflator.flush()
return deflated_data
def inflate(file_path, wbits):
inflated_data = bytearray()
inflator = zlib.decompressobj(wbits)
with open(file_path, 'rb') as deflated_file:
buffer = deflated_file.read(CHUNKSIZE())
while buffer:
inflated_data += inflator.decompress(buffer)
buffer = deflated_file.read(CHUNKSIZE())
inflated_data += inflator.flush()
return inflated_data
def write_file(file_path, data):
with open(file_path, 'wb') as output_file:
output_file.write(data)
if __name__ == "__main__":
deflated_data = deflate("/tmp/test.txt", zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED, -zlib.MAX_WBITS)
write_file("/tmp/deflated_python.bin", deflated_data)
前三个工作正常,使用最后一个 deflate 块生成有效的 deflate 压缩流。
您的“Crypto++ method2”正在生成两个 deflate 块,其中第二个是一个空的存储块,未标记为最后一个块。这不是有效的放气流,因为它不会终止。您没有正确完成压缩。
您的 deflator.Flush(true)
正在刷新第一个块并发出空的存储块,而不会结束压缩流。
我没有看到太多文档,或者根本没有看到任何文档,但是查看源代码,我会尝试 deflator.EndBlock(true)
。
更新:
根据下面的评论,EndBlock
不是 public。相反,MessageEnd
是终止放气流所需要的。