Create big zip archives with lots of small files in memory (on the fly) with Python
The task is:
- read multiple files from S3 storage one by one
- add the files to big_archive.zip
- store big_archive.zip back in S3 storage
The problem:
When we append a new file to a zip archive, the zip library modifies the current archive (updates the meta information) and then appends the file content (bytes). Because the archive is big, we have to store it in S3 in chunks. But! Chunks that have already been stored cannot be rewritten, so we cannot go back and update the meta information.
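(For context: this constraint matches how S3 multipart uploads behave — each part is uploaded exactly once, and only the whole upload is completed at the end; also note that on real S3 every part except the last must be at least 5 MiB. A minimal sketch of the upload side with boto3, where the bucket name and key are placeholders, just to show where the flushed chunks below would go:)

import boto3

# minimal multipart-upload sketch; 'my-bucket' / 'big_archive.zip' are placeholders
s3 = boto3.client('s3')
upload = s3.create_multipart_upload(Bucket='my-bucket', Key='big_archive.zip')
parts = []

def upload_chunk(data: bytes, part_number: int):
    # each part is written once and cannot be rewritten afterwards;
    # every part except the last must be >= 5 MiB on real S3
    resp = s3.upload_part(
        Bucket='my-bucket', Key='big_archive.zip',
        UploadId=upload['UploadId'],
        PartNumber=part_number, Body=data,
    )
    parts.append({'ETag': resp['ETag'], 'PartNumber': part_number})

# ... call upload_chunk() for every flushed chunk, then:
s3.complete_multipart_upload(
    Bucket='my-bucket', Key='big_archive.zip',
    UploadId=upload['UploadId'],
    MultipartUpload={'Parts': parts},
)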
This code demonstrates the problem:
from io import BytesIO
import zipfile, sys, gc

files = (
    'input/i_1.docx',  # one file is about ~500KB in size
    'input/i_2.docx',
    ...
    'input/i_11.docx',
    'input/i_12.docx',
    'input/i_13.docx',
    'input/i_14.docx'
)

# this function allows measuring the size of an in-memory object
# thanks to
# https://towardsdatascience.com/the-strange-size-of-python-objects-in-memory-ce87bdfbb97f
def _get_size(input_obj):
    memory_size = 0
    ids = set()
    objects = [input_obj]
    while objects:
        new = []
        for obj in objects:
            if id(obj) not in ids:
                ids.add(id(obj))
                memory_size += sys.getsizeof(obj)
                new.append(obj)
        objects = gc.get_referents(*new)
    return memory_size

# open the in-memory object
with BytesIO() as zip_obj_in_memory:
    # open the zip archive on disk
    with open('tmp.zip', 'wb') as resulted_file:
        # set the chunk size to 1MB
        chunk_max_size = 1048576  # 1MB
        # iterate over the files
        for f in files:
            # get the size of the in-memory object
            current_size = _get_size(zip_obj_in_memory)
            # if the in-memory object is bigger than 1MB,
            # we need to flush it to S3 storage
            if current_size > chunk_max_size:
                # write the chunk out (it does not matter which storage it is: S3 or disk)
                resulted_file.write(zip_obj_in_memory.getvalue())
                # drop the current in-memory data
                zip_obj_in_memory.seek(0)
                # zip_obj_in_memory is empty after truncate, so we can add new files
                zip_obj_in_memory.truncate()
            # open zip_obj_in_memory in append mode and append the next file
            with zipfile.ZipFile(zip_obj_in_memory, 'a', compression=zipfile.ZIP_DEFLATED) as zf:
                # read the file and write it into the archive
                with open(f, 'rb') as o:
                    zf.writestr(
                        zinfo_or_arcname=f.replace('input/', 'output/'),
                        data=o.read()
                    )
        # write the last chunk of data
        resulted_file.write(zip_obj_in_memory.getvalue())
Now let's try to list the files in the archive:
unzip -l tmp.zip
Archive: tmp.zip
warning [tmp.zip]: 6987483 extra bytes at beginning or within zipfile
(attempting to process anyway)
Length Date Time Name
--------- ---------- ----- ----
583340 12-15-2021 18:43 output/i_13.docx
583335 12-15-2021 18:43 output/i_14.docx
--------- -------
1166675 2 files
We can see that only the last 1MB chunk is listed.
Let's fix this archive:
zip -FF tmp.zip --out fixed.zip
Fix archive (-FF) - salvage what can
Found end record (EOCDR) - says expect single disk archive
Scanning for entries...
copying: output/i_1.docx (582169 bytes)
copying: output/i_2.docx (582152 bytes)
Central Directory found...
EOCDR found ( 1 1164533)...
copying: output/i_3.docx (582175 bytes)
Entry after central directory found ( 1 1164555)...
copying: output/i_4.docx (582175 bytes)
Central Directory found...
EOCDR found ( 1 2329117)...
copying: output/i_5.docx (582176 bytes)
Entry after central directory found ( 1 2329139)...
copying: output/i_6.docx (582180 bytes)
Central Directory found...
EOCDR found ( 1 3493707)...
copying: output/i_7.docx (582170 bytes)
Entry after central directory found ( 1 3493729)...
copying: output/i_8.docx (582174 bytes)
Central Directory found...
...
After that:
unzip -l fixed.zip
Archive: fixed.zip
Length Date Time Name
--------- ---------- ----- ----
583344 12-15-2021 18:43 output/i_1.docx
583337 12-15-2021 18:43 output/i_2.docx
583346 12-15-2021 18:43 output/i_3.docx
583352 12-15-2021 18:43 output/i_4.docx
583361 12-15-2021 18:43 output/i_5.docx
583368 12-15-2021 18:43 output/i_6.docx
583356 12-15-2021 18:43 output/i_7.docx
583362 12-15-2021 18:43 output/i_8.docx
583337 12-15-2021 18:43 output/i_9.docx
583352 12-15-2021 18:43 output/i_10.docx
583363 12-15-2021 18:43 output/i_11.docx
583368 12-15-2021 18:43 output/i_12.docx
583340 12-15-2021 18:43 output/i_13.docx
583335 12-15-2021 18:43 output/i_14.docx
--------- -------
8166921 14 files
File extraction works fine as well.
The file contents are correct.
The required meta information is stored in the Central Directory (CD).
So we need to drop the Central Directory information on each file append (before storing the chunk to disk or S3) and manually write the correct information about all files at the very end.
Is that possible? If so, how?
At the very least, is there any way to diff tmp.zip and fixed.zip in a human-readable binary mode, so I can check where and in which format the CD is stored?
Any exact references on the ZIP format that could help with this problem are also welcome.
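(Side note, not from the original post: the ZIP format is specified in PKWARE's APPNOTE.TXT, and every record starts with a fixed 4-byte signature — PK\x03\x04 for local file headers, PK\x01\x02 for central directory entries, PK\x05\x06 for the end-of-central-directory record. A small sketch that prints the offset of each signature makes the stray central directories inside tmp.zip easy to see; keep in mind these byte patterns can in principle also occur inside compressed data, so this is a heuristic:)

import re

# 4-byte record signatures from PKWARE's APPNOTE.TXT (the ZIP specification)
SIGNATURES = {
    b'PK\x03\x04': 'local file header',
    b'PK\x01\x02': 'central directory entry',
    b'PK\x05\x06': 'end of central directory record (EOCDR)',
}

def dump_zip_records(path):
    data = open(path, 'rb').read()
    hits = []
    for sig, name in SIGNATURES.items():
        hits += [(m.start(), name) for m in re.finditer(re.escape(sig), data)]
    for offset, name in sorted(hits):
        print(f'{offset:>10}  {name}')

dump_zip_records('tmp.zip')    # broken archive: several CD/EOCDR blocks inside
dump_zip_records('fixed.zip')  # fixed archive: one CD + EOCDR at the very end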
OK, I finally created that Frankenstein of a zip:
from io import BytesIO
from zipfile import ZipFile, ZIP_DEFLATED
import sys
import gc

files = (
    'input/i_1.docx',  # one file is about ~580KB in size
    'input/i_2.docx',
    'input/i_3.docx',
    'input/i_4.docx',
    'input/i_5.docx',
    'input/i_6.docx',
    'input/i_7.docx',
    'input/i_8.docx',
    'input/i_9.docx',
    'input/i_10.docx',
    'input/i_11.docx',
    'input/i_12.docx',
    'input/i_13.docx',
    'input/i_14.docx',
    'input/i_21.docx'
)

# this function allows measuring the size of an in-memory object
# added only for debugging purposes
def _get_size(input_obj):
    memory_size = 0
    ids = set()
    objects = [input_obj]
    while objects:
        new = []
        for obj in objects:
            if id(obj) not in ids:
                ids.add(id(obj))
                memory_size += sys.getsizeof(obj)
                new.append(obj)
        objects = gc.get_referents(*new)
    return memory_size

class CustomizedZipFile(ZipFile):

    # customized BytesIO that can report a faked offset via tell()
    class _CustomizedBytesIO(BytesIO):

        def __init__(self, fake_offset: int):
            self.fake_offset = fake_offset
            self.temporary_switch_to_faked_offset = False
            super().__init__()

        def tell(self):
            if self.temporary_switch_to_faked_offset:
                # revert tell() to normal mode to minimize the faked behaviour
                self.temporary_switch_to_faked_offset = False
                return super().tell() + self.fake_offset
            else:
                return super().tell()

    def __init__(self, *args, **kwargs):
        # create an empty file to write to if a fake offset is set
        if 'fake_offset' in kwargs and kwargs['fake_offset'] is not None and kwargs['fake_offset'] > 0:
            self._fake_offset = kwargs['fake_offset']
            del kwargs['fake_offset']
            if 'file' in kwargs:
                kwargs['file'] = self._CustomizedBytesIO(self._fake_offset)
            else:
                args = list(args)
                args[0] = self._CustomizedBytesIO(self._fake_offset)
        else:
            self._fake_offset = 0
        super().__init__(*args, **kwargs)

    # finalize the zip (should be run only on the last chunk)
    def force_write_end_record(self):
        self._write_end_record(False)

    # don't write the end record by default, so we can get chunks without it;
    # plain ZipFile writes the end meta information on close
    def _write_end_record(self, skip_write_end=True):
        if not skip_write_end:
            if self._fake_offset > 0:
                self.start_dir = self._fake_offset
                self.fp.temporary_switch_to_faked_offset = True
            super()._write_end_record()

def archive(files):
    compression_type = ZIP_DEFLATED
    CHUNK_SIZE = 1048576  # 1MB
    with open('tmp.zip', 'wb') as resulted_file:
        offset = 0
        filelist = []
        with BytesIO() as chunk:
            for f in files:
                with BytesIO() as tmp:
                    with CustomizedZipFile(tmp, 'w', compression=compression_type) as zf:
                        with open(f, 'rb') as b:
                            zf.writestr(
                                zinfo_or_arcname=f.replace('input/', 'output/'),
                                data=b.read()
                            )
                    # patch the header offset so it points into the whole
                    # archive rather than into this single-file chunk
                    zf.filelist[0].header_offset = offset
                    data = tmp.getvalue()
                    offset = offset + len(data)
                    filelist.append(zf.filelist[0])
                    chunk.write(data)
                    print('size of zipfile:', _get_size(zf))
                    print('size of chunk:', _get_size(chunk))
                    if len(chunk.getvalue()) > CHUNK_SIZE:
                        resulted_file.write(chunk.getvalue())
                        chunk.seek(0)
                        chunk.truncate()
            # write the last chunk
            resulted_file.write(chunk.getvalue())
        # the file parameter may be skipped if we use fake_offset,
        # because an empty _CustomizedBytesIO is created in the constructor
        with CustomizedZipFile(None, 'w', compression=compression_type, fake_offset=offset) as zf:
            zf.filelist = filelist
            zf.force_write_end_record()
            end_data = zf.fp.getvalue()
        resulted_file.write(end_data)

archive(files)
The output is:
size of zipfile: 2182955
size of chunk: 582336
size of zipfile: 2182979
size of chunk: 1164533
size of zipfile: 2182983
size of chunk: 582342
size of zipfile: 2182979
size of chunk: 1164562
size of zipfile: 2182983
size of chunk: 582343
size of zipfile: 2182979
size of chunk: 1164568
size of zipfile: 2182983
size of chunk: 582337
size of zipfile: 2182983
size of chunk: 1164556
size of zipfile: 2182983
size of chunk: 582329
size of zipfile: 2182984
size of chunk: 1164543
size of zipfile: 2182984
size of chunk: 582355
size of zipfile: 2182984
size of chunk: 1164586
size of zipfile: 2182984
size of chunk: 582338
size of zipfile: 2182984
size of chunk: 1164545
size of zipfile: 2182980
size of chunk: 582320
So we can see that the chunk is always dumped to storage and truncated once it reaches the maximum chunk size (1MB in my case).
The resulting archive was tested with The Unarchiver v4.2.4 on macOS, the default Windows 10 unarchiver, and 7-Zip.
Note!
The chunked archive comes out 16 bytes bigger than an archive created by the plain zipfile library. Probably some extra zero bytes get written somewhere; I haven't checked why.
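(To track those 16 bytes down, a straightforward approach — my addition, with plain.zip standing in for a hypothetical reference archive produced by stock ZipFile from the same files — is to find the first offset where the two archives diverge; `cmp -l tmp.zip plain.zip` does the same from the shell:)

# find the first byte offset at which two archives differ (None if identical);
# 'plain.zip' is a hypothetical reference archive written by stock ZipFile
def first_diff(path_a: str, path_b: str):
    a = open(path_a, 'rb').read()
    b = open(path_b, 'rb').read()
    for i in range(min(len(a), len(b))):
        if a[i] != b[i]:
            return i
    return None if len(a) == len(b) else min(len(a), len(b))

print(first_diff('tmp.zip', 'plain.zip'))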
zipfile is the worst Python library I have ever seen. It looks like it is meant to be used as a non-extensible, binary-like blob.