S3 不支持 GzipFile?

GzipFile not supported by S3?

我正在尝试遍历某些文件路径,以便对每个文件单独进行 gzip。 testList 中的每个项目都包含这样的字符串(路径):/tmp/File.

对它们进行 gzip 压缩后,我想将每个 gzip 文件上传到 S3:

import boto3
import gzip
import shutil
# NOTE(review): `zipfile` is used below but never imported in this snippet.


s3 = boto3.client('s3')
# NOTE(review): `s3_resource` is undefined here — only the client `s3` was created.
# A `boto3.resource('s3')` is needed for `.Bucket(...)`.
bucket = s3_resource.Bucket('testunzipping')

with zipfile.ZipFile('/tmp/DataPump_10000838.zip', 'r') as zip_ref:
    testList = []
    # Collect extracted-member paths, skipping macOS metadata entries.
    for i in zip_ref.namelist():
        if (i.startswith("__MACOSX/") == False):
            val = '/tmp/'+i
            testList.append(val)
            
    # Drops the first entry (presumably the directory itself — verify).
    testList.remove(testList[0])

    for i in testList:
        fileName = i.replace("/tmp/DataPump_10000838/", "") 
        fileName2 = i + '.gz'
        with open(i, 'rb') as f_in:
            with gzip.open(fileName2, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
            # NOTE(review): at this point `f_out` is a *closed* GzipFile.
            # `gzip.compress` expects bytes, not a file object, and
            # `upload_fileobj` rejects GzipFile — this is the RuntimeError below.
            gzip_object = gzip.compress(f_out)
            bucket.upload_fileobj(f_out, fileName, ExtraArgs={'ContentType': "text/plain", 'ContentEncoding':'gzip'})

但是,目前,最后一行给我这个错误:

Response
{
  "errorMessage": "Input <gzip on 0x7fd53bc53fa0> of type: <class 'gzip.GzipFile'> is not supported.",
  "errorType": "RuntimeError",
  "requestId": "",
  "stackTrace": [
    "  File \"/var/lang/lib/python3.9/importlib/__init__.py\", line 127, in import_module\n    return _bootstrap._gcd_import(name[level:], package, level)\n",
    "  File \"<frozen importlib._bootstrap>\", line 1030, in _gcd_import\n",
    "  File \"<frozen importlib._bootstrap>\", line 1007, in _find_and_load\n",
    "  File \"<frozen importlib._bootstrap>\", line 986, in _find_and_load_unlocked\n",
    "  File \"<frozen importlib._bootstrap>\", line 680, in _load_unlocked\n",
    "  File \"<frozen importlib._bootstrap_external>\", line 850, in exec_module\n",
    "  File \"<frozen importlib._bootstrap>\", line 228, in _call_with_frames_removed\n",
    "  File \"/var/task/lambda_function.py\", line 50, in <module>\n    bucket.upload_fileobj(f_out, fileName, ExtraArgs={'ContentType': \"text/plain\", 'ContentEncoding':'gzip'})\n",
    "  File \"/var/runtime/boto3/s3/inject.py\", line 579, in bucket_upload_fileobj\n    return self.meta.client.upload_fileobj(\n",
    "  File \"/var/runtime/boto3/s3/inject.py\", line 539, in upload_fileobj\n    return future.result()\n",
    "  File \"/var/runtime/s3transfer/futures.py\", line 106, in result\n    return self._coordinator.result()\n",
    "  File \"/var/runtime/s3transfer/futures.py\", line 265, in result\n    raise self._exception\n",
    "  File \"/var/runtime/s3transfer/tasks.py\", line 255, in _main\n    self._submit(transfer_future=transfer_future, **kwargs)\n",
    "  File \"/var/runtime/s3transfer/upload.py\", line 545, in _submit\n    upload_input_manager = self._get_upload_input_manager_cls(\n",
    "  File \"/var/runtime/s3transfer/upload.py\", line 521, in _get_upload_input_manager_cls\n    raise RuntimeError(\n"
  ]
}

我还能如何将 f_out 对象上传到 S3 存储桶? S3/boto 不支持 gzip 吗?我也试过ExtraArgs={'ContentType': "application/gzip"}但得到了同样的错误。

假设每个文件都可以放入内存,您可以简单地执行此操作以压缩内存中的数据并将其打包到 BytesIO 中以供 S3 API 读取。

import boto3
import gzip
import io


# Fix: the original assigned ``s3 = boto3.client(...)`` but then referenced an
# undefined ``s3_resource``. ``.Bucket(...)`` lives on the Resource API.
s3_resource = boto3.resource("s3")
bucket = s3_resource.Bucket("testunzipping")
for i in testList:
    fileName = i.replace("/tmp/DataPump_10000838/", "")
    with open(i, "rb") as f_in:
        # Compress the whole file in memory, then hand S3 a file-like
        # BytesIO wrapper (upload_fileobj requires a readable file object).
        gzipped_content = gzip.compress(f_in.read())
        bucket.upload_fileobj(
            io.BytesIO(gzipped_content),
            fileName,
            ExtraArgs={"ContentType": "text/plain", "ContentEncoding": "gzip"},
        )

如果不是这样,您可以先使用临时文件将数据压缩到磁盘上:

import boto3
import gzip
import io
import shutil
import tempfile  # Fix: tempfile was used below but never imported.


# Fix: the original assigned ``s3 = boto3.client(...)`` but then referenced an
# undefined ``s3_resource``. ``.Bucket(...)`` lives on the Resource API.
s3_resource = boto3.resource("s3")
bucket = s3_resource.Bucket("testunzipping")
for i in testList:
    fileName = i.replace("/tmp/DataPump_10000838/", "")
    # Spill the compressed data to an anonymous temp file so arbitrarily
    # large inputs never have to fit in memory.
    with tempfile.TemporaryFile() as tmpf:
        with open(i, "rb") as f_in, gzip.GzipFile(mode="wb", fileobj=tmpf) as gzf:
            shutil.copyfileobj(f_in, gzf)
        # Rewind before upload: upload_fileobj reads from the current position.
        tmpf.seek(0)
        bucket.upload_fileobj(
            tmpf,
            fileName,
            ExtraArgs={"ContentType": "text/plain", "ContentEncoding": "gzip"},
        )

与其将文件下载到 /tmp 文件夹,不如将其读入内存缓冲区,因为 Lambda 的 /tmp 文件夹的存储空间有限。

# Read the S3 object body fully into memory and open it as a zip archive.
# NOTE(review): assumes `from io import BytesIO`, `import zipfile`, and that
# `file` is an s3.Object — none of these are shown in this fragment; confirm.
buffer = BytesIO(file.get()["Body"].read())
zipped = zipfile.ZipFile(buffer)

对于 gzip 压缩,您可以简单地使用这样的东西:

# Fix: the original fragment had broken indentation (the body was dedented
# out of the `with` block), which is not valid Python.
with zipped.open(file, "r") as f_in:
    # Compress the member's bytes in memory and upload via a BytesIO wrapper.
    gzipped_content = gzip.compress(f_in.read())
    destinationbucket.upload_fileobj(
        io.BytesIO(gzipped_content),
        final_file_path,
        ExtraArgs={"ContentType": "text/plain"},
    )

这里有关于同一件事的完整教程:https://medium.com/p/f7bccf0099c9

在上传之前先对内容进行 gzip 压缩,并将 ContentType 设为 text/plain、ContentEncoding 设为 gzip,这样 S3 就能正确地提供该文件。