从多个文件字节流在内存中创建一个 tar 流

Creating a tar stream in memory from multiple file byte streams

我正在尝试在内存中创建一个 tar 流,向其中添加文件,然后将其保存到 S3。但是存在一些问题,tar 中的文件大小为零。任何人都可以建议吗?下面的代码片段-

def tar_and_upload(bucket, keys, dest_bucket):
    """Bundle the given S3 objects into an in-memory .tar.gz and upload it.

    Args:
        bucket: name of the source bucket holding ``keys``.
        keys: iterable of object keys to include in the archive; each member
            is named after the last path segment of its key.
        dest_bucket: bucket that receives the generated ``<uuid>.tar.gz``.

    Returns:
        None. Logs and returns early if the upload fails.
    """
    s3 = boto3.client('s3')
    file_obj = io.BytesIO()
    tar_file_obj = tarfile.open(mode="w:gz", fileobj=file_obj)
    for key in keys:
        obj = s3.get_object(Bucket=bucket, Key=key)
        _bytes = obj["Body"].read()
        _file_name = key.split("/")[-1]
        # BUG FIX: TarInfo.size defaults to 0 and addfile() expects a
        # file-like object, not raw bytes — without both, every archived
        # member ends up zero bytes long.
        info = tarfile.TarInfo(_file_name)
        info.size = len(_bytes)
        tar_file_obj.addfile(info, io.BytesIO(_bytes))
    tar_file_obj.close()
    try:
        obj_name = "{}.tar.gz".format(str(uuid.uuid4()))
        s3.put_object(Body=file_obj.getvalue(), Bucket=dest_bucket, Key=obj_name)
    except Exception as e:
        logging.error("Can't save tar to S3", exc_info=True)
        return

显然,在将字节流添加到 tar 时,我们需要明确指定大小。 示例代码-

import tarfile
import uuid
import io
import os

def tar_and_upload():
    file_obj = io.BytesIO()
    tar_file_obj = tarfile.open(mode = "w:gz", fileobj=file_obj)
    for filename in os.listdir("images"):
      print(filename)
      file_path = os.path.join("images", filename)
      #tar_file_obj.add(file_path)
      with open(file_path, "rb") as f:
        _bytes = f.read()
        tar_info = tarfile.TarInfo(filename)
        tar_info.size = len(_bytes)
        tar_file_obj.addfile(tar_info, io.BytesIO(_bytes))
    tar_file_obj.close()
    try:
        obj_name = "{}.tar.gz".format(str(uuid.uuid4()))
        object_path = os.path.join("temp", obj_name)
        with open(object_path, "wb") as f:
          f.write(file_obj.getvalue())
        print(obj_name)
    except Exception as e:
        print(str(e))

# NOTE(review): a later 3-argument tar_and_upload(bucket, keys, dest_bucket)
# definition appears below and shadows the zero-argument version, so running
# this script as-is would raise TypeError — the snippets are meant to be read
# separately, not executed together.
if __name__ == "__main__":
    tar_and_upload()
def tar_and_upload(bucket, keys, dest_bucket):
    """Bundle the given S3 objects into an in-memory .tar.gz and upload it,
    preserving each object's last-modified time as the tar member mtime.

    Args:
        bucket: name of the source bucket holding ``keys``.
        keys: iterable of object keys to include in the archive; each member
            is named after the last path segment of its key.
        dest_bucket: bucket that receives the generated ``<uuid>.tar.gz``.

    Returns:
        None. Logs and returns early if the upload fails.
    """
    s3 = boto3.client('s3')
    file_obj = io.BytesIO()
    tar_file_obj = tarfile.open(mode = "w:gz", fileobj=file_obj)
    response = {}
    for key in keys:
        obj = s3.get_object(Bucket=bucket, Key=key)
        _bytes = obj["Body"].read()
        _file_name = key.split("/")[-1]
        info = tarfile.TarInfo(_file_name)
        # Size of the bytes actually written; ContentLength should match,
        # but len() cannot disagree with the payload.
        info.size = len(_bytes)
        # get_object's response already includes LastModified — reusing it
        # avoids a second head_object round trip per key.
        info.mtime = obj['LastModified'].timestamp()
        tar_file_obj.addfile(info, io.BytesIO(_bytes))
    tar_file_obj.close()
    try:
        obj_name = "{}.tar.gz".format(str(uuid.uuid4()))
        s3.put_object(Body=file_obj.getvalue(), Bucket=dest_bucket, Key=obj_name)
    except Exception as e:
        logging.error("Can't save tar to S3", exc_info=True)
        return

供其他希望对 S3 对象执行相同操作的人参考。