Multipart upload to Amazon Glacier: Content-Range incompatible with Content-Length

I'm trying to upload a file of about 1 GB to Amazon Glacier. Somewhat arbitrarily, I decided to break it into 32 MB parts and upload them in sequence.

import math
import boto3
from botocore.utils import calculate_tree_hash

client = boto3.client('glacier')
vault_name = 'my-vault'
size = 1073745600 # in bytes
size_mb = size / (2**20) # Convert to megabytes for readability
local_file = 'filename'

multi_up = client.initiate_multipart_upload(vaultName=vault_name,
                                        archiveDescription=local_file,
                                        partSize=str(2**25)) # 32 MiB in bytes
parts = math.floor(size_mb / 32)
with open("/Users/alexchase/Desktop/{}".format(local_file), 'rb') as upload:
    for p in range(parts):
        # Calculate lower and upper bounds for the byte ranges. The last range
        # is bigger than the ones that come before.
        lower = (p * (2**25))
        upper = (((p + 1) * (2**25)) - 1) if (p + 1 < parts) else (size)
        up_part = client.upload_multipart_part(vaultName=vault_name,
                                           uploadId=multi_up['uploadId'],
                                           range='bytes {}-{}/*'.format(lower, upper),
                                           body=upload)
checksum = calculate_tree_hash(upload)
complete_up = client.complete_multipart_upload(archiveSize=str(size),
                                               checksum=checksum,
                                               uploadId=multi_up['uploadId'],
                                               vaultName=vault_name)

This generates an error about the first byte range:

---------------------------------------------------------------------------
InvalidParameterValueException            Traceback (most recent call last)
<ipython-input-2-9dd3ac986601> in <module>()
     93                         uploadId=multi_up['uploadId'],
     94                         range='bytes {}-{}/*'.format(lower, upper),
---> 95                         body=upload)
     96                     upload_info.append(up_part)
     97                 checksum = calculate_tree_hash(upload)

~/anaconda/lib/python3.5/site-packages/botocore/client.py in _api_call(self, *args, **kwargs)
    251                     "%s() only accepts keyword arguments." % py_operation_name)
    252             # The "self" in this scope is referring to the BaseClient.
--> 253             return self._make_api_call(operation_name, kwargs)
    254
    255         _api_call.__name__ = str(py_operation_name)

~/anaconda/lib/python3.5/site-packages/botocore/client.py in _make_api_call(self, operation_name, api_params)
    555             error_code = parsed_response.get("Error", {}).get("Code")
    556             error_class = self.exceptions.from_code(error_code)
--> 557             raise error_class(parsed_response, operation_name)
    558         else:
    559             return parsed_response

InvalidParameterValueException: An error occurred (InvalidParameterValueException) when calling the UploadMultipartPart operation:
Content-Range: bytes 0-33554431/* is incompatible with Content-Length: 1073745600

Can anyone see what I'm doing wrong?

Content-Range: bytes 0-33554431/* is incompatible with Content-Length: 1073745600

You're telling the API that you're sending the first 32 MiB, but you're actually sending (proposing to send) the entire file, since body=upload and upload isn't just the first part, it's the entire file. The Content-Length refers to the size of this part's upload, which should be 33554432 (32 MiB).

The docs do seem ambiguous...

body (bytes or seekable file-like object) -- The data to upload.

...但是 "data to upload" 似乎只引用这部分的数据,尽管 "seekable."

这个词
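
A minimal sketch of the fix (reusing the question's names client, vault_name, multi_up, and the lower offset computed in the loop): read exactly one part from the file and pass those bytes as body, so the implied Content-Length matches the Content-Range.

part_size = 2**25
with open(local_file, 'rb') as f:
    f.seek(lower)                    # jump to this part's starting offset
    part_bytes = f.read(part_size)   # at most 32 MiB, never the whole file
    client.upload_multipart_part(vaultName=vault_name,
                                 uploadId=multi_up['uploadId'],
                                 range='bytes {}-{}/*'.format(lower, lower + len(part_bytes) - 1),
                                 body=part_bytes)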

@Michael-sqlbot is quite right that the Content-Range problem was that I was passing the whole file instead of a single part. I fixed that with the read() method, but then I discovered a separate problem: according to the docs, the last part must be the same size as or smaller than the preceding parts. That means using math.ceil() instead of math.floor() to define the number of parts.
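
To see why, with the question's own numbers (a worked sketch, not part of the original answer):

import math

size = 1073745600            # total archive size in bytes
part_size = 2**25            # 33554432 bytes, i.e. 32 MiB

print(math.floor(size / part_size))  # 32 -> covers only 32 * 33554432 = 1073741824 bytes
print(math.ceil(size / part_size))   # 33 -> a final short part carries the remaining 3776 bytes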

The working code is:

import math
import boto3
from botocore.utils import calculate_tree_hash

client = boto3.client('glacier')
vault_name = 'my-vault'
size = 1073745600 # in bytes
size_mb = size / (2**20) # Convert to megabytes for readability
local_file = 'filename'
partSize = 2**25

multi_up = client.initiate_multipart_upload(vaultName=vault_name,
                                        archiveDescription=local_file,
                                        partSize=str(partSize)) # 32 MiB in bytes
parts = math.ceil(size_mb / 32) # the number of <= 32 MiB parts we need
with open("/Users/alexchase/Desktop/{}".format(local_file), 'rb') as upload:
    for p in range(parts):
        # Calculate lower and upper bounds for the byte ranges. The last range
        # is now smaller than the ones that come before.
        lower = (p * (partSize))
        upper = (((p + 1) * (partSize)) - 1) if (p + 1 < parts) else (size-1)
        read_size = upper-lower+1
        file_part = upload.read(read_size)
        up_part = client.upload_multipart_part(vaultName=vault_name,
                                           uploadId=multi_up['uploadId'],
                                           range='bytes {}-{}/*'.format(lower, upper),
                                           body=file_part)
    # rewind before hashing: the loop above has consumed the handle to EOF,
    # and calculate_tree_hash() reads from the current position
    upload.seek(0)
    checksum = calculate_tree_hash(upload)
complete_up = client.complete_multipart_upload(archiveSize=str(size),
                                               checksum=checksum,
                                               uploadId=multi_up['uploadId'],
                                               vaultName=vault_name)
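
As a quick sanity check (my addition, assuming the variables above), complete_multipart_upload returns the new archive's id along with the tree hash Glacier computed, which should match the one we sent:

# complete_up is the response from complete_multipart_upload above; per the
# boto3 Glacier docs it carries 'location', 'checksum' and 'archiveId' fields
assert complete_up['checksum'] == checksum
print('Archive ID:', complete_up['archiveId'])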

Since Alex's follow-up answer claims it "works", I'm posting another version that works under Python 3.5 and Ubuntu 16.04. I've also added some environment variables from our production end-to-end solution.

The original post gave me an error, so I tweaked it and cleaned it up a bit. Hopefully this helps someone who needs this Glacier functionality. Using a shell script with awscli commands isn't nearly as clean.

import math
import boto3
import os
from botocore.utils import calculate_tree_hash

vault_name = os.getenv('GLACIER_VAULT_NAME')
file_name = os.getenv('GLACIER_UPLOAD_FILE')

if vault_name is None:
    print('GLACIER_VAULT_NAME environment variable is required. Exiting.')
    exit(1)
if file_name is None:
    print('GLACIER_UPLOAD_FILE environment variable is required. Exiting.')
    exit(2)

chunk_size = 2 ** 25
client = boto3.client('glacier')

client.create_vault(vaultName=vault_name)

upload_obj = client.initiate_multipart_upload(vaultName=vault_name,
                                              archiveDescription=file_name,
                                              partSize=str(chunk_size))
file_size = os.path.getsize(file_name)
parts = math.ceil(file_size / chunk_size)

with open(file_name, 'rb') as upload:
    for p in range(parts):
        lower = p * chunk_size
        upper = lower + chunk_size - 1

        # clamp the final part's upper bound to the last byte of the file
        if upper > file_size - 1:
            upper = file_size - 1

        file_part = upload.read(chunk_size)

        up_part = client.upload_multipart_part(vaultName=vault_name,
                                               uploadId=upload_obj['uploadId'],
                                               range='bytes {}-{}/{}'.format(lower,
                                                                             upper,
                                                                             file_size),
                                               body=file_part)

# calculate_tree_hash() reads from the handle's current position, and the
# loop above has already consumed `upload` to EOF, so hash from a fresh handle
with open(file_name, 'rb') as hash_file:
    checksum = calculate_tree_hash(hash_file)
complete_up = client.complete_multipart_upload(vaultName=vault_name,
                                               uploadId=upload_obj['uploadId'],
                                               archiveSize=str(file_size),
                                               checksum=checksum)

print(complete_up)
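
One thing worth adding on top of this (my suggestion, not part of the answer): if any part fails, abort the multipart upload so Glacier does not keep the partial state around. A sketch reusing the script's names:

try:
    with open(file_name, 'rb') as upload:
        for p in range(parts):
            lower = p * chunk_size
            upper = min(lower + chunk_size - 1, file_size - 1)
            client.upload_multipart_part(vaultName=vault_name,
                                         uploadId=upload_obj['uploadId'],
                                         range='bytes {}-{}/{}'.format(lower, upper, file_size),
                                         body=upload.read(chunk_size))
except Exception:
    # abort_multipart_upload discards the parts uploaded so far
    client.abort_multipart_upload(vaultName=vault_name,
                                  uploadId=upload_obj['uploadId'])
    raise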