How to save S3 object to a file using boto3
I'm trying to do a "hello world" with the new boto3 client for AWS.
My use case is fairly simple: get an object from S3 and save it to a file.
In boto 2.X I would do it like this:
import boto
key = boto.connect_s3().get_bucket('foo').get_key('foo')
key.get_contents_to_filename('/tmp/foo')
In boto 3, I can't find a clean way to do the same thing, so I'm manually iterating over the "Streaming" object:
import boto3
key = boto3.resource('s3').Object('fooo', 'docker/my-image.tar.gz').get()
with open('/tmp/my-image.tar.gz', 'wb') as f:  # binary mode: Body.read() returns bytes
    chunk = key['Body'].read(1024*8)
    while chunk:
        f.write(chunk)
        chunk = key['Body'].read(1024*8)
or
import boto3
key = boto3.resource('s3').Object('fooo', 'docker/my-image.tar.gz').get()
with open('/tmp/my-image.tar.gz', 'wb') as f:  # binary mode for byte chunks
    for chunk in iter(lambda: key['Body'].read(4096), b''):
        f.write(chunk)
And it works fine. I was wondering if there is any "native" boto3 function that will do the same task?
There was a recent customization added to Boto3 that helps with this (among other things). It is currently exposed on the low-level S3 client and can be used like this:
import boto3

s3_client = boto3.client('s3')
open('hello.txt', 'w').write('Hello, world!')  # 'w' mode so the file is writable
# Upload the file to S3
s3_client.upload_file('hello.txt', 'MyBucket', 'hello-remote.txt')
# Download the file from S3
s3_client.download_file('MyBucket', 'hello-remote.txt', 'hello2.txt')
print(open('hello2.txt').read())
These functions will automatically handle reading/writing files, as well as doing multipart uploads in parallel for large files.
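If you need to tune that transfer behavior (multipart threshold, concurrency), both functions accept a Config argument. A minimal sketch, assuming a 64 MB threshold and the same placeholder bucket/key names as above:

import boto3
from boto3.s3.transfer import TransferConfig

s3_client = boto3.client('s3')

# Switch to multipart transfers above 64 MB, with up to 4 parallel threads
config = TransferConfig(multipart_threshold=64 * 1024 * 1024,
                        max_concurrency=4)
s3_client.download_file('MyBucket', 'hello-remote.txt', 'hello2.txt',
                        Config=config)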
Note that s3_client.download_file won't create a directory. It can be created with pathlib.Path('/path/to/file.txt').parent.mkdir(parents=True, exist_ok=True).
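For example (the nested path here is just a placeholder):

import pathlib

import boto3

s3_client = boto3.client('s3')
local_path = pathlib.Path('/tmp/some/nested/dir/file.txt')
# Create any missing parent directories before downloading
local_path.parent.mkdir(parents=True, exist_ok=True)
s3_client.download_file('MyBucket', 'hello-remote.txt', str(local_path))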
boto3 now has a nicer interface than the client:
resource = boto3.resource('s3')
my_bucket = resource.Bucket('MyBucket')
my_bucket.download_file(key, local_filename)  # key and local_filename are placeholders
This by itself isn't dramatically better than the client in the accepted answer (although the documentation says it does a better job retrying uploads and downloads on failure), but considering that resources are generally more ergonomic (for example, the s3 bucket and object resources are nicer than the client methods), this does allow you to stay at the resource layer without having to drop down.
Resources can generally be created in the same way as clients; they take all or most of the same arguments and just forward them to their internal clients.
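For instance, session-level settings such as a named profile or region apply to resources just as they do to clients (the profile and region below are placeholders):

import boto3

session = boto3.Session(profile_name='my-profile', region_name='us-east-1')
resource = session.resource('s3')
resource.Bucket('MyBucket').download_file('hello-remote.txt', 'hello2.txt')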
For those of you who want to simulate the boto2-like set_contents_from_string method, you can try:
import boto3
from cStringIO import StringIO
s3c = boto3.client('s3')
contents = 'My string to save to S3 object'
target_bucket = 'hello-world.by.vor'
target_file = 'data/hello.txt'
fake_handle = StringIO(contents)
# note: fake_handle.read() returns the contents just like reading a file handle
s3c.put_object(Bucket=target_bucket, Key=target_file, Body=fake_handle.read())
For Python 3:
In Python 3, both StringIO and cStringIO are gone. Use the StringIO import instead:
from io import StringIO
To support both versions:
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
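Keep in mind that put_object ultimately wants bytes in Python 3, so an encoded str or io.BytesIO is often the better fit there. A small sketch reusing the bucket and key from above:

import io

import boto3

s3c = boto3.client('s3')
contents = 'My string to save to S3 object'

# Pass the encoded string directly...
s3c.put_object(Bucket='hello-world.by.vor', Key='data/hello.txt',
               Body=contents.encode('utf-8'))

# ...or wrap it in an in-memory binary file handle
s3c.put_object(Bucket='hello-world.by.vor', Key='data/hello.txt',
               Body=io.BytesIO(contents.encode('utf-8')))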
# Preface: the file is JSON with contents: {"name": "Android", "status": "ERROR"}
import io
import json

import boto3

s3 = boto3.resource('s3')
obj = s3.Object('my-bucket', 'key-to-file.json')
data = io.BytesIO()
obj.download_fileobj(data)  # download into an in-memory buffer
# data now holds the raw bytes; convert them to a dict:
new_dict = json.loads(data.getvalue().decode("utf-8"))
print(new_dict['status'])
# Should print "ERROR"
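The reverse direction works the same way with upload_fileobj, which lets you write a dict back as JSON without touching the filesystem. A minimal sketch with the same placeholder bucket and key:

import io
import json

import boto3

s3 = boto3.resource('s3')
payload = {'name': 'Android', 'status': 'ERROR'}
# Serialize the dict to bytes and stream it straight to S3
s3.Object('my-bucket', 'key-to-file.json').upload_fileobj(
    io.BytesIO(json.dumps(payload).encode('utf-8')))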
Note: I'm assuming you have configured authentication separately. The code below downloads a single object from an S3 bucket.
import boto3

# Initiate the S3 resource
s3 = boto3.resource('s3')
# Download the object to a local file
s3.Bucket('mybucket').download_file('hello.txt', '/tmp/hello.txt')
When you want to read a file with a configuration different from the default one, feel free to use either mpu.aws.s3_download(s3path, destination) directly or copy-paste the code:
import os

import boto3


def s3_download(source, destination,
                exists_strategy='raise',
                profile_name=None):
    """
    Copy a file from an S3 source to a local destination.

    Parameters
    ----------
    source : str
        Path starting with s3://, e.g. 's3://bucket-name/key/foo.bar'
    destination : str
    exists_strategy : {'raise', 'replace', 'abort'}
        What is done when the destination already exists?
    profile_name : str, optional
        AWS profile

    Raises
    ------
    botocore.exceptions.NoCredentialsError
        Botocore is not able to find your credentials. Either specify
        profile_name or add the environment variables AWS_ACCESS_KEY_ID,
        AWS_SECRET_ACCESS_KEY and AWS_SESSION_TOKEN.
        See https://boto3.readthedocs.io/en/latest/guide/configuration.html
    """
    exists_strategies = ['raise', 'replace', 'abort']
    if exists_strategy not in exists_strategies:
        raise ValueError('exists_strategy \'{}\' is not in {}'
                         .format(exists_strategy, exists_strategies))
    session = boto3.Session(profile_name=profile_name)
    s3 = session.resource('s3')
    bucket_name, key = _s3_path_split(source)
    if os.path.isfile(destination):
        if exists_strategy == 'raise':
            raise RuntimeError('File \'{}\' already exists.'
                               .format(destination))
        elif exists_strategy == 'abort':
            return
    s3.Bucket(bucket_name).download_file(key, destination)
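Usage would look like this (the path and profile are placeholders):

s3_download('s3://bucket-name/key/foo.bar', '/tmp/foo.bar',
            exists_strategy='replace', profile_name='my-profile')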
from collections import namedtuple

S3Path = namedtuple("S3Path", ["bucket_name", "key"])


def _s3_path_split(s3_path):
    """
    Split an S3 path into bucket and key.

    Parameters
    ----------
    s3_path : str

    Returns
    -------
    splitted : (str, str)
        (bucket, key)

    Examples
    --------
    >>> _s3_path_split('s3://my-bucket/foo/bar.jpg')
    S3Path(bucket_name='my-bucket', key='foo/bar.jpg')
    """
    if not s3_path.startswith("s3://"):
        raise ValueError(
            "s3_path is expected to start with 's3://', but was {}"
            .format(s3_path))
    bucket_key = s3_path[len("s3://"):]
    bucket_name, key = bucket_key.split("/", 1)
    return S3Path(bucket_name, key)
If you want to download a version of the file, you need to use get_object:
import boto3

bucket = 'bucketName'
prefix = 'path/to/file/'
filename = 'fileName.ext'

s3c = boto3.client('s3')
s3r = boto3.resource('s3')

if __name__ == '__main__':
    # Download every version of the object, one local file per version
    for version in s3r.Bucket(bucket).object_versions.filter(Prefix=prefix + filename):
        file = version.get()
        version_id = file.get('VersionId')
        obj = s3c.get_object(
            Bucket=bucket,
            Key=prefix + filename,
            VersionId=version_id,
        )
        with open(f"{filename}.{version_id}", 'wb') as f:
            for chunk in obj['Body'].iter_chunks(chunk_size=4096):
                f.write(chunk)
Reference: https://botocore.amazonaws.com/v1/documentation/api/latest/reference/response.html
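If you already know the VersionId you want, download_file can also fetch a single version directly via ExtraArgs, so you don't have to stream the body yourself (the version ID below is a placeholder):

import boto3

s3c = boto3.client('s3')
s3c.download_file('bucketName', 'path/to/file/fileName.ext',
                  '/tmp/fileName.ext.v1',
                  ExtraArgs={'VersionId': 'some-version-id'})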