将 pymupdf fitz 对象作为 pdf 保存到 s3
Saving a pymupdf fitz object to s3 as a pdf
我正在尝试使用 lambda 裁剪 pdf 并将其保存到具有相同名称的 s3。我收到数据类型错误 fitz.fitz.page
import os
import json
import boto3
from urllib.parse import unquote_plus
import fitz, sys
from io import BytesIO
OUTPUT_BUCKET_NAME = os.environ["OUTPUT_BUCKET_NAME"]
OUTPUT_S3_PREFIX = os.environ["OUTPUT_S3_PREFIX"]
SNS_TOPIC_ARN = os.environ["SNS_TOPIC_ARN"]
SNS_ROLE_ARN = os.environ["SNS_ROLE_ARN"]
def lambda_handler(event, context):
textract = boto3.client("textract")
if event:
file_obj = event["Records"][0]
bucketname = str(file_obj["s3"]["bucket"]["name"])
filename = unquote_plus(str(file_obj["s3"]["object"]["key"]))
doc = fitz.open()
s3 = boto3.resource('s3')
obj = s3.Object(bucketname, filename)
fs = obj.get()['Body'].read()
pdf=fitz.open("pdf", stream=BytesIO(fs))
#pdf.close()
rect=fitz.Rect(0.0, 0.0, 595.0, 842.0)
#page = pdf[0]
page1 = doc.new_page(width = rect.width, # new page with ...
height = rect.height)
page1.show_pdf_page(rect, pdf, 0)
print(type(doc))
print(type(page1))
s3.Bucket(bucketname).put_object(Key=filename, Body=page1)
发生这种情况是因为 page1 对象是使用 fitz.fitz.page
定义的,并且 S3 put 对象期望的类型是字节。
为了解决这个问题,您可以使用新 PDF (doc
) 的 write
函数并获取它的字节格式输出,您可以将其传递给 S3那么.
# Save fil first.
new_bytes = doc.write()
s3.Bucket(bucketname).put_object(Key=filename, Body=new_bytes)
出于某种原因,doc.write()
方法没有 return 如上所述的字节对象。这是创建新 doc
的另一种方法,使用 BytesIO
转换为字节,然后将其作为 pdf
:
保存到 s3
import fitz
from io import BytesIO
client = boto3.client("s3")
# create new doc object
single_page = fitz.open()
# insert a page from original_pdf_doc
single_page.insert_pdf(
original_pdf_doc, from_page=from_page_num, to_page=to_page_num
)
# Use BytesIO and .write() method to save to a bytes object
bytes_ = BytesIO(single_page.write())
# Upload the bytes object!
client.put_object(Body=bytes_, Bucket=bucket, Key=key)
我正在尝试使用 lambda 裁剪 pdf 并将其保存到具有相同名称的 s3。我收到数据类型错误 fitz.fitz.page
import os
import json
import boto3
from urllib.parse import unquote_plus
import fitz, sys
from io import BytesIO
OUTPUT_BUCKET_NAME = os.environ["OUTPUT_BUCKET_NAME"]
OUTPUT_S3_PREFIX = os.environ["OUTPUT_S3_PREFIX"]
SNS_TOPIC_ARN = os.environ["SNS_TOPIC_ARN"]
SNS_ROLE_ARN = os.environ["SNS_ROLE_ARN"]
def lambda_handler(event, context):
textract = boto3.client("textract")
if event:
file_obj = event["Records"][0]
bucketname = str(file_obj["s3"]["bucket"]["name"])
filename = unquote_plus(str(file_obj["s3"]["object"]["key"]))
doc = fitz.open()
s3 = boto3.resource('s3')
obj = s3.Object(bucketname, filename)
fs = obj.get()['Body'].read()
pdf=fitz.open("pdf", stream=BytesIO(fs))
#pdf.close()
rect=fitz.Rect(0.0, 0.0, 595.0, 842.0)
#page = pdf[0]
page1 = doc.new_page(width = rect.width, # new page with ...
height = rect.height)
page1.show_pdf_page(rect, pdf, 0)
print(type(doc))
print(type(page1))
s3.Bucket(bucketname).put_object(Key=filename, Body=page1)
发生这种情况是因为 page1 对象是使用 fitz.fitz.page
定义的,并且 S3 put 对象期望的类型是字节。
为了解决这个问题,您可以使用新 PDF (doc
) 的 write
函数并获取它的字节格式输出,您可以将其传递给 S3那么.
# Save fil first.
new_bytes = doc.write()
s3.Bucket(bucketname).put_object(Key=filename, Body=new_bytes)
出于某种原因,doc.write()
方法没有 return 如上所述的字节对象。这是创建新 doc
的另一种方法,使用 BytesIO
转换为字节,然后将其作为 pdf
:
s3
import fitz
from io import BytesIO
client = boto3.client("s3")
# create new doc object
single_page = fitz.open()
# insert a page from original_pdf_doc
single_page.insert_pdf(
original_pdf_doc, from_page=from_page_num, to_page=to_page_num
)
# Use BytesIO and .write() method to save to a bytes object
bytes_ = BytesIO(single_page.write())
# Upload the bytes object!
client.put_object(Body=bytes_, Bucket=bucket, Key=key)