python 的 PDF 数据流
PDF data stream with python
上下文:我的代码从一些 PNG 文档中获取一组坐标,然后对某些字段进行遮盖(redaction)处理(即使用这些坐标在相应区域绘制矩形)。
我希望我的最终输出是一个 pdf 文件,每个经过编辑的图像作为页面。我可以毫无问题地使用 fpdf 包实现此目的。
但是,我打算将此 pdf 文件作为电子邮件(base64 编码)附件发送。有没有办法从 fpdf 输出中获取 base64 字符串?
最重要的是,我可以在 fpdf 图像方法中使用图像二进制字符串吗?
请参阅下面的 redacted_pdf 方法(我在其中加了一些注释以便说明得更清楚)
代码:
class Redaction:
    """Paint filled polygons over regions of page images and build a PDF from them.

    Each entry of ``png_image_list`` becomes one PDF page.  The polygons to
    paint on a page are the rows of ``df_coordinates`` whose ``page_number``
    equals that page's 1-based position in the list.
    """

    def __init__(self, png_image_list, df_coordinates):
        """Store the inputs; no processing happens here.

        Args:
            png_image_list: sequence of encoded images as bytes, one per page
                (each item is wrapped in ``io.BytesIO`` and opened with
                ``Image.open``).
            df_coordinates: table supporting boolean-mask indexing and
                ``.iterrows()`` (presumably a pandas DataFrame -- confirm with
                the caller) with the columns ``page_number``, ``x0``..``x3``
                and ``y0``..``y3``.
        """
        self.png_image_list = png_image_list
        self.df_coordinates = df_coordinates

    def _redact_images(self):
        """Return one image object per page with its polygons drawn on it.

        The x/y values are multiplied by the image width/height, so they are
        presumably fractions of the page size -- confirm with the producer of
        ``df_coordinates``.
        """
        redacted_images_bin = []
        for page_num, page_data in enumerate(self.png_image_list):
            im_page = Image.open(io.BytesIO(page_data))
            draw = ImageDraw.Draw(im_page)
            width, height = im_page.size
            # page_number in the table is 1-based, enumerate() is 0-based
            on_this_page = self.df_coordinates['page_number'] == page_num + 1
            df_filtered = self.df_coordinates[on_this_page]
            for _, row in df_filtered.iterrows():
                # Flat [x0, y0, x1, y1, x2, y2, x3, y3] list in pixel units
                coords = []
                for corner in range(4):
                    coords.append(row[f'x{corner}'] * width)
                    coords.append(row[f'y{corner}'] * height)
                draw.polygon(coords, outline='blue', fill='yellow')
            redacted_images_bin.append(im_page)
        return redacted_images_bin

    def redacted_pdf(self):
        """Write the redacted pages to ``doc.pdf``, one image per page.

        Every image goes through a temporary ``image_<index>.png`` file in the
        current working directory because ``pdf.image`` is given a file name.
        The temporary file is removed even when adding the page fails.
        """
        redacted_images = self._redact_images()
        pdf = FPDF()
        pdf.set_auto_page_break(0)
        for index, img_redacted in enumerate(redacted_images):
            image_path = f"image_{index}.png"
            img_redacted.save(image_path)
            try:
                pdf.add_page()
                # 210 x 297 is presumably the full A4 page in millimetres
                pdf.image(image_path, w=210, h=297)
            finally:
                # Do not leave the temporary PNG behind if the page fails
                os.remove(image_path)  # I would like to avoid file handling!
        pdf.output("doc.pdf", "F")  # I would like to avoid file handling!
        # return pdf  # this is what I want, to return the pdf as base64 or binary
在 documentation 中,我发现可以用下面的方式获取字符串形式的 PDF:
pdf_string = pdf.output(dest='S')
所以你可以使用标准模块base64
import fpdf
import base64
pdf = fpdf.FPDF()
# ... add some elements ...

# With dest='S' PyFPDF returns the document as a str in which every
# character stands for exactly one byte of the PDF.
pdf_string = pdf.output(dest='S')
# latin-1 maps code points 0-255 one-to-one onto bytes.  Encoding with utf-8
# instead turns every byte >= 0x80 into two bytes, which corrupts the
# compressed streams and invalidates the /Length and xref offsets.
pdf_bytes = pdf_string.encode('latin-1')
base64_bytes = base64.b64encode(pdf_bytes)
base64_string = base64_bytes.decode('utf-8')
print(base64_string)
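结果(注意:下面这段示例输出是用 'utf-8' 编码生成的,压缩流中的非 ASCII 字节已被改写,例如 zlib 头 78 9C 变成了 78 C2 9C,因此解码后并不是有效的 PDF;应使用 'latin-1' 编码):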
JVBERi0xLjMKMyAwIG9iago8PC9UeXBlIC9QYWdlCi9QYXJlbnQgMSAwIFIKL1Jlc291cmNlcyAyIDAgUgovQ29udGVudHMgNCAwIFI+PgplbmRvYmoKNCAwIG9iago8PC9GaWx0ZXIgL0ZsYXRlRGVjb2RlIC9MZW5ndGggMTk+PgpzdHJlYW0KeMKcM1LDsMOiMsOQMzVXKMOnAgALw7wCEgplbmRzdHJlYW0KZW5kb2JqCjEgMCBvYmoKPDwvVHlwZSAvUGFnZXMKL0tpZHMgWzMgMCBSIF0KL0NvdW50IDEKL01lZGlhQm94IFswIDAgNTk1LjI4IDg0MS44OV0KPj4KZW5kb2JqCjIgMCBvYmoKPDwKL1Byb2NTZXQgWy9QREYgL1RleHQgL0ltYWdlQiAvSW1hZ2VDIC9JbWFnZUldCi9Gb250IDw8Cj4+Ci9YT2JqZWN0IDw8Cj4+Cj4+CmVuZG9iago1IDAgb2JqCjw8Ci9Qcm9kdWNlciAoUHlGUERGIDEuNy4yIGh0dHA6Ly9weWZwZGYuZ29vZ2xlY29kZS5jb20vKQovQ3JlYXRpb25EYXRlIChEOjIwMjIwMjE3MjExMDE3KQo+PgplbmRvYmoKNiAwIG9iago8PAovVHlwZSAvQ2F0YWxvZwovUGFnZXMgMSAwIFIKL09wZW5BY3Rpb24gWzMgMCBSIC9GaXRIIG51bGxdCi9QYWdlTGF5b3V0IC9PbmVDb2x1bW4KPj4KZW5kb2JqCnhyZWYKMCA3CjAwMDAwMDAwMDAgNjU1MzUgZiAKMDAwMDAwMDE3NSAwMDAwMCBuIAowMDAwMDAwMjYyIDAwMDAwIG4gCjAwMDAwMDAwMDkgMDAwMDAgbiAKMDAwMDAwMDA4NyAwMDAwMCBuIAowMDAwMDAwMzU2IDAwMDAwIG4gCjAwMDAwMDA0NjUgMDAwMDAgbiAKdHJhaWxlcgo8PAovU2l6ZSA3Ci9Sb290IDYgMCBSCi9JbmZvIDUgMCBSCj4+CnN0YXJ0eHJlZgo1NjgKJSVFT0YK
至于 `image()`:它需要 `filename`(或 `url`),不能与 `string` 或 `io.BytesIO()` 一起使用。
最后,你也可以获取它的 source code 并尝试自行修改。
GitHub 上甚至有人提出了相关请求:Support for StringIO objects as images
编辑:
我发现有一个分支 `fpdf2`,它可以在 `image()` 中使用 `pillow.Image` —— 参见 fpdf2 Image;并且我在其 source code 中发现 `image()` 也可以与 `io.BytesIO()` 一起使用。
`fpdf2` 的示例代码(`output()` 返回 `bytes` 而不是 `string`):
import fpdf
import base64
from PIL import Image
import io
#print(fpdf.__version__)
pdf = fpdf.FPDF()
pdf.add_page()

# image() is called here with five kinds of source in turn.
# 1. a local file name
pdf.image('lenna.png')

# 2. a URL
pdf.image('https://upload.wikimedia.org/wikipedia/en/7/7d/Lenna_%28test_image%29.png')

# 3. an open binary file object (closed again by the with-block)
with open('lenna.png', 'rb') as f:
    pdf.image(f)

# 4. a PIL image
with Image.open('lenna.png') as img:
    pdf.image(img)

# 5. an in-memory buffer
with open('lenna.png', 'rb') as f:
    b = io.BytesIO(f.read())
pdf.image(b)

# save in file
pdf.output('output.pdf')

# get as bytes
pdf_bytes = pdf.output()
#print(pdf_bytes)

base64_bytes = base64.b64encode(pdf_bytes)
base64_string = base64_bytes.decode('utf-8')
print(base64_string)
在 `fpdf2` 中写入文件的测试:
import fpdf
pdf = fpdf.FPDF()
pdf.add_page()
pdf.image('https://upload.wikimedia.org/wikipedia/en/7/7d/Lenna_%28test_image%29.png')

# --- test 1 --- let output() write the file itself
pdf.output('output-test-1.pdf')

# --- test 2 --- take the bytes and write them inside a context manager
pdf_bytes = pdf.output()
with open('output-test-2.pdf', 'wb') as f:  # it will close automatically
    f.write(pdf_bytes)

# --- test 3 --- take the bytes and write them with explicit open/close
pdf_bytes = pdf.output()
f = open('output-test-3.pdf', 'wb')
try:
    f.write(pdf_bytes)
finally:
    f.close()  # don't forget to close when you write
上下文:我的代码从一些 PNG 文档中获取一组坐标,然后对某些字段进行遮盖(redaction)处理(即使用这些坐标在相应区域绘制矩形)。
我希望我的最终输出是一个 pdf 文件,每个经过编辑的图像作为页面。我可以毫无问题地使用 fpdf 包实现此目的。
但是,我打算将此 pdf 文件作为电子邮件(base64 编码)附件发送。有没有办法从 fpdf 输出中获取 base64 字符串?
最重要的是,我可以在 fpdf 图像方法中使用图像二进制字符串吗?
请参阅下面的 redacted_pdf 方法(我在其中加了一些注释以便说明得更清楚)
代码:
class Redaction:
    """Paint filled polygons over regions of page images and build a PDF from them.

    Each entry of ``png_image_list`` becomes one PDF page.  The polygons to
    paint on a page are the rows of ``df_coordinates`` whose ``page_number``
    equals that page's 1-based position in the list.
    """

    def __init__(self, png_image_list, df_coordinates):
        """Store the inputs; no processing happens here.

        Args:
            png_image_list: sequence of encoded images as bytes, one per page
                (each item is wrapped in ``io.BytesIO`` and opened with
                ``Image.open``).
            df_coordinates: table supporting boolean-mask indexing and
                ``.iterrows()`` (presumably a pandas DataFrame -- confirm with
                the caller) with the columns ``page_number``, ``x0``..``x3``
                and ``y0``..``y3``.
        """
        self.png_image_list = png_image_list
        self.df_coordinates = df_coordinates

    def _redact_images(self):
        """Return one image object per page with its polygons drawn on it.

        The x/y values are multiplied by the image width/height, so they are
        presumably fractions of the page size -- confirm with the producer of
        ``df_coordinates``.
        """
        redacted_images_bin = []
        for page_num, page_data in enumerate(self.png_image_list):
            im_page = Image.open(io.BytesIO(page_data))
            draw = ImageDraw.Draw(im_page)
            width, height = im_page.size
            # page_number in the table is 1-based, enumerate() is 0-based
            on_this_page = self.df_coordinates['page_number'] == page_num + 1
            df_filtered = self.df_coordinates[on_this_page]
            for _, row in df_filtered.iterrows():
                # Flat [x0, y0, x1, y1, x2, y2, x3, y3] list in pixel units
                coords = []
                for corner in range(4):
                    coords.append(row[f'x{corner}'] * width)
                    coords.append(row[f'y{corner}'] * height)
                draw.polygon(coords, outline='blue', fill='yellow')
            redacted_images_bin.append(im_page)
        return redacted_images_bin

    def redacted_pdf(self):
        """Write the redacted pages to ``doc.pdf``, one image per page.

        Every image goes through a temporary ``image_<index>.png`` file in the
        current working directory because ``pdf.image`` is given a file name.
        The temporary file is removed even when adding the page fails.
        """
        redacted_images = self._redact_images()
        pdf = FPDF()
        pdf.set_auto_page_break(0)
        for index, img_redacted in enumerate(redacted_images):
            image_path = f"image_{index}.png"
            img_redacted.save(image_path)
            try:
                pdf.add_page()
                # 210 x 297 is presumably the full A4 page in millimetres
                pdf.image(image_path, w=210, h=297)
            finally:
                # Do not leave the temporary PNG behind if the page fails
                os.remove(image_path)  # I would like to avoid file handling!
        pdf.output("doc.pdf", "F")  # I would like to avoid file handling!
        # return pdf  # this is what I want, to return the pdf as base64 or binary
在 documentation 中,我发现可以用下面的方式获取字符串形式的 PDF:
pdf_string = pdf.output(dest='S')
所以你可以使用标准模块base64
import fpdf
import base64
pdf = fpdf.FPDF()
# ... add some elements ...

# With dest='S' PyFPDF returns the document as a str in which every
# character stands for exactly one byte of the PDF.
pdf_string = pdf.output(dest='S')
# latin-1 maps code points 0-255 one-to-one onto bytes.  Encoding with utf-8
# instead turns every byte >= 0x80 into two bytes, which corrupts the
# compressed streams and invalidates the /Length and xref offsets.
pdf_bytes = pdf_string.encode('latin-1')
base64_bytes = base64.b64encode(pdf_bytes)
base64_string = base64_bytes.decode('utf-8')
print(base64_string)
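结果(注意:下面这段示例输出是用 'utf-8' 编码生成的,压缩流中的非 ASCII 字节已被改写,例如 zlib 头 78 9C 变成了 78 C2 9C,因此解码后并不是有效的 PDF;应使用 'latin-1' 编码):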
JVBERi0xLjMKMyAwIG9iago8PC9UeXBlIC9QYWdlCi9QYXJlbnQgMSAwIFIKL1Jlc291cmNlcyAyIDAgUgovQ29udGVudHMgNCAwIFI+PgplbmRvYmoKNCAwIG9iago8PC9GaWx0ZXIgL0ZsYXRlRGVjb2RlIC9MZW5ndGggMTk+PgpzdHJlYW0KeMKcM1LDsMOiMsOQMzVXKMOnAgALw7wCEgplbmRzdHJlYW0KZW5kb2JqCjEgMCBvYmoKPDwvVHlwZSAvUGFnZXMKL0tpZHMgWzMgMCBSIF0KL0NvdW50IDEKL01lZGlhQm94IFswIDAgNTk1LjI4IDg0MS44OV0KPj4KZW5kb2JqCjIgMCBvYmoKPDwKL1Byb2NTZXQgWy9QREYgL1RleHQgL0ltYWdlQiAvSW1hZ2VDIC9JbWFnZUldCi9Gb250IDw8Cj4+Ci9YT2JqZWN0IDw8Cj4+Cj4+CmVuZG9iago1IDAgb2JqCjw8Ci9Qcm9kdWNlciAoUHlGUERGIDEuNy4yIGh0dHA6Ly9weWZwZGYuZ29vZ2xlY29kZS5jb20vKQovQ3JlYXRpb25EYXRlIChEOjIwMjIwMjE3MjExMDE3KQo+PgplbmRvYmoKNiAwIG9iago8PAovVHlwZSAvQ2F0YWxvZwovUGFnZXMgMSAwIFIKL09wZW5BY3Rpb24gWzMgMCBSIC9GaXRIIG51bGxdCi9QYWdlTGF5b3V0IC9PbmVDb2x1bW4KPj4KZW5kb2JqCnhyZWYKMCA3CjAwMDAwMDAwMDAgNjU1MzUgZiAKMDAwMDAwMDE3NSAwMDAwMCBuIAowMDAwMDAwMjYyIDAwMDAwIG4gCjAwMDAwMDAwMDkgMDAwMDAgbiAKMDAwMDAwMDA4NyAwMDAwMCBuIAowMDAwMDAwMzU2IDAwMDAwIG4gCjAwMDAwMDA0NjUgMDAwMDAgbiAKdHJhaWxlcgo8PAovU2l6ZSA3Ci9Sb290IDYgMCBSCi9JbmZvIDUgMCBSCj4+CnN0YXJ0eHJlZgo1NjgKJSVFT0YK
至于 `image()`:它需要 `filename`(或 `url`),不能与 `string` 或 `io.BytesIO()` 一起使用。
最后,你也可以获取它的 source code 并尝试自行修改。
GitHub 上甚至有人提出了相关请求:Support for StringIO objects as images
编辑:
我发现有一个分支 `fpdf2`,它可以在 `image()` 中使用 `pillow.Image` —— 参见 fpdf2 Image;并且我在其 source code 中发现 `image()` 也可以与 `io.BytesIO()` 一起使用。
`fpdf2` 的示例代码(`output()` 返回 `bytes` 而不是 `string`):
import fpdf
import base64
from PIL import Image
import io
#print(fpdf.__version__)
pdf = fpdf.FPDF()
pdf.add_page()

# image() is called here with five kinds of source in turn.
# 1. a local file name
pdf.image('lenna.png')

# 2. a URL
pdf.image('https://upload.wikimedia.org/wikipedia/en/7/7d/Lenna_%28test_image%29.png')

# 3. an open binary file object (closed again by the with-block)
with open('lenna.png', 'rb') as f:
    pdf.image(f)

# 4. a PIL image
with Image.open('lenna.png') as img:
    pdf.image(img)

# 5. an in-memory buffer
with open('lenna.png', 'rb') as f:
    b = io.BytesIO(f.read())
pdf.image(b)

# save in file
pdf.output('output.pdf')

# get as bytes
pdf_bytes = pdf.output()
#print(pdf_bytes)

base64_bytes = base64.b64encode(pdf_bytes)
base64_string = base64_bytes.decode('utf-8')
print(base64_string)
在 `fpdf2` 中写入文件的测试:
import fpdf
pdf = fpdf.FPDF()
pdf.add_page()
pdf.image('https://upload.wikimedia.org/wikipedia/en/7/7d/Lenna_%28test_image%29.png')

# --- test 1 --- let output() write the file itself
pdf.output('output-test-1.pdf')

# --- test 2 --- take the bytes and write them inside a context manager
pdf_bytes = pdf.output()
with open('output-test-2.pdf', 'wb') as f:  # it will close automatically
    f.write(pdf_bytes)

# --- test 3 --- take the bytes and write them with explicit open/close
pdf_bytes = pdf.output()
f = open('output-test-3.pdf', 'wb')
try:
    f.write(pdf_bytes)
finally:
    f.close()  # don't forget to close when you write