提取 PDF 的特定页面并使用 Python 保存
Extract specific pages of a PDF and save them with Python
我有一些源 PDF 文件,想编写代码从中提取部分页面并生成新的 pdf 文件。
我有一个看起来像这样的列表
information = [(filename1,startpage1,endpage1), (filename2, startpage2, endpage2), ...,(filename19,startpage19,endpage19)].
这是我的代码。
from PyPDF2 import PdfFileReader, PdfFileWriter
# Question's original attempt: for each entry of `information`, copy a page
# range from example.pdf into a new PDF file.
# NOTE(review): the indentation of this snippet was lost; the nesting below is
# reconstructed and should be confirmed against the original post.
reader = PdfFileReader("example.pdf")
# NOTE(review): the loop bound is the PDF's page count minus one, yet `page`
# is used as an index into `information` - confirm whether
# len(information) was intended.
for page in range(reader.getNumPages() - 1):
    writer = PdfFileWriter()  # fresh writer for each entry
    start = information[page][1]
    end = information[page][2]
    # `start < end` stops before `end`, so an entry whose start equals its end
    # adds no pages at all.
    # NOTE(review): `start` is passed to getPage() unshifted - confirm whether
    # the page numbers in `information` are 0-based or 1-based.
    while start < end:
        writer.addPage(reader.getPage(start))
        start += 1
    # The output name is built from the entry's filename, start and end page.
    output_filename = "{}_{}_page_{}.pdf".format(
        information[page][0], information[page][1], information[page][2]
    )
    with open(output_filename, "wb") as out:
        writer.write(out)
但是输出很奇怪..有些里面什么都没有,有些只有一页。我该如何纠正?
我已经解决了这个问题。问题出在比较运算符上:循环条件需要包含等号,即 start <= end(而不是 start < end)。
# Extract every (filename, start, end) entry of `information` from the already
# opened `pdfReader` and save each range as its own PDF inside `savepath`.

# Create the output directory once up front instead of re-checking it for
# every entry.
os.makedirs(savepath, exist_ok=True)

for entry in information:
    pdf_writer = PyPDF2.PdfFileWriter()
    start = entry[1]
    end = entry[2]
    # `end` is inclusive, hence `<=`; the `start - 1` index presumably converts
    # 1-based page numbers to the reader's 0-based indices.
    while start <= end:
        pdf_writer.addPage(pdfReader.getPage(start - 1))
        start += 1
    output_filename = '{}_{}_page_{}.pdf'.format(entry[0], entry[1], entry[2])
    # Write into `savepath`; previously the directory was created but the file
    # was still written to the current working directory.
    with open(os.path.join(savepath, output_filename), 'wb') as out:
        pdf_writer.write(out)
完整代码:我修改了 SSS 的答案,使其可移植、更灵活,并支持并发处理多个源 PDF。
我无法测试 ThreadPoolExecutor 和 ProcessPoolExecutor 之间的性能差异,但我假设提取过程的瓶颈在于 PDF 的读写(I/O),而不是 getPage 和 addPage 调用。
import concurrent.futures
from multiprocessing import freeze_support
from pathlib import Path
from PyPDF2 import PdfFileReader, PdfFileWriter
def pdf_extract(pdf, segments):
    """Write each requested page range of *pdf* to its own PDF file.

    Each output file is placed in the same directory as the source and is
    named ``<stem>_pages_<start>-<end><suffix>``.

    Args:
        pdf: Path to the source PDF (``str`` or ``Path``).
        segments: Iterable of page ranges, each either a ``(start, end)``
            pair or a mapping ``{'start': int, 'end': int}``. Page numbers
            are 1-based and ``end`` is inclusive.
    """
    source = Path(pdf)  # loop-invariant, so build it once
    with open(source, 'rb') as read_stream:
        pdf_reader = PdfFileReader(read_stream)
        for segment in segments:
            # support {'start': 3, 'end': 3} or (start, end)
            try:
                start_page, end_page = segment['start'], segment['end']
            except TypeError:
                # Tuples/lists reject string indices: treat it as a pair.
                start_page, end_page = segment

            pdf_writer = PdfFileWriter()
            # 1-based inclusive range -> 0-based page indices.
            for page_num in range(start_page - 1, end_page):
                pdf_writer.addPage(pdf_reader.getPage(page_num))

            # with_stem() already keeps the parent directory, so it must not
            # be joined onto source.parent again: doing so duplicated the
            # directory for relative inputs such as "docs/long.pdf".
            output = source.with_stem(
                f'{source.stem}_pages_{start_page}-{end_page}'
            )
            with open(output, 'wb') as out:
                pdf_writer.write(out)
def __pdf_extract(pair):
    """Expand one ``(pdf, segments)`` pair into a ``pdf_extract`` call.

    Lets a single-argument mapper (such as ``executor.map``) drive
    ``pdf_extract``; returns whatever ``pdf_extract`` returns.
    """
    outcome = pdf_extract(*pair)
    return outcome
def pdf_extract_batch(pdfs, workers=20):
    """Run ``pdf_extract`` for several source PDFs on a thread pool.

    Args:
        pdfs: Mapping ``{pdf_name: [(1, 1), ...], ...}`` from a source PDF
            path to the segments to extract from it.
        workers: Maximum number of worker threads.

    Raises:
        Exception: Whatever an extraction raised; the first failure in
            submission order is re-raised here instead of being dropped.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        # executor.map() only re-raises a worker's exception when its result
        # is retrieved. The results were never read before, so failures
        # vanished silently; draining the iterator surfaces them.
        for _ in executor.map(__pdf_extract, pdfs.items()):
            pass
if __name__ == '__main__':
    freeze_support()

    # Demo input: one tuple-style and one dict-style segment, both of which
    # pdf_extract accepts.
    first_page_only = (1, 1)
    pages_three_to_five = {'start': 3, 'end': 5}

    pdf_name = r'C:\Users\maste\Documents\long.pdf'
    segments = [first_page_only, pages_three_to_five]
    pdfs = {pdf_name: segments}

    # Single
    pdf_extract(pdf_name, segments)

    # Batched (Concurrent)
    # pdf_extract_batch(pdfs)
我有一些源 PDF 文件,想编写代码从中提取部分页面并生成新的 pdf 文件。 我有一个看起来像这样的列表
information = [(filename1,startpage1,endpage1), (filename2, startpage2, endpage2), ...,(filename19,startpage19,endpage19)].
这是我的代码。
from PyPDF2 import PdfFileReader, PdfFileWriter
# Question's original attempt: for each entry of `information`, copy a page
# range from example.pdf into a new PDF file.
# NOTE(review): the indentation of this snippet was lost; the nesting below is
# reconstructed and should be confirmed against the original post.
reader = PdfFileReader("example.pdf")
# NOTE(review): the loop bound is the PDF's page count minus one, yet `page`
# is used as an index into `information` - confirm whether
# len(information) was intended.
for page in range(reader.getNumPages() - 1):
    writer = PdfFileWriter()  # fresh writer for each entry
    start = information[page][1]
    end = information[page][2]
    # `start < end` stops before `end`, so an entry whose start equals its end
    # adds no pages at all.
    # NOTE(review): `start` is passed to getPage() unshifted - confirm whether
    # the page numbers in `information` are 0-based or 1-based.
    while start < end:
        writer.addPage(reader.getPage(start))
        start += 1
    # The output name is built from the entry's filename, start and end page.
    output_filename = "{}_{}_page_{}.pdf".format(
        information[page][0], information[page][1], information[page][2]
    )
    with open(output_filename, "wb") as out:
        writer.write(out)
但是输出很奇怪..有些里面什么都没有,有些只有一页。我该如何纠正?
我已经解决了这个问题。问题出在比较运算符上:循环条件需要包含等号,即 start <= end(而不是 start < end)。
# Extract every (filename, start, end) entry of `information` from the already
# opened `pdfReader` and save each range as its own PDF inside `savepath`.

# Create the output directory once up front instead of re-checking it for
# every entry.
os.makedirs(savepath, exist_ok=True)

for entry in information:
    pdf_writer = PyPDF2.PdfFileWriter()
    start = entry[1]
    end = entry[2]
    # `end` is inclusive, hence `<=`; the `start - 1` index presumably converts
    # 1-based page numbers to the reader's 0-based indices.
    while start <= end:
        pdf_writer.addPage(pdfReader.getPage(start - 1))
        start += 1
    output_filename = '{}_{}_page_{}.pdf'.format(entry[0], entry[1], entry[2])
    # Write into `savepath`; previously the directory was created but the file
    # was still written to the current working directory.
    with open(os.path.join(savepath, output_filename), 'wb') as out:
        pdf_writer.write(out)
完整代码:我修改了 SSS 的答案,使其可移植、更灵活,并支持并发处理多个源 PDF。 我无法测试 ThreadPoolExecutor 和 ProcessPoolExecutor 之间的性能差异,但我假设提取过程的瓶颈在于 PDF 的读写(I/O),而不是 getPage 和 addPage 调用。
import concurrent.futures
from multiprocessing import freeze_support
from pathlib import Path
from PyPDF2 import PdfFileReader, PdfFileWriter
def pdf_extract(pdf, segments):
    """Write each requested page range of *pdf* to its own PDF file.

    Each output file is placed in the same directory as the source and is
    named ``<stem>_pages_<start>-<end><suffix>``.

    Args:
        pdf: Path to the source PDF (``str`` or ``Path``).
        segments: Iterable of page ranges, each either a ``(start, end)``
            pair or a mapping ``{'start': int, 'end': int}``. Page numbers
            are 1-based and ``end`` is inclusive.
    """
    source = Path(pdf)  # loop-invariant, so build it once
    with open(source, 'rb') as read_stream:
        pdf_reader = PdfFileReader(read_stream)
        for segment in segments:
            # support {'start': 3, 'end': 3} or (start, end)
            try:
                start_page, end_page = segment['start'], segment['end']
            except TypeError:
                # Tuples/lists reject string indices: treat it as a pair.
                start_page, end_page = segment

            pdf_writer = PdfFileWriter()
            # 1-based inclusive range -> 0-based page indices.
            for page_num in range(start_page - 1, end_page):
                pdf_writer.addPage(pdf_reader.getPage(page_num))

            # with_stem() already keeps the parent directory, so it must not
            # be joined onto source.parent again: doing so duplicated the
            # directory for relative inputs such as "docs/long.pdf".
            output = source.with_stem(
                f'{source.stem}_pages_{start_page}-{end_page}'
            )
            with open(output, 'wb') as out:
                pdf_writer.write(out)
def __pdf_extract(pair):
    """Expand one ``(pdf, segments)`` pair into a ``pdf_extract`` call.

    Lets a single-argument mapper (such as ``executor.map``) drive
    ``pdf_extract``; returns whatever ``pdf_extract`` returns.
    """
    outcome = pdf_extract(*pair)
    return outcome
def pdf_extract_batch(pdfs, workers=20):
    """Run ``pdf_extract`` for several source PDFs on a thread pool.

    Args:
        pdfs: Mapping ``{pdf_name: [(1, 1), ...], ...}`` from a source PDF
            path to the segments to extract from it.
        workers: Maximum number of worker threads.

    Raises:
        Exception: Whatever an extraction raised; the first failure in
            submission order is re-raised here instead of being dropped.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        # executor.map() only re-raises a worker's exception when its result
        # is retrieved. The results were never read before, so failures
        # vanished silently; draining the iterator surfaces them.
        for _ in executor.map(__pdf_extract, pdfs.items()):
            pass
if __name__ == '__main__':
    freeze_support()

    # Demo input: one tuple-style and one dict-style segment, both of which
    # pdf_extract accepts.
    first_page_only = (1, 1)
    pages_three_to_five = {'start': 3, 'end': 5}

    pdf_name = r'C:\Users\maste\Documents\long.pdf'
    segments = [first_page_only, pages_three_to_five]
    pdfs = {pdf_name: segments}

    # Single
    pdf_extract(pdf_name, segments)

    # Batched (Concurrent)
    # pdf_extract_batch(pdfs)