将 pdf 从 A4 拆分为 A6 分区并且不要保存空分区
Split pdf from A4 into A6 quarters and don't save empty quarters
请不要太苛刻,我是自学的初学者)))
请帮我弄清楚如何拆分 PDF;下面是我借助 PyPDF2 和 PyMuPDF (fitz) 摸索出来的做法。拆分时经常出现只有一个四分之一区域有文本的情况,但程序会把全部 4 个区域都写入新文件——一个有文本,其余都是空的。我需要让空的区域不被保存。我尝试过用某种方式进行检查,但没有成功,知识不够。我也尝试过读取新生成的文件并删除空页,但读取结果显示每一页(包括看起来是空的页)都有文字;而在 Acrobat Reader 中打开文件时这些页面却是空白的,我不明白这是为什么。
这是我的代码,以防万一我是怎么做的:https://paste.aiogram.dev/opiquhehus.py
这是我第一次在这里发帖,我不知道如何附加文件。例如电报频道中的 pdf 文件:https://t.me/+Tq7WpP1ImcjQXSZF.
import copy
import logging
import random
from pathlib import Path
import PyPDF2
import fitz
from PyPDF2.filters import decodeStreamData, ASCII85Decode
from PyPDF2.generic import EncodedStreamObject, DecodedStreamObject
def from_a4_to_a6_not_sync(input_file, output_file):
    """Split every page of a PDF into four quarter pages using PyPDF2.

    Each source page is copied four times and the MediaBox of every copy is
    shrunk to one quadrant.  The quarters are written in the order top-left,
    top-right, bottom-left, bottom-right.  Blank quadrants are NOT filtered
    out by this function.

    Args:
        input_file: ``pathlib.Path`` of the source PDF.
        output_file: Path of the PDF to create; opened with mode ``'wb'``.
    """
    pdf_reader = PyPDF2.PdfFileReader(str(input_file.absolute()))
    pdf_writer = PyPDF2.PdfFileWriter()

    for page_index in range(pdf_reader.getNumPages()):
        source_page = pdf_reader.getPage(page_index)
        left, bottom = (float(v) for v in source_page.mediaBox.lowerLeft)
        right, top = (float(v) for v in source_page.mediaBox.upperRight)
        # Use the real centre of the MediaBox so that pages whose origin is
        # not (0, 0) are still cut exactly in the middle.
        centre = ((left + right) / 2, (bottom + top) / 2)

        # Moving one corner of the MediaBox onto the centre keeps the
        # opposite quadrant (the PDF y axis points upwards):
        #   lowerRight -> top-left      lowerLeft -> top-right
        #   upperRight -> bottom-left   upperLeft -> bottom-right
        for corner in ("lowerRight", "lowerLeft", "upperRight", "upperLeft"):
            quarter = copy.deepcopy(source_page)
            setattr(quarter.mediaBox, corner, centre)
            pdf_writer.addPage(quarter)

    # The ``with`` block closes the file; no explicit close() is needed.
    with open(output_file, 'wb') as file:
        pdf_writer.write(file)
def fitz_four_piaces(input_file, output_file):
    """Cut every page of a PDF into a 2 x 2 grid and save all quarters.

    The quarters of each page are appended to the output in the order
    top-left, top-right, bottom-left, bottom-right.  Quarters without any
    content are written as well.

    Args:
        input_file: ``pathlib.Path`` of the source PDF.
        output_file: Path of the PDF to create.
    """
    # The context managers close both documents even if rendering or saving
    # raises.
    with fitz.open(str(input_file.absolute())) as src, fitz.open() as doc:
        for spage in src:
            r = spage.rect  # input page rectangle
            # CropBox displacement, needed when the CropBox does not start
            # at (0, 0).
            d = fitz.Rect(spage.cropbox_position, spage.cropbox_position)

            r1 = r / 2                                # top-left quarter
            r2 = r1 + (r1.width, 0, r1.width, 0)      # top-right quarter
            r3 = r1 + (0, r1.height, 0, r1.height)    # bottom-left quarter
            r4 = fitz.Rect(r1.br, r.br)               # bottom-right quarter

            for rx in (r1, r2, r3, r4):
                clip = rx + d
                # New output page with the dimensions of the quarter ...
                page = doc.new_page(-1, width=clip.width, height=clip.height)
                # ... filled completely with that part of the input page.
                page.show_pdf_page(page.rect, src, spage.number, clip=clip)

        doc.save(
            output_file,
            garbage=3,     # eliminate duplicate objects
            deflate=True,  # compress stuff where possible
        )
def fitz_four_piaces_read(input_file):
    """Print the page count and the plain text of every page of a PDF.

    Args:
        input_file: ``pathlib.Path`` of the PDF to inspect.
    """
    # The context manager closes the document once the dump is finished.
    with fitz.open(str(input_file.absolute())) as src:
        print(f'{src.page_count=}')
        for page in src:
            print(f'{page.get_text("text")=}')
# Demo run: split MAKETS/up_lef.pdf into quarters, then dump the text of the
# resulting file.  The output name carries two random numbers.
destination = Path() / "MAKETS"
destination.mkdir(parents=True, exist_ok=True)
destination_input = destination / 'up_lef.pdf'  # up_lef_up_rig_low_lef_low_rig
destination_output = destination / (
    f'output_a6_{random.randint(1, 100)}_{random.randint(1, 200)}.pdf'
)
# The PyPDF2 variant can be tried instead:
# from_a4_to_a6_not_sync(destination_input, destination_output)
fitz_four_piaces(destination_input, destination_output)
fitz_four_piaces_read(destination_output)
已找到解决方案!把页面分成 4 部分后,需要将得到的每个页面渲染成图片,再把图片文件的大小与空白页图片的大小进行比较。我把代码分享出来,也许对别人有用)
import os
import fitz
def get_size(filename):
    """Return the size in bytes of the file at ``filename``."""
    return os.path.getsize(filename)
async def from_a4_to_a6(input_file, output_file, blank_png_size=1300):
    """Split every page into four quarters and drop the blank ones.

    Each quarter is placed on a new output page and rendered to a PNG.  When
    the PNG is smaller than ``blank_png_size`` bytes the quarter is treated
    as blank and its page is removed again.

    The function is declared ``async`` but contains no ``await``; all the
    work runs synchronously inside the call.

    Args:
        input_file: ``pathlib.Path`` of the source PDF.
        output_file: Path of the PDF to create.
        blank_png_size: PNG size in bytes below which a quarter counts as
            blank.  The default of 1300 comes from the original author's
            measurement of about 1209 bytes for a blank A6 page; check the
            value against your own documents first.
    """
    # The context managers close both documents even if rendering or saving
    # raises.
    with fitz.open(str(input_file.absolute())) as src, fitz.open() as doc:
        for spage in src:
            r = spage.rect  # input page rectangle
            # CropBox displacement, needed when the CropBox does not start
            # at (0, 0).
            d = fitz.Rect(spage.cropbox_position, spage.cropbox_position)

            r1 = r / 2                                # top-left quarter
            r2 = r1 + (r1.width, 0, r1.width, 0)      # top-right quarter
            r3 = r1 + (0, r1.height, 0, r1.height)    # bottom-left quarter
            r4 = fitz.Rect(r1.br, r.br)               # bottom-right quarter

            for rx in (r1, r2, r3, r4):
                clip = rx + d
                page = doc.new_page(-1, width=clip.width, height=clip.height)
                page.show_pdf_page(page.rect, src, spage.number, clip=clip)

                # Encode the rendered page as PNG in memory instead of
                # writing a temporary file into the working directory.  The
                # byte count is expected to match the size of the file that
                # pix.save() would have produced.
                png_size = len(page.get_pixmap().tobytes("png"))
                if png_size < blank_png_size:
                    # Remove only this quarter and keep looping: leaving the
                    # loop here would throw away later quarters of the same
                    # page even when they have content.
                    doc.delete_page(pno=-1)

        # NOTE(review): if every quarter was dropped the document has no
        # pages; PyMuPDF is expected to refuse saving it - confirm and
        # handle in the caller if that can happen.
        doc.save(
            output_file,
            garbage=4,     # eliminate duplicate objects
            clean=True,
            deflate=True,  # compress stuff where possible
        )
请不要太苛刻,我是自学的初学者)))
请帮我弄清楚如何拆分 PDF;下面是我借助 PyPDF2 和 PyMuPDF (fitz) 摸索出来的做法。拆分时经常出现只有一个四分之一区域有文本的情况,但程序会把全部 4 个区域都写入新文件——一个有文本,其余都是空的。我需要让空的区域不被保存。我尝试过用某种方式进行检查,但没有成功,知识不够。我也尝试过读取新生成的文件并删除空页,但读取结果显示每一页(包括看起来是空的页)都有文字;而在 Acrobat Reader 中打开文件时这些页面却是空白的,我不明白这是为什么。
这是我的代码,以防万一我是怎么做的:https://paste.aiogram.dev/opiquhehus.py
这是我第一次在这里发帖,我不知道如何附加文件。例如电报频道中的 pdf 文件:https://t.me/+Tq7WpP1ImcjQXSZF.
import copy
import logging
import random
from pathlib import Path
import PyPDF2
import fitz
from PyPDF2.filters import decodeStreamData, ASCII85Decode
from PyPDF2.generic import EncodedStreamObject, DecodedStreamObject
def from_a4_to_a6_not_sync(input_file, output_file):
    """Split every page of a PDF into four quarter pages using PyPDF2.

    Each source page is copied four times and the MediaBox of every copy is
    shrunk to one quadrant.  The quarters are written in the order top-left,
    top-right, bottom-left, bottom-right.  Blank quadrants are NOT filtered
    out by this function.

    Args:
        input_file: ``pathlib.Path`` of the source PDF.
        output_file: Path of the PDF to create; opened with mode ``'wb'``.
    """
    pdf_reader = PyPDF2.PdfFileReader(str(input_file.absolute()))
    pdf_writer = PyPDF2.PdfFileWriter()

    for page_index in range(pdf_reader.getNumPages()):
        source_page = pdf_reader.getPage(page_index)
        left, bottom = (float(v) for v in source_page.mediaBox.lowerLeft)
        right, top = (float(v) for v in source_page.mediaBox.upperRight)
        # Use the real centre of the MediaBox so that pages whose origin is
        # not (0, 0) are still cut exactly in the middle.
        centre = ((left + right) / 2, (bottom + top) / 2)

        # Moving one corner of the MediaBox onto the centre keeps the
        # opposite quadrant (the PDF y axis points upwards):
        #   lowerRight -> top-left      lowerLeft -> top-right
        #   upperRight -> bottom-left   upperLeft -> bottom-right
        for corner in ("lowerRight", "lowerLeft", "upperRight", "upperLeft"):
            quarter = copy.deepcopy(source_page)
            setattr(quarter.mediaBox, corner, centre)
            pdf_writer.addPage(quarter)

    # The ``with`` block closes the file; no explicit close() is needed.
    with open(output_file, 'wb') as file:
        pdf_writer.write(file)
def fitz_four_piaces(input_file, output_file):
    """Cut every page of a PDF into a 2 x 2 grid and save all quarters.

    The quarters of each page are appended to the output in the order
    top-left, top-right, bottom-left, bottom-right.  Quarters without any
    content are written as well.

    Args:
        input_file: ``pathlib.Path`` of the source PDF.
        output_file: Path of the PDF to create.
    """
    # The context managers close both documents even if rendering or saving
    # raises.
    with fitz.open(str(input_file.absolute())) as src, fitz.open() as doc:
        for spage in src:
            r = spage.rect  # input page rectangle
            # CropBox displacement, needed when the CropBox does not start
            # at (0, 0).
            d = fitz.Rect(spage.cropbox_position, spage.cropbox_position)

            r1 = r / 2                                # top-left quarter
            r2 = r1 + (r1.width, 0, r1.width, 0)      # top-right quarter
            r3 = r1 + (0, r1.height, 0, r1.height)    # bottom-left quarter
            r4 = fitz.Rect(r1.br, r.br)               # bottom-right quarter

            for rx in (r1, r2, r3, r4):
                clip = rx + d
                # New output page with the dimensions of the quarter ...
                page = doc.new_page(-1, width=clip.width, height=clip.height)
                # ... filled completely with that part of the input page.
                page.show_pdf_page(page.rect, src, spage.number, clip=clip)

        doc.save(
            output_file,
            garbage=3,     # eliminate duplicate objects
            deflate=True,  # compress stuff where possible
        )
def fitz_four_piaces_read(input_file):
    """Print the page count and the plain text of every page of a PDF.

    Args:
        input_file: ``pathlib.Path`` of the PDF to inspect.
    """
    # The context manager closes the document once the dump is finished.
    with fitz.open(str(input_file.absolute())) as src:
        print(f'{src.page_count=}')
        for page in src:
            print(f'{page.get_text("text")=}')
# Demo run: split MAKETS/up_lef.pdf into quarters, then dump the text of the
# resulting file.  The output name carries two random numbers.
destination = Path() / "MAKETS"
destination.mkdir(parents=True, exist_ok=True)
destination_input = destination / 'up_lef.pdf'  # up_lef_up_rig_low_lef_low_rig
destination_output = destination / (
    f'output_a6_{random.randint(1, 100)}_{random.randint(1, 200)}.pdf'
)
# The PyPDF2 variant can be tried instead:
# from_a4_to_a6_not_sync(destination_input, destination_output)
fitz_four_piaces(destination_input, destination_output)
fitz_four_piaces_read(destination_output)
已找到解决方案!把页面分成 4 部分后,需要将得到的每个页面渲染成图片,再把图片文件的大小与空白页图片的大小进行比较。我把代码分享出来,也许对别人有用)
import os
import fitz
def get_size(filename):
    """Return the size in bytes of the file at ``filename``."""
    return os.path.getsize(filename)
async def from_a4_to_a6(input_file, output_file, blank_png_size=1300):
    """Split every page into four quarters and drop the blank ones.

    Each quarter is placed on a new output page and rendered to a PNG.  When
    the PNG is smaller than ``blank_png_size`` bytes the quarter is treated
    as blank and its page is removed again.

    The function is declared ``async`` but contains no ``await``; all the
    work runs synchronously inside the call.

    Args:
        input_file: ``pathlib.Path`` of the source PDF.
        output_file: Path of the PDF to create.
        blank_png_size: PNG size in bytes below which a quarter counts as
            blank.  The default of 1300 comes from the original author's
            measurement of about 1209 bytes for a blank A6 page; check the
            value against your own documents first.
    """
    # The context managers close both documents even if rendering or saving
    # raises.
    with fitz.open(str(input_file.absolute())) as src, fitz.open() as doc:
        for spage in src:
            r = spage.rect  # input page rectangle
            # CropBox displacement, needed when the CropBox does not start
            # at (0, 0).
            d = fitz.Rect(spage.cropbox_position, spage.cropbox_position)

            r1 = r / 2                                # top-left quarter
            r2 = r1 + (r1.width, 0, r1.width, 0)      # top-right quarter
            r3 = r1 + (0, r1.height, 0, r1.height)    # bottom-left quarter
            r4 = fitz.Rect(r1.br, r.br)               # bottom-right quarter

            for rx in (r1, r2, r3, r4):
                clip = rx + d
                page = doc.new_page(-1, width=clip.width, height=clip.height)
                page.show_pdf_page(page.rect, src, spage.number, clip=clip)

                # Encode the rendered page as PNG in memory instead of
                # writing a temporary file into the working directory.  The
                # byte count is expected to match the size of the file that
                # pix.save() would have produced.
                png_size = len(page.get_pixmap().tobytes("png"))
                if png_size < blank_png_size:
                    # Remove only this quarter and keep looping: leaving the
                    # loop here would throw away later quarters of the same
                    # page even when they have content.
                    doc.delete_page(pno=-1)

        # NOTE(review): if every quarter was dropped the document has no
        # pages; PyMuPDF is expected to refuse saving it - confirm and
        # handle in the caller if that can happen.
        doc.save(
            output_file,
            garbage=4,     # eliminate duplicate objects
            clean=True,
            deflate=True,  # compress stuff where possible
        )