使用 PyMuPDF 加粗文本部分
Use PyMuPDF to bold parts of text
我正在尝试使用 PyMuPDF 将 PDF 文件中每个单词的部分加粗。
因此,例如,包含字符串“There are many pies”的文件处理后将得到“There are many pies”,其中每个单词的一部分以粗体显示。
我看到可以使用 Page.get_textpage().extractWORDS() 来提取各个单词的元组。但是,我不确定如何将单词中的一部分加粗。
我想也许你可以擦掉它们然后重写它们,但我不确定 PyMuPDF 是否可以擦除单词。
如您所说,看起来 PyMuPDF 确实不允许删除文本(does not allow the deleting of text):
In a nutshell, this is what you can do with PyMuPDF:
- Modify page rotation and the visible part (“cropbox”) of the page.
- Insert images, other PDF pages, text and simple geometrical objects.
- Add annotations and form fields.
因此,按照 @K_J 关于使用修订(redaction)的建议,我创建了这个脚本,它将:
- 检查页面文本 character-by-character
- 为每个单词开头的若干字符建立连续的 PyMuPDF 字符字典(Character Dictionary)列表;我把这些连续字符列表称为“段”(segments)
- 为每个段创建一个覆盖其全部字符的“超级”Rect 以及对应的字符串,并用这两者创建一个使用粗体字体的修订注释,该注释基本上“覆盖在”原始字符之上
效果并不完美,因为修订注释会尝试将自身垂直居中(参见 align 属性);我尝试对此做了补偿。
Original
Highlighted
我敢打赌,如果在原始文本(可能仍需先做修订处理)之上创建新文本,效果会更完美,但我目前已经没有时间去尝试了。
import fitz
from fitz import Document, Page
from fitz import Matrix, Point, Rect
# Keyword arguments passed to the PyMuPDF text calls in this script: the
# sample text is written with Normal_style, the replacement text with Bold_style.
Normal_style = {"fontname": "helv", "fontsize": 24}
Bold_style = {"fontname": "hebo", "fontsize": 24}

# One entry of a span's "chars" list; see "Character Dictionary for
# extractRAWDICT()" in the PyMuPDF docs.
RawDictChar = dict
# A run of consecutive character dictionaries taken from the start of a word.
CharSegment = list[RawDictChar]
def main():
    """Build a three-line sample page, bold the start of each word, and save before/after images.

    Writes ``page-orig.png`` (untouched page) and ``page-edit.png`` (after the
    redactions were applied) to the current working directory.
    """
    # (baseline y, text) for each sample line; all lines start at x = 50.
    sample_lines = (
        (72, "A number of words and things on line 1"),
        (144, "A number of words on line 2"),
        (216, "Line 3"),
    )

    # The context manager closes the in-memory document even if a step fails.
    with fitz.open() as doc:
        page: Page = doc.new_page()
        for baseline_y, text in sample_lines:
            page.insert_text(Point(50, baseline_y), text, **Normal_style)

        page_to_image(page, "page-orig.png")

        char_segments = get_char_segments(page)
        apply_segment_redactions(page, char_segments)

        page_to_image(page, "page-edit.png")
def get_char_segments(page: Page, num_chars: int = 3) -> list[CharSegment]:
    """Collect the leading characters of every word on ``page``.

    The page's ``rawdict`` text is walked block by block, line by line and
    span by span. Within a span, characters are grouped into words delimited
    by a space character, and the first ``num_chars`` character dictionaries
    of each word form one "segment". Words are not joined across spans.

    Args:
        page: Page whose text is read via ``page.get_text("rawdict")``.
        num_chars: Maximum number of leading characters kept per word.

    Returns:
        A list of non-empty segments (lists of rawdict character
        dictionaries) in the order they appear in the rawdict structure.
    """
    char_segments: list[CharSegment] = []

    def keep(word_chars):
        # Leading, trailing or repeated spaces produce empty words; dropping
        # them keeps consumers from indexing into an empty segment.
        segment = word_chars[:num_chars]
        if segment:
            char_segments.append(segment)

    rawdict = page.get_text("rawdict")
    for block in rawdict["blocks"]:
        if block["type"] == 1:
            continue  # skip "image" block

        for line in block["lines"]:
            for span in line["spans"]:
                word_chars = []
                for char in span["chars"]:
                    # A space ends the current word.
                    if char["c"] == " ":
                        keep(word_chars)
                        word_chars = []
                        continue

                    word_chars.append(char)

                # Flush the word that runs up to the end of the span.
                keep(word_chars)

    return char_segments
def apply_segment_redactions(page: Page, char_segments: list[CharSegment]):
    """Replace each character segment with the same characters in the bold style.

    For every segment a redaction annotation is added whose rectangle is the
    union of the segment's character bounding boxes and whose replacement text
    is the segment's characters, styled with ``Bold_style``. All redactions are
    then applied to the page in one call.

    Args:
        page: Page to modify in place.
        char_segments: Segments of rawdict character dictionaries; each
            character must provide the ``"c"`` and ``"bbox"`` keys. Empty
            segments are ignored.
    """
    # Try to compensate for redaction text being vertically centered by
    # moving each rectangle down by 2.5 units.
    M_shift_down = Matrix(1, 1).pretranslate(0, 2.5)

    for char_segment in char_segments:
        if not char_segment:
            # Nothing to rewrite; indexing [0] below would raise IndexError.
            continue

        # Replacement text: the segment's characters in order.
        highlight_txt = "".join(cs["c"] for cs in char_segment)

        # "Super rect": union of the bounding boxes of all chars in the segment.
        highlight_rect: Rect = Rect(*char_segment[0]["bbox"])
        for cs in char_segment[1:]:
            highlight_rect = highlight_rect | Rect(*cs["bbox"])

        highlight_rect.transform(M_shift_down)

        page.add_redact_annot(highlight_rect, text=highlight_txt, fill=False, **Bold_style)

    page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)
def page_to_image(page: Page, fname, zoom: float = 2.0):
    """Render ``page`` to an image file to visualize the original or edited page.

    Args:
        page: Page to render.
        fname: Output file name passed to the pixmap's ``save``; the callers
            in this script use ``.png`` names.
        zoom: Scale factor applied to both the horizontal and vertical axis.
            Defaults to 2.0, the value that used to be hard-coded.
    """
    # Render with a scaling matrix instead of the default identity matrix so
    # the image has more pixels than the page has units.
    zoom_matrix = fitz.Matrix(zoom, zoom)
    pix = page.get_pixmap(matrix=zoom_matrix)
    pix.save(fname)
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
我正在尝试使用 PyMuPDF 将 PDF 文件中每个单词的部分加粗。
因此,例如,包含字符串“There are many pies”的文件处理后将得到“There are many pies”,其中每个单词的一部分以粗体显示。
我看到可以使用 Page.get_textpage().extractWORDS() 来提取各个单词的元组。但是,我不确定如何将单词中的一部分加粗。
我想也许你可以擦掉它们然后重写它们,但我不确定 PyMuPDF 是否可以擦除单词。
如您所说,看起来 PyMuPDF 确实不允许删除文本(does not allow the deleting of text):
In a nutshell, this is what you can do with PyMuPDF:
- Modify page rotation and the visible part (“cropbox”) of the page.
- Insert images, other PDF pages, text and simple geometrical objects.
- Add annotations and form fields.
因此,按照 @K_J 关于使用修订(redaction)的建议,我创建了这个脚本,它将:
- 检查页面文本 character-by-character
- 为每个单词开头的若干字符建立连续的 PyMuPDF 字符字典(Character Dictionary)列表;我把这些连续字符列表称为“段”(segments)
- 为每个段创建一个覆盖其全部字符的“超级”Rect 以及对应的字符串,并用这两者创建一个使用粗体字体的修订注释,该注释基本上“覆盖在”原始字符之上
效果并不完美,因为修订注释会尝试将自身垂直居中(参见 align 属性);我尝试对此做了补偿。
Original | Highlighted |
---|---|
我敢打赌,如果在原始文本(可能仍需先做修订处理)之上创建新文本,效果会更完美,但我目前已经没有时间去尝试了。
import fitz
from fitz import Document, Page
from fitz import Matrix, Point, Rect
# Keyword arguments passed to the PyMuPDF text calls in this script: the
# sample text is written with Normal_style, the replacement text with Bold_style.
Normal_style = {"fontname": "helv", "fontsize": 24}
Bold_style = {"fontname": "hebo", "fontsize": 24}

# One entry of a span's "chars" list; see "Character Dictionary for
# extractRAWDICT()" in the PyMuPDF docs.
RawDictChar = dict
# A run of consecutive character dictionaries taken from the start of a word.
CharSegment = list[RawDictChar]
def main():
    """Build a three-line sample page, bold the start of each word, and save before/after images.

    Writes ``page-orig.png`` (untouched page) and ``page-edit.png`` (after the
    redactions were applied) to the current working directory.
    """
    # (baseline y, text) for each sample line; all lines start at x = 50.
    sample_lines = (
        (72, "A number of words and things on line 1"),
        (144, "A number of words on line 2"),
        (216, "Line 3"),
    )

    # The context manager closes the in-memory document even if a step fails.
    with fitz.open() as doc:
        page: Page = doc.new_page()
        for baseline_y, text in sample_lines:
            page.insert_text(Point(50, baseline_y), text, **Normal_style)

        page_to_image(page, "page-orig.png")

        char_segments = get_char_segments(page)
        apply_segment_redactions(page, char_segments)

        page_to_image(page, "page-edit.png")
def get_char_segments(page: Page, num_chars: int = 3) -> list[CharSegment]:
    """Collect the leading characters of every word on ``page``.

    The page's ``rawdict`` text is walked block by block, line by line and
    span by span. Within a span, characters are grouped into words delimited
    by a space character, and the first ``num_chars`` character dictionaries
    of each word form one "segment". Words are not joined across spans.

    Args:
        page: Page whose text is read via ``page.get_text("rawdict")``.
        num_chars: Maximum number of leading characters kept per word.

    Returns:
        A list of non-empty segments (lists of rawdict character
        dictionaries) in the order they appear in the rawdict structure.
    """
    char_segments: list[CharSegment] = []

    def keep(word_chars):
        # Leading, trailing or repeated spaces produce empty words; dropping
        # them keeps consumers from indexing into an empty segment.
        segment = word_chars[:num_chars]
        if segment:
            char_segments.append(segment)

    rawdict = page.get_text("rawdict")
    for block in rawdict["blocks"]:
        if block["type"] == 1:
            continue  # skip "image" block

        for line in block["lines"]:
            for span in line["spans"]:
                word_chars = []
                for char in span["chars"]:
                    # A space ends the current word.
                    if char["c"] == " ":
                        keep(word_chars)
                        word_chars = []
                        continue

                    word_chars.append(char)

                # Flush the word that runs up to the end of the span.
                keep(word_chars)

    return char_segments
def apply_segment_redactions(page: Page, char_segments: list[CharSegment]):
    """Replace each character segment with the same characters in the bold style.

    For every segment a redaction annotation is added whose rectangle is the
    union of the segment's character bounding boxes and whose replacement text
    is the segment's characters, styled with ``Bold_style``. All redactions are
    then applied to the page in one call.

    Args:
        page: Page to modify in place.
        char_segments: Segments of rawdict character dictionaries; each
            character must provide the ``"c"`` and ``"bbox"`` keys. Empty
            segments are ignored.
    """
    # Try to compensate for redaction text being vertically centered by
    # moving each rectangle down by 2.5 units.
    M_shift_down = Matrix(1, 1).pretranslate(0, 2.5)

    for char_segment in char_segments:
        if not char_segment:
            # Nothing to rewrite; indexing [0] below would raise IndexError.
            continue

        # Replacement text: the segment's characters in order.
        highlight_txt = "".join(cs["c"] for cs in char_segment)

        # "Super rect": union of the bounding boxes of all chars in the segment.
        highlight_rect: Rect = Rect(*char_segment[0]["bbox"])
        for cs in char_segment[1:]:
            highlight_rect = highlight_rect | Rect(*cs["bbox"])

        highlight_rect.transform(M_shift_down)

        page.add_redact_annot(highlight_rect, text=highlight_txt, fill=False, **Bold_style)

    page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)
def page_to_image(page: Page, fname, zoom: float = 2.0):
    """Render ``page`` to an image file to visualize the original or edited page.

    Args:
        page: Page to render.
        fname: Output file name passed to the pixmap's ``save``; the callers
            in this script use ``.png`` names.
        zoom: Scale factor applied to both the horizontal and vertical axis.
            Defaults to 2.0, the value that used to be hard-coded.
    """
    # Render with a scaling matrix instead of the default identity matrix so
    # the image has more pixels than the page has units.
    zoom_matrix = fitz.Matrix(zoom, zoom)
    pix = page.get_pixmap(matrix=zoom_matrix)
    pix.save(fname)
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()