使用 PyMuPDF 加粗文本部分

Use PyMuPDF to bold parts of text

我正在尝试使用 PyMuPDF 将 PDF 文件中每个单词的部分加粗。

因此,例如,包含字符串“There are many pies”的文件,处理后仍显示“There are many pies”,但其中每个单词的一部分为粗体。

我看到您可以使用 Page.get_textpage().extractWORDS() 来提取各种单词的元组。但是,我不确定如何将其中的部分加粗。

我想也许你可以擦掉它们然后重写它们,但我不确定 PyMuPDF 是否可以擦除单词。

看起来确实如您所说,PyMuPDF 不支持删除文本(does not allow the deleting of text):

In a nutshell, this is what you can do with PyMuPDF:

  • Modify page rotation and the visible part (“cropbox”) of the page.
  • Insert images, other PDF pages, text and simple geometrical objects.
  • Add annotations and form fields.

因此,按照 @K_J 关于使用修订(redaction)的建议,我编写了这个脚本,它会:

  1. 检查页面文本 character-by-character
  2. 从每个单词的开头起,构建由 PyMuPDF 字符字典(Character Dictionary)组成的连续列表;我把这些连续的字符列表称为“段”(segment)
  3. 为每个段创建一个覆盖其全部字符的“超级 Rect”以及对应的字符串,并用这两者创建一个使用粗体字体的修订注释(redaction annotation),其效果基本上是把粗体字符“放置在”原始字符之上

效果并不完美,因为修订注释会尝试将文本垂直居中(tries to center itself vertically,参见 align 属性);我在代码中尝试对此进行了补偿。

Original Highlighted

我敢打赌,如果在原始文本(可能仍需先做修订)之上直接创建新文本,效果会更完整,但我目前已经没有时间去尝试了。

import fitz

from fitz import Document, Page
from fitz import Matrix, Point, Rect


# Keyword arguments (PyMuPDF font code + size) shared by the drawing calls below:
# the regular face for the sample text, the bold face for the overlaid word starts.
Normal_style = {"fontname": "helv", "fontsize": 24}
Bold_style = {"fontname": "hebo", "fontsize": 24}

# One entry of span["chars"]; see "Character Dictionary for extractRAWDICT()" in PyMuPDF docs
RawDictChar = dict
# The leading characters of one word, in the order they were extracted
CharSegment = list[RawDictChar]


def main():
    """Build a three-line demo page, bold the start of every word, and save before/after images."""
    document: Document = fitz.open()
    page: Page = document.new_page()

    # (y, text) of each sample line; every line starts at x = 50
    sample_lines = (
        (72, "A number of words and things on line 1"),
        (144, "A number of words on line 2"),
        (216, "Line 3"),
    )
    for y, text in sample_lines:
        page.insert_text(Point(50, y), text, **Normal_style)

    page_to_image(page, "page-orig.png")  # snapshot before any redaction is applied

    apply_segment_redactions(page, get_char_segments(page))

    page_to_image(page, "page-edit.png")  # snapshot with the bold overlays in place


def get_char_segments(page: Page, num_chars: int = 3) -> list[CharSegment]:
    """
    Break a page down into groups ("segments") of individual characters and return the list of these "character segments".

    Each character segment holds at most `num_chars` character dictionaries taken from the start of a word.
    Words are delimited by a space and never continue across a span boundary.

    Every returned segment is non-empty: leading spaces, runs of consecutive spaces and a non-positive
    `num_chars` produce no segment at all, so callers may safely index `segment[0]`.

    :param page: page whose ``get_text("rawdict")`` result is walked block by block.
    :param num_chars: maximum number of leading characters kept per word; values <= 0 keep nothing.
    :return: the character segments, in the order the words occur in the raw dictionary.
    """
    limit = max(num_chars, 0)  # a negative slice bound would drop trailing chars instead of limiting the head
    char_segments: list[CharSegment] = []

    def keep_word_start(word_chars):
        """Store the leading characters of a finished word, ignoring empty words."""
        segment = word_chars[:limit]
        if segment:
            char_segments.append(segment)

    rawdict = page.get_text("rawdict")
    for block in rawdict["blocks"]:
        if block["type"] == 1:
            continue  # skip "image" block

        for line in block["lines"]:
            for span in line["spans"]:
                word_chars = []
                for char in span["chars"]:
                    if char["c"] == " ":
                        # A space ends the current word (which may be empty after consecutive spaces)
                        keep_word_start(word_chars)
                        word_chars = []
                    else:
                        word_chars.append(char)

                # The last word of a span is not followed by a space
                keep_word_start(word_chars)

    return char_segments


def apply_segment_redactions(page: Page, char_segments: list[CharSegment]):
    """
    Turn each character segment into a redaction annotation that re-draws the same characters in a bold font.

    For every non-empty segment, the bounding boxes of its characters are united into one rectangle, the
    characters are joined into the replacement text, and a redaction annotation using `Bold_style` is added
    for that rectangle.  All redactions are applied in one go at the end.

    :param page: page that receives the redaction annotations and is modified in place.
    :param char_segments: lists of character dictionaries (each with "c" and "bbox"); empty lists are skipped.
    """
    # Try to compensate for redaction text being vertically centered: shift each rectangle down by 2.5
    M_shift_down = Matrix(1, 1).pretranslate(0, 2.5)

    for char_segment in char_segments:
        if not char_segment:
            continue  # nothing to overlay; indexing [0] below would raise IndexError

        # Replacement/redaction text: the segment's characters in order
        highlight_txt = "".join(cs["c"] for cs in char_segment)

        # Build up "super rect" of redaction area through rectangle unions of each char in segment
        highlight_rect: Rect = Rect(*char_segment[0]["bbox"])
        for cs in char_segment[1:]:
            highlight_rect = highlight_rect | Rect(*cs["bbox"])

        highlight_rect.transform(M_shift_down)

        page.add_redact_annot(highlight_rect, text=highlight_txt, fill=False, **Bold_style)

    page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)


def page_to_image(page: Page, fname):
    """Helper to visualize a page: render it at double size and save the picture to `fname`."""
    scale = 2.0  # zoom factor, used for both the horizontal and the vertical direction
    zoom = fitz.Matrix(scale, scale)

    # Render with the zoom matrix and write the result straight to disk
    page.get_pixmap(matrix=zoom).save(fname)


# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()