使用 PyMuPDF 加粗文本部分

Question

我正在尝试使用 PyMuPDF 将 PDF 文件中每个单词的部分加粗。

例如，对于包含字符串“There are many pies”的文件，处理后得到的仍是“There are many pies”，但其中每个单词的一部分字符会以粗体显示。

我看到您可以使用 Page.get_textpage().extractWORDS() 来提取各种单词的元组。但是，我不确定如何将其中的部分加粗。

我想也许你可以擦掉它们然后重写它们，但我不确定 PyMuPDF 是否可以擦除单词。

Answer 1

正如您所说，PyMuPDF 似乎不允许删除文本（does not allow the deleting of text）：

In a nutshell, this is what you can do with PyMuPDF:

Modify page rotation and the visible part (“cropbox”) of the page.

Insert images, other PDF pages, text and simple geometrical objects.

Add annotations and form fields.

因此，按照 @K_J 关于使用修订（redaction）的建议，我创建了这个脚本，它将：

检查页面文本 character-by-character
为每个单词开头的连续字符建立 PyMuPDF Character Dictionaries 列表；我把这些连续字符的列表称为 segments（段）
创建一个覆盖每个段的超级 Rect 以及字符字符串，并使用这两个创建一个带有粗体的修订注释，它基本上“放置在”原始字符上

效果并不完美，因为修订注释（redaction annotation）会尝试在垂直方向上自行居中（见 align 属性）；我在脚本中尝试对此进行了补偿。

Original	Highlighted

我敢打赌，如果在原始文本（也许仍先经过修订处理）之上创建新文本，效果会很完美，但我目前没有时间去尝试。

import fitz

from fitz import Document, Page
from fitz import Matrix, Point, Rect


# Keyword arguments passed to `insert_text` for the sample text ("helv" font, size 24).
Normal_style = {"fontname": "helv", "fontsize": 24}
# Keyword arguments passed to `add_redact_annot` for the replacement text ("hebo" font, same size).
Bold_style = {"fontname": "hebo", "fontsize": 24}

# One character entry of a rawdict span; see "Character Dictionary for extractRAWDICT()" in PyMuPDF docs.
RawDictChar = dict
# A run of character dictionaries taken from the start of one word.
CharSegment = list[RawDictChar]


def main():
    """
    Build a one-page sample document, bolden the start of every word, and render before/after images.

    The page is rendered to "page-orig.png" before the redactions are applied and to
    "page-edit.png" afterwards. The document itself is never saved.
    """
    doc: Document = fitz.open()
    page: Page = doc.new_page()

    # (y coordinate, text) for each sample line; every line starts at x = 50.
    sample_lines = (
        (72, "A number of words and things on line 1"),
        (144, "A number of words on line 2"),
        (216, "Line 3"),
    )
    for y, text in sample_lines:
        page.insert_text(Point(50, y), text, **Normal_style)

    page_to_image(page, "page-orig.png")

    segments = get_char_segments(page)
    apply_segment_redactions(page, segments)

    page_to_image(page, "page-edit.png")


def get_char_segments(page: Page, num_chars: int = 3) -> list[CharSegment]:
    """
    Break a page down into "character segments": the leading characters of each word.

    The page's "rawdict" text is walked block by block, line by line and span by span.
    Inside a span, characters are grouped into words delimited by a space (" "), and the
    first `num_chars` characters of each word form one segment. Words are collected per
    span, so a word that continues into the next span yields a separate segment there.

    Args:
        page: Page to analyse; only `page.get_text("rawdict")` is called on it.
        num_chars: Maximum number of leading characters kept per word.

    Returns:
        The segments in the order the rawdict yields them. Every returned segment is
        non-empty: leading, trailing or repeated spaces do not produce empty segments,
        so callers can safely index `segment[0]`.
    """
    char_segments: list[CharSegment] = []

    rawdict = page.get_text("rawdict")
    for block in rawdict["blocks"]:
        if block["type"] == 1:
            continue  # skip "image" block

        for line in block["lines"]:
            for span in line["spans"]:
                word_chars = []
                for char in span["chars"]:
                    if char["c"] != " ":
                        word_chars.append(char)
                        continue

                    # A space ends the current word; keep its head only if there is one.
                    segment = word_chars[:num_chars]
                    if segment:
                        char_segments.append(segment)
                    word_chars = []

                # The last word of a span is not followed by a space.
                segment = word_chars[:num_chars]
                if segment:
                    char_segments.append(segment)

    return char_segments


def apply_segment_redactions(page: Page, char_segments: list[CharSegment]):
    """
    Turn each character segment into a redaction annotation that re-draws the same characters in bold.

    For every segment, the union of its characters' bounding boxes becomes the redaction
    rectangle and the concatenated characters become the replacement text, styled with
    `Bold_style`. All redactions are then applied to the page in one go.

    Args:
        page: Page to modify in place.
        char_segments: Segments whose entries provide the "c" (character) and "bbox" keys.
            Empty segments are skipped instead of raising `IndexError`.
    """
    # Try to compensate for redaction text being vertically centered in its rectangle.
    M_shift_down = Matrix(1, 1).pretranslate(0, 2.5)

    for char_segment in char_segments:
        if not char_segment:
            continue  # nothing to replace (e.g. a segment produced by consecutive spaces)

        # Replacement/redaction text: the segment's characters in order.
        highlight_txt = "".join(cs["c"] for cs in char_segment)

        # "Super rect" of the redaction area: union of every character's bounding box.
        highlight_rect: Rect = Rect(*char_segment[0]["bbox"])
        for cs in char_segment[1:]:
            highlight_rect = highlight_rect | Rect(*cs["bbox"])

        highlight_rect.transform(M_shift_down)

        page.add_redact_annot(highlight_rect, text=highlight_txt, fill=False, **Bold_style)

    page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)


def page_to_image(page: Page, fname):
    """
    Render `page` at twice its size in both directions and save the image to `fname`.

    Used to visualize the page before and after the redactions are applied.
    """
    zoom = fitz.Matrix(2.0, 2.0)  # (horizontal, vertical) zoom factors
    page.get_pixmap(matrix=zoom).save(fname)


# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

使用 PyMuPDF 加粗文本部分

Use PyMuPDF to bold parts of text

python

pdf

highlight

pymupdf