如何根据注释(PdfTextMarkupAnnotation)创建一个矩形,以提取高亮标记的内容?

How can I create a Rectangle from an annotation (PdfTextMarkupAnnotation) in order to extract the highlighted content?

我想获取我在 PDF 文件中标记(高亮)的文本。我遍历每个 PdfPage 上的 PdfAnnotation。注释有一个 getRectangle() 方法,它返回一个 PdfArray。但我无法根据这个 PdfArray 创建出一个 Rectangle 对象(实例),使其位置正确并恰好覆盖注释所标记的文本。

我想使用从注释得到的 Rectangle,通过 LocationTextExtractionStrategy 过滤出被标记的内容。

我写了下面的代码来用 iText 获取它:

package biz.hochguertel;

import com.itextpdf.kernel.color.DeviceCmyk;
import com.itextpdf.kernel.events.Event;
import com.itextpdf.kernel.events.IEventHandler;
import com.itextpdf.kernel.events.PdfDocumentEvent;
import com.itextpdf.kernel.font.PdfFont;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.pdf.*;
import com.itextpdf.kernel.pdf.annot.PdfAnnotation;
import com.itextpdf.kernel.pdf.annot.PdfTextMarkupAnnotation;
import com.itextpdf.kernel.pdf.canvas.PdfCanvas;
import com.itextpdf.kernel.pdf.canvas.parser.EventType;
import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor;
import com.itextpdf.kernel.pdf.canvas.parser.data.IEventData;
import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo;
import com.itextpdf.kernel.pdf.canvas.parser.filter.TextRegionEventFilter;
import com.itextpdf.kernel.pdf.canvas.parser.listener.FilteredEventListener;
import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;

import java.io.File;
import java.io.IOException;
import java.util.List;

/**
 * Copies every page of a source PDF into a new document and, for each text-markup
 * annotation that has contents, tries to extract the text located under the annotation.
 *
 * <p>NOTE(review): this is the version that exhibits the problem. The rectangle built in
 * {@link #process()} treats the last two entries of the annotation's rectangle array as a
 * width and a height; see the note at that line.
 */
public class AppIText {

    // Source PDF, looked up on the classpath. Evaluated when the instance is created.
    private String filePath = getClass().getClassLoader().getResource("itext/OCA/549_OCA_Java_SE_7_Programmer_I_Certification.pdf").getFile();
    // Output file that receives the copied pages (and the debug rectangles).
    private static String DEST = "demo-output/549_OCA_Java_SE_7_Programmer_I_Certification.pdf";
    // Document opened for reading from filePath.
    private PdfDocument pdfDocument;
    // Document opened for writing to DEST.
    private PdfDocument pdfWriteDoc;

    /**
     * Prepares the output location and opens both documents.
     *
     * <p>Creates the parent directory of {@code DEST} (one level only, via {@code mkdir()}),
     * deletes a previous output file if present, then opens the reader and writer documents.
     *
     * @throws IOException propagated from opening the reader or the writer
     */
    public void before() throws IOException {
        File file = new File(DEST);
        file.getParentFile().mkdir();
        if (file.exists()) {
            file.delete();
        }
        pdfDocument = new PdfDocument(new PdfReader(filePath));
        pdfWriteDoc = new PdfDocument(new PdfWriter(DEST));
    }

    /**
     * Entry point: opens the documents, processes all pages, then closes both documents.
     *
     * @param args unused
     * @throws IOException propagated from {@link #before()}
     */
    public static void main(String[] args) throws IOException {
        AppIText appIText = new AppIText();
        appIText.before();
        appIText.process();
        appIText.close();
    }

    /** Closes the source document first, then the output document. */
    private void close() {
        pdfDocument.close();
        pdfWriteDoc.close();
    }

    /**
     * Walks all pages, copies each one into the output document and handles the
     * annotations found on the source page.
     */
    private void process() {
        // iText page numbers are 1-based.
        for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++) {
            PdfPage page = pdfDocument.getPage(i);

            // Copy exactly page i into the output document; the copy is used for drawing below.
            List<PdfPage> newPdfPages = pdfDocument.copyPagesTo(i, i, pdfWriteDoc);
            PdfPage newPage = null;
            if (newPdfPages.size() > 0) {
                newPage = newPdfPages.get(0);
            }

            List<PdfAnnotation> annotations = page.getAnnotations();
            for (PdfAnnotation annotation : annotations) {
                // Only annotations that carry a contents entry are considered.
                if (annotation.getContents() != null) {
                    System.out.println(annotation.getContents());
                    if (annotation instanceof PdfTextMarkupAnnotation) {
                        // Read the four numbers of the annotation's rectangle array.
                        PdfArray rectangleArray = annotation.getRectangle();
                        double x = ((PdfNumber) rectangleArray.get(0)).getValue();
                        double y = ((PdfNumber) rectangleArray.get(1)).getValue();
                        double xWidth = ((PdfNumber) rectangleArray.get(2)).getValue();
                        double yWidth = ((PdfNumber) rectangleArray.get(3)).getValue();
                        System.out.println(String.format("x=%s,y=%s,w=%s,h=%s", x, y, xWidth, yWidth));
                        // NOTE(review): a PDF rectangle array is [llx lly urx ury], i.e. elements 2 and 3
                        // are the upper-right corner, not a width and a height. Passing them as
                        // width/height here yields a rectangle that is too large.
                        Rectangle rectangle = new Rectangle((float) x, (float) y, (float) xWidth, (float) yWidth);

                        // Visual debugging: paint the computed rectangle onto the copied page.
                        // NOTE(review): newPage is null if copyPagesTo returned an empty list — confirm
                        // whether that can happen.
                        PdfCanvas canvas = new PdfCanvas(newPage);
                        canvas.setFillColor(new DeviceCmyk(1, 0, 0, 0))
                                .rectangle(rectangle)
                                .fillStroke()
                        ;

                        // Run the page content through a filtered listener and collect the text
                        // that the filter lets through.
                        FontFilter fontFilter = new FontFilter(rectangle);
                        FilteredEventListener listener = new FilteredEventListener();
                        LocationTextExtractionStrategy extractionStrategy = listener.attachEventListener(new LocationTextExtractionStrategy(), fontFilter);
                        new PdfCanvasProcessor(listener).processPageContent(page);
                        // The extracted text is computed but never used or printed in this version.
                        String actualText = extractionStrategy.getResultantText();
                    }
                }
            }
        }
    }

}

/**
 * Document-event handler that paints a fixed bar on the page that triggered the event.
 *
 * <p>The bar is filled with CMYK (1, 0, 0, 0) and placed at x=20, y=10 with a width of 10
 * and a height of 820 (user-space units).
 */
class RectangleEventHandler implements IEventHandler {
    /**
     * Draws the bar on top of the page's last content stream.
     *
     * @param event the event being handled; it is cast to {@link PdfDocumentEvent}
     */
    @Override
    public void handleEvent(Event event) {
        final PdfDocumentEvent documentEvent = (PdfDocumentEvent) event;
        final PdfDocument document = documentEvent.getDocument();
        final PdfPage targetPage = documentEvent.getPage();

        // Append to the last content stream so the bar is drawn after the existing content.
        final PdfCanvas painter =
                new PdfCanvas(targetPage.getLastContentStream(), targetPage.getResources(), document);
        painter.setFillColor(new DeviceCmyk(1, 0, 0, 0));
        painter.rectangle(new Rectangle(20, 10, 10, 820));
        painter.fillStroke();
    }
}

class FontFilter extends TextRegionEventFilter {
    public FontFilter(Rectangle filterRect) {
        super(filterRect);
    }

    @Override
    public boolean accept(IEventData data, EventType type) {
        if (type.equals(EventType.RENDER_TEXT)) {
            TextRenderInfo renderInfo = (TextRenderInfo) data;

            PdfFont font = renderInfo.getFont();
            if (null != font) {
                String fontName = font.getFontProgram().getFontNames().getFontName();
                return fontName.endsWith("Bold") || fontName.endsWith("Oblique");
            }
        }
        return false;
    }
}

上面代码中与问题相关的关键部分是下面这一行:

Rectangle rectangle = new Rectangle((float) x, (float) y, (float) xWidth, (float) yWidth);

我找到了解决方案。

矩形计算(注释数组的后两个值是右上角的坐标,而不是宽度和高度,因此需要减去左下角的坐标):
x:annotation.x
y:annotation.y
宽度:annotation.width - annotation.x
高度:annotation.height - annotation.y

我现在得到的是:

可视化调试(如果 LOG_LEVEL >= 100):

提取的内容:

13:50:01.323 [main] INFO  b.h.AppIText - Annotation contents: q(7.1).explain(1)
13:50:01.323 [main] INFO  b.h.AppIText - rectangleArray: x=90.0338, y=438.245, w=468.33, h=489.749
13:50:01.323 [main] INFO  b.h.AppIText - pageSizeWithRotation: x=0.0, y=0.0, w=531.0, h=666.0, top=666.0, bottom=0.0, left=0.0, right=531.0
13:50:01.337 [main] INFO  b.h.AppIText - str: Purpose: A finally block can’t be placed before the catch blocks. <cut here because the book is not free, but I get the complete marked text..>

我修正后的代码如下:

package biz.hochguertel;

import com.itextpdf.kernel.color.DeviceCmyk;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.pdf.*;
import com.itextpdf.kernel.pdf.annot.PdfAnnotation;
import com.itextpdf.kernel.pdf.annot.PdfTextMarkupAnnotation;
import com.itextpdf.kernel.pdf.canvas.PdfCanvas;
import com.itextpdf.kernel.pdf.canvas.parser.PdfTextExtractor;
import com.itextpdf.kernel.pdf.canvas.parser.filter.TextRegionEventFilter;
import com.itextpdf.kernel.pdf.canvas.parser.listener.FilteredTextEventListener;
import com.itextpdf.kernel.pdf.canvas.parser.listener.ITextExtractionStrategy;
import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.util.List;


/**
 * Extracts the text covered by text-markup (highlight) annotations of a PDF.
 *
 * <p>Every page of the source document is copied into an output document. For each
 * text-markup annotation that has contents, the annotation's rectangle is converted into a
 * {@link Rectangle} and used as a region filter for location-based text extraction. When
 * {@code LOG_LEVEL >= VISUAL_DEBUG} the rectangle is additionally painted onto the copied
 * page so the region can be inspected visually.
 *
 * <p>With the help of the following documentations:
 * - http://developers.itextpdf.com/content/best-itext-questions-stackoverview/content-parsing-extraction-and-redaction-text/itext7-how-read-text-specific-position
 */
public class AppIText {

    private static final Logger LOGGER = LoggerFactory.getLogger(AppIText.class);
    // Raise to VISUAL_DEBUG (or above) to paint the extraction rectangles into the output PDF.
    private static int LOG_LEVEL = 0;
    private static final int VISUAL_DEBUG = 100;

    // Source PDF, looked up on the classpath.
    private String filePath = getClass().getClassLoader().getResource("itext/OCA/393-394,549-550_OCA_Java_SE_7_Programmer_I_Certification.pdf").getFile();
    // Output file that receives the copied pages.
    private static String DEST = "demo-output/393-394,549-550_OCA_Java_SE_7_Programmer_I_Certification.pdf";
    private PdfDocument pdfDocument;
    private PdfDocument pdfWriteDoc;


    /**
     * Prepares the output location and opens the source and the output document.
     *
     * @throws IOException if the source cannot be read or the output cannot be created
     */
    public void before() throws IOException {
        File file = new File(DEST);
        File parent = file.getParentFile();
        if (parent != null) {
            // mkdirs() also creates missing intermediate directories.
            parent.mkdirs();
        }
        if (file.exists()) {
            file.delete();
        }
        pdfDocument = new PdfDocument(new PdfReader(filePath));
        boolean writerOpened = false;
        try {
            pdfWriteDoc = new PdfDocument(new PdfWriter(DEST));
            writerOpened = true;
        } finally {
            // Do not leak the already opened source document when the output cannot be opened.
            if (!writerOpened) {
                pdfDocument.close();
            }
        }
    }

    /**
     * Entry point: opens the documents and processes them.
     *
     * @param args unused
     * @throws IOException propagated from {@link #before()}
     */
    public static void main(String[] args) throws IOException {
        AppIText appIText = new AppIText();
        appIText.before();
        appIText.process();
    }

    /**
     * Copies every page to the output document, extracts the text below each text-markup
     * annotation and finally closes both documents, even if processing fails.
     */
    private void process() {
        try {
            // iText page numbers are 1-based.
            for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++) {
                PdfPage page = pdfDocument.getPage(i);

                List<PdfPage> copiedPages = pdfDocument.copyPagesTo(i, i, pdfWriteDoc);
                PdfPage newPage = copiedPages.isEmpty() ? null : copiedPages.get(0);

                for (PdfAnnotation annotation : page.getAnnotations()) {
                    if (annotation.getContents() == null) {
                        continue;
                    }
                    System.out.println();
                    LOGGER.info("Annotation contents: {}", annotation.getContents());
                    if (annotation instanceof PdfTextMarkupAnnotation) {
                        processMarkup(page, newPage, annotation);
                    }
                }
            }
        } finally {
            // Close both documents in the original order; the nested finally guarantees the
            // output document is closed even when closing the source fails.
            try {
                pdfDocument.close();
            } finally {
                pdfWriteDoc.close();
            }
        }
    }

    /**
     * Logs the geometry of one markup annotation and the text found inside its rectangle.
     *
     * @param page       source page the annotation belongs to
     * @param newPage    copy of {@code page} in the output document, or {@code null} if no copy exists
     * @param annotation the text-markup annotation to evaluate
     */
    private void processMarkup(PdfPage page, PdfPage newPage, PdfAnnotation annotation) {
        PdfArray rectangleArray = annotation.getRectangle();
        if (rectangleArray == null || rectangleArray.size() < 4) {
            LOGGER.warn("Skipping markup annotation without a usable rectangle: {}", rectangleArray);
            return;
        }
        LOGGER.info("rectangleArray: x={}, y={}, w={}, h={}",
                rectangleArray.get(0),
                rectangleArray.get(1),
                rectangleArray.get(2),
                rectangleArray.get(3)
        );
        Rectangle cropBox = page.getCropBox();
        LOGGER.info("pageSizeWithRotation: x={}, y={}, w={}, h={}, top={}, bottom={}, left={}, right={}",
                cropBox.getX(),
                cropBox.getY(),
                cropBox.getWidth(),
                cropBox.getHeight(),
                cropBox.getTop(),
                cropBox.getBottom(),
                cropBox.getLeft(),
                cropBox.getRight()
        );

        Rectangle rectangle = toRectangle(rectangleArray);

        // Visual debugging needs the copied page; skip the drawing when there is none.
        if (LOG_LEVEL >= VISUAL_DEBUG && newPage != null) {
            PdfCanvas canvas = new PdfCanvas(newPage);
            canvas.setFillColor(new DeviceCmyk(1, 0, 0, 0))
                    .rectangle(rectangle)
                    .fillStroke();
        }

        TextRegionEventFilter regionFilter = new TextRegionEventFilter(rectangle);
        ITextExtractionStrategy strategy = new FilteredTextEventListener(new LocationTextExtractionStrategy(), regionFilter);
        String str = PdfTextExtractor.getTextFromPage(page, strategy) + "\n";
        LOGGER.info("str: {}", str);
    }

    /**
     * Converts a PDF rectangle array into an iText {@link Rectangle}.
     *
     * <p>The array holds two diagonally opposite corners, not an origin plus a size, so the
     * width and height are the differences of the coordinates. Example: the array
     * [90.0338 438.245 468.33 489.749] becomes x=90.0338, y=438.245,
     * width=468.33-90.0338, height=489.749-438.245. The corners are normalised so that a
     * rectangle stored with reversed corners still yields a non-negative width and height.
     *
     * @param rect array with at least four numeric entries
     * @return rectangle with its origin at the lower-left corner
     */
    private static Rectangle toRectangle(PdfArray rect) {
        float x1 = ((PdfNumber) rect.get(0)).floatValue();
        float y1 = ((PdfNumber) rect.get(1)).floatValue();
        float x2 = ((PdfNumber) rect.get(2)).floatValue();
        float y2 = ((PdfNumber) rect.get(3)).floatValue();
        return new Rectangle(
                Math.min(x1, x2),
                Math.min(y1, y2),
                Math.abs(x2 - x1),
                Math.abs(y2 - y1)
        );
    }

}