使用 PDFBox 提取文本位置时得到负的 X 或 Y 坐标

Negative X or Y obtained from PdfBox while extracting text position

我正在尝试提取 PDF 中的所有文本及其坐标。我使用的是 Apache PDFBox 2.0.8,并参照了示例程序 DrawPrintTextLocations。

它在大多数情况下似乎可以正常工作,但对于某些 PDF,我得到的边界框 x 和 y 坐标是负值。例如,可参考这个 PDF 文件。

我的应用假定坐标系是普通的 PDF 坐标系(x 从左到右,y 从上到下),所以这些负值打乱了我的计算。

下面是相关的代码。

import org.apache.fontbox.util.BoundingBox;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType3Font;
import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.List;

/**
 * This is an example on how to get some x/y coordinates of text and to show them in a rendered
 * image.
 *
 * <p>Each page is rendered to an image, a blue outline is drawn around every glyph reported by the
 * text stripper, and the image is written next to the input file as
 * {@code <name>-marked-<page>.png}. Outlines whose integer bounds have a negative x or y are also
 * printed to standard output.
 *
 * @author Ben Litchfield
 * @author Tilman Hausherr
 */
public class DrawPrintTextLocations extends PDFTextStripper {
    /** Resolution, in dots per inch, used to render the page image. */
    private static final float DPI = 200.0f;

    /** Conversion factor from PDF points (1/72 inch) to pixels of the rendered image. */
    private static final double PT2PX = DPI / 72.0;

    /** Extra scale applied to the graphics context of the rendered image. */
    static final int SCALE = 1;

    /** Flips the y-axis; rebuilt for every page in {@link #stripPage(int)}. */
    private AffineTransform flipAT;

    /** Compensates for the page rotation; identity when the page is not rotated. */
    private AffineTransform rotateAT;

    /**
     * Translation derived from the crop box origin.
     *
     * <p>NOTE(review): this transform is computed per page but is not applied to any shape in
     * this class - confirm whether it should be part of the chain in {@code writeString}.
     */
    private AffineTransform transAT;

    /** Scales shapes from points to the pixels of the rendered image. */
    private final AffineTransform dpiAT = AffineTransform.getScaleInstance(PT2PX, PT2PX);

    private final String filename;
    private Graphics2D g2d;
    private final PDDocument document;

    /**
     * Instantiate a new PDFTextStripper object.
     *
     * @param document the document whose pages are rendered and annotated
     * @param filename path of the PDF; used to derive the names of the output images
     * @throws IOException If there is an error loading the properties.
     */
    public DrawPrintTextLocations(PDDocument document, String filename) throws IOException {
        this.document = document;
        this.filename = filename;
    }

    /**
     * This will print the documents data.
     *
     * @param args The command line arguments; a single argument overrides the default PDF path.
     * @throws IOException If there is an error parsing the document.
     */
    public static void main(String[] args) throws IOException {
        String pdfLoc = "/debug/pdfbox/p2_VS008PI.pdf";

        if (args.length == 1) {
            pdfLoc = args[0];
        }

        try (PDDocument document = PDDocument.load(new File(pdfLoc))) {
            DrawPrintTextLocations stripper = new DrawPrintTextLocations(document, pdfLoc);
            stripper.setSortByPosition(true);

            for (int page = 0; page < document.getNumberOfPages(); ++page) {
                stripper.stripPage(page);
            }
        }
    }

    /**
     * Renders one page, draws the glyph outlines onto it and writes the result as a PNG.
     *
     * @param page zero-based page index
     * @throws IOException if rendering, text extraction or writing the image fails
     */
    private void stripPage(int page) throws IOException {
        PDFRenderer pdfRenderer = new PDFRenderer(document);
        BufferedImage image = pdfRenderer.renderImageWithDPI(page, DPI);

        PDPage pdPage = document.getPage(page);
        PDRectangle cropBox = pdPage.getCropBox();

        // flip y-axis
        flipAT = new AffineTransform();
        flipAT.translate(0, pdPage.getBBox().getHeight());
        flipAT.scale(1, -1);

        // page may be rotated
        rotateAT = new AffineTransform();
        int rotation = pdPage.getRotation();
        if (rotation != 0) {
            PDRectangle mediaBox = pdPage.getMediaBox();
            switch (rotation) {
                case 90:
                    rotateAT.translate(mediaBox.getHeight(), 0);
                    break;
                case 270:
                    rotateAT.translate(0, mediaBox.getWidth());
                    break;
                case 180:
                    rotateAT.translate(mediaBox.getWidth(), mediaBox.getHeight());
                    break;
                default:
                    break;
            }
            rotateAT.rotate(Math.toRadians(rotation));
        }

        // cropbox
        transAT = AffineTransform.getTranslateInstance(-cropBox.getLowerLeftX(), cropBox.getLowerLeftY());

        g2d = image.createGraphics();
        try {
            g2d.setStroke(new BasicStroke(0.1f));
            g2d.scale(SCALE, SCALE);

            // setStartPage/setEndPage take one-based page numbers
            setStartPage(page + 1);
            setEndPage(page + 1);

            // The extracted text is discarded; writeText is run for its writeString callbacks.
            Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
            writeText(document, dummy);
        } finally {
            // release the graphics context even when text extraction fails
            g2d.dispose();
        }

        ImageIO.write(image, "png", new File(markedImageName(filename, page + 1)));
    }

    /**
     * Builds the output image name by replacing the file extension of {@code pdfName} with
     * {@code -marked-<pageNumber>.png}.
     *
     * <p>A dot only counts as the start of an extension when it comes after the last path
     * separator, so names without an extension and directories containing dots are handled.
     *
     * @param pdfName path of the source PDF
     * @param pageNumber one-based page number
     * @return path of the image to write
     */
    private static String markedImageName(String pdfName, int pageNumber) {
        int lastSeparator = Math.max(pdfName.lastIndexOf('/'), pdfName.lastIndexOf('\\'));
        int dot = pdfName.lastIndexOf('.');
        String base = dot > lastSeparator ? pdfName.substring(0, dot) : pdfName;
        return base + "-marked-" + pageNumber + ".png";
    }

    /**
     * Override the default functionality of PDFTextStripper.
     *
     * <p>Draws a blue outline for every text position and prints the outlines whose integer
     * bounds have a negative x or y.
     *
     * @param string the text of the run (not used here)
     * @param textPositions the positions of the individual glyphs of the run
     * @throws IOException if font metrics cannot be read
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {

        for (TextPosition text : textPositions) {

            AffineTransform at = text.getTextMatrix().createAffineTransform();
            PDFont font = text.getFont();

            BoundingBox bbox = font.getBoundingBox();

            float xadvance = font.getWidth(text.getCharacterCodes()[0]); // todo: should iterate all chars
            // glyph-space rectangle: advance width by the vertical extent of the font bounding box
            Rectangle2D.Float rect1 = new Rectangle2D.Float(0, bbox.getLowerLeftY(), xadvance, bbox.getHeight());

            if (font instanceof PDType3Font) {
                // Type 3 fonts carry their own font matrix
                at.concatenate(font.getFontMatrix().createAffineTransform());
            } else {
                // other fonts: scale glyph units by 1/1000
                at.scale(1 / 1000f, 1 / 1000f);
            }

            // glyph space -> flipped y-axis -> page rotation -> image pixels
            Shape s1 = at.createTransformedShape(rect1);
            s1 = flipAT.createTransformedShape(s1);
            s1 = rotateAT.createTransformedShape(s1);
            s1 = dpiAT.createTransformedShape(s1);

            g2d.setColor(Color.blue);
            g2d.draw(s1);

            Rectangle bounds = s1.getBounds();
            if (bounds.getX() < 0 || bounds.getY() < 0) {
                // Report outlines with a negative x or y.
                // NOTE(review): presumably these glyphs lie outside the page's crop box - confirm
                // by comparing against pdPage.getCropBox().
                System.out.println(bounds.toString());
                System.out.println(rect1.toString());
            }
        }
    }
}

这里是上述 pdf 第一页输出的一些片段。

SECTION 10 – INSURANCE & OTHER FINANCIAL RESOURCES java.awt.Rectangle[x=-3237,y=40,width=19,height=43] java.awt.Rectangle[x=-3216,y=40,width=20,height=43] java.awt.Rectangle[x=-3194,y=40,width=23,height=43] java.awt.Rectangle[x=-3170,y=40,width=22,height=43]

坐标为负的字符位于裁剪框之外(坐标大于裁剪框高度/宽度的字符也是如此)。可以把裁剪框(cropbox)看作是从一个更大的页面上剪下来的一块。要查看全部内容,请对 PDF 的每一页运行以下代码:

pdPage.setCropBox(pdPage.getMediaBox());

然后保存并查看该 PDF。

根据您的评论

Following your advice of setting the crop box to the media box, actually changed the whole on screen appearance of the pdf, now i got 3 pages collated as one.

这表明,从物理上看,这是一张折叠的纸,每面有 3 页。在线 PDF 将其显示为 6 页,以便于在计算机上查看。