使用 PDFBox 从单个 PDF 页面中提取多个嵌入图像

Extract Multiple Embedded Images from a single PDF Page using PDFBox

朋友们,我用的是 PDFBox 2.0.6。我已经能够从 PDF 文件中导出图像,但目前的代码是把每个 PDF 页面整体渲染成一张图像。问题是一个 PDF 页面中可能包含多个图像,我希望每个嵌入的图像都能被单独提取为一张图像。

这是代码,

import java.awt.image.BufferedImage;
import java.io.File;
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;

/**
 * Renders each page of a PDF document to a separate JPEG file.
 *
 * <p>Note that this produces one image per <em>page</em> (the whole page is
 * rasterized); it does not extract the individual images embedded in a page.
 */
public class DemoPdf {

    /**
     * Loads the hard-coded PDF, renders every page and writes it as
     * {@code <pageIndex>.jpg} into the hard-coded image folder.
     *
     * @param args unused
     * @throws Exception if the document cannot be loaded, rendered or written
     */
    public static void main(String args[]) throws Exception {
        File file = new File("C:/Users/ADMIN/Downloads/Vehicle_Photographs.pdf");
        File imageFolder = new File("C:/Users/ADMIN/Desktop/image");

        // try-with-resources closes the document even when rendering or
        // writing throws, which the previous trailing close() call did not.
        try (PDDocument document = PDDocument.load(file)) {
            PDFRenderer renderer = new PDFRenderer(document);

            for (int page = 0; page < document.getNumberOfPages(); ++page) {
                // Rasterize the whole page into one image
                BufferedImage image = renderer.renderImage(page);
                // Write the page image to <imageFolder>/<page>.jpg
                ImageIO.write(image, "JPEG", new File(imageFolder, page + ".jpg"));
                System.out.println("Image created"+ page);
            }
        }
    }

}

我是否可以在 PDFBox 中将所有嵌入的图像分别提取为单独的图像?谢谢。

是的。可以从pdf中的所有页面中提取所有图像。

你可以参考这个链接:extract images from pdf using PDFBox。

这里的基本思想是:编写一个继承 PDFStreamEngine 的类,并重写 processOperator 方法。对所有页面调用 PDFStreamEngine.processPage。如果传递给 processOperator 的对象是一个图像对象,则从该对象中获取 BufferedImage 并保存它。

继承 PDFStreamEngine 并重写 processOperator,类似如下:

 @Override
protected void processOperator( Operator operator, List<COSBase> operands) throws IOException
{
    String operation = operator.getName();
    if( "Do".equals(operation) )
    {
        COSName objectName = (COSName) operands.get( 0 );
        PDXObject xobject = getResources().getXObject( objectName );
        if( xobject instanceof PDImageXObject)
        {
            PDImageXObject image = (PDImageXObject)xobject;
            int imageWidth = image.getWidth();
            int imageHeight = image.getHeight();

            // same image to local
            BufferedImage bImage = new BufferedImage(imageWidth,imageHeight,BufferedImage.TYPE_INT_ARGB);
            bImage = image.getImage();
            ImageIO.write(bImage,"PNG",new File("c:\temp\image_"+imageNumber+".png"));

            imageNumber++;

        }
        else 
        {

        }
    }
    else
    {
        super.processOperator( operator, operands);
    }
}

这个答案与 @jprism 的回答类似,不过它面向的是只想直接复制粘贴、拿来即用的演示代码的人。

import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.UUID;

/**
 * Extracts every image drawn by the pages of a PDF file and writes each one
 * as a PNG with a random (UUID) file name into an output directory.
 *
 * <p>Works by overriding {@link #processOperator} and intercepting the
 * {@code Do} operator while the pages are processed.
 */
public class ExtractImagesUseCase extends PDFStreamEngine{
    private final String filePath;
    private final String outputDir;

    /**
     * @param filePath  path of the PDF file to read
     * @param outputDir directory the extracted PNG files are written to
     */
    public ExtractImagesUseCase(String filePath,
                                String outputDir){
        this.filePath = filePath;
        this.outputDir = outputDir;
    }

    /**
     * Loads the PDF and processes every page, which triggers
     * {@link #processOperator} for each operator in the page content.
     * An {@link IOException} is reported via its stack trace and not rethrown.
     */
    public void execute(){
        File file = new File(filePath);

        // try-with-resources: the document was previously never closed.
        try (PDDocument document = PDDocument.load(file)) {
            for (PDPage page : document.getPages()) {
                processPage(page);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes image XObjects referenced by a {@code Do} operator to disk,
     * descends into form XObjects, and delegates all other operators to the
     * superclass.
     *
     * @param operator the operator being processed
     * @param operands the operator's operands; for {@code Do} the first one
     *                 is the name of the XObject
     * @throws IOException if the image cannot be decoded or written
     */
    @Override
    protected void processOperator(Operator operator, List<COSBase> operands) throws IOException{
        String operation = operator.getName();

        if ("Do".equals(operation)) {
            COSName objectName = (COSName) operands.get(0);
            PDXObject pdxObject = getResources().getXObject(objectName);

            if (pdxObject instanceof PDImageXObject) {
                // Image
                PDImageXObject image = (PDImageXObject) pdxObject;
                BufferedImage bImage = image.getImage();

                // File: a random name keeps one image from overwriting another
                String randomName = UUID.randomUUID().toString();
                File outputFile = new File(outputDir, randomName + ".png");

                // Write image to file
                ImageIO.write(bImage, "PNG", outputFile);

            } else if (pdxObject instanceof PDFormXObject) {
                // Process the form's own content so nested images are found too
                PDFormXObject form = (PDFormXObject) pdxObject;
                showForm(form);
            }
        } else {
            super.processOperator(operator, operands);
        }
    }
}

演示

public class ExtractImageDemo{
    public static void main(String[] args){
        String filePath = "C:\Users\John\Downloads\Documents\sample-file.pdf";
        String outputDir = "C:\Users\John\Downloads\Documents\Output";

        ExtractImagesUseCase useCase = new ExtractImagesUseCase(
                filePath,
                outputDir
        );
        useCase.execute();
    }
}