使用 PDFBox 列出 pdf 附件 (Java)
List pdf Attachments using PDFBox (Java)
我想获取一个 PDF 文档中所有附件(attachments / embedded files,即嵌入文件)的文件名。我已经搜索了很长时间,但我的代码仍然无法正常工作。
我尝试了什么:
// Asker's original attempt: load the PDF, look up the embedded-files name tree
// and print the map of attachment names to the console.
File input = new File(inputfile); // Input File Path, Given as param from args[]
pd = PDDocument.load(input);
PDDocumentNameDictionary names = new PDDocumentNameDictionary(pd.getDocumentCatalog());
// NOTE(review): efTree is dereferenced on the next line without a null check;
// presumably getEmbeddedFiles() returns null when the document has no
// attachments - confirm against the PDFBox version in use.
PDEmbeddedFilesNameTreeNode efTree = names.getEmbeddedFiles();
Map<String, COSObjectable> existedNames = efTree.getNames();
// Printing the map relies on its toString(); iterate over the entries to print only the names.
System.out.println(existedNames);//Print Embedded-Filenames to console
// Not inside a finally block, so the document stays open if any statement above throws.
pd.close();
我不确定是否可以直接把 Map 的内容打印到控制台。我在 Eclipse 中编写代码,它没有给出任何错误。但是当我运行 jar 文件时,总是得到: NullPointerException at org.apache.pdfbox.pdmodel.PDDocument.getDocumentCatalog(PDDocument.java:778)
有什么想法或帮助吗?非常感谢...
下面是 PDFBox 源代码下载包中的 ExtractEmbeddedFiles 示例:
/**
 * Command-line example that writes every file attached to a PDF into the
 * directory containing that PDF. Attachments are collected from the
 * document-level embedded-files name tree and from file-attachment
 * annotations on the pages.
 */
public final class ExtractEmbeddedFiles
{
    private ExtractEmbeddedFiles()
    {
        // utility class, not meant to be instantiated
    }

    /**
     * This is the main method.
     *
     * @param args The command line arguments; exactly one is expected, the path of the input PDF.
     *
     * @throws IOException If there is an error parsing the document or writing an attachment.
     */
    public static void main( String[] args ) throws IOException
    {
        if( args.length != 1 )
        {
            usage();
            System.exit(1);
        }
        else
        {
            PDDocument document = null;
            try
            {
                File pdfFile = new File(args[0]);
                String filePath = outputDirectoryOf(pdfFile);
                document = PDDocument.load(pdfFile );
                PDDocumentNameDictionary namesDictionary =
                        new PDDocumentNameDictionary( document.getDocumentCatalog() );
                PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
                if (efTree != null)
                {
                    extractFilesFromTree(efTree, filePath);
                }

                // extract files from annotations
                for (PDPage page : document.getPages())
                {
                    for (PDAnnotation annotation : page.getAnnotations())
                    {
                        if (annotation instanceof PDAnnotationFileAttachment)
                        {
                            PDAnnotationFileAttachment annotationFileAttachment = (PDAnnotationFileAttachment) annotation;
                            // getFile() may yield null or a specification that is not a
                            // PDComplexFileSpecification; check before casting instead of
                            // failing with a ClassCastException / NullPointerException
                            if (annotationFileAttachment.getFile() instanceof PDComplexFileSpecification)
                            {
                                PDComplexFileSpecification fileSpec =
                                        (PDComplexFileSpecification) annotationFileAttachment.getFile();
                                PDEmbeddedFile embeddedFile = getEmbeddedFile(fileSpec);
                                extractFile(filePath, fileSpec.getFilename(), embeddedFile);
                            }
                        }
                    }
                }
            }
            finally
            {
                if( document != null )
                {
                    document.close();
                }
            }
        }
    }

    /**
     * Returns the directory of the given file as a path prefix ending with the
     * platform separator.
     *
     * @param pdfFile The input PDF.
     * @return The absolute parent directory followed by a separator.
     */
    private static String outputDirectoryOf(File pdfFile)
    {
        // File.getParent() is null for a bare name such as "in.pdf", which used to
        // produce the prefix "null/"; resolve against the working directory first
        File parentDir = pdfFile.getAbsoluteFile().getParentFile();
        String path = parentDir == null ? "" : parentDir.getPath();
        if (path.length() > 0 && !path.endsWith(File.separator))
        {
            path += File.separator;
        }
        return path;
    }

    /**
     * Extracts the files named by a name-tree node, descending into its kids so
     * that trees nested more than one level deep are handled.
     *
     * @param node The name-tree node to process.
     * @param filePath The output directory prefix.
     * @throws IOException If the tree cannot be read or a file cannot be written.
     */
    private static void extractFilesFromTree(PDNameTreeNode<PDComplexFileSpecification> node, String filePath)
            throws IOException
    {
        Map<String, PDComplexFileSpecification> names = node.getNames();
        if (names != null)
        {
            extractFiles(names, filePath);
            return;
        }
        List<PDNameTreeNode<PDComplexFileSpecification>> kids = node.getKids();
        if (kids == null)
        {
            return;
        }
        for (PDNameTreeNode<PDComplexFileSpecification> kid : kids)
        {
            extractFilesFromTree(kid, filePath);
        }
    }

    /**
     * Extracts every entry of a name map.
     *
     * @param names Attachment names mapped to their file specifications.
     * @param filePath The output directory prefix.
     * @throws IOException If a file cannot be written.
     */
    private static void extractFiles(Map<String, PDComplexFileSpecification> names, String filePath)
            throws IOException
    {
        for (Entry<String, PDComplexFileSpecification> entry : names.entrySet())
        {
            String filename = entry.getKey();
            PDComplexFileSpecification fileSpec = entry.getValue();
            PDEmbeddedFile embeddedFile = getEmbeddedFile(fileSpec);
            extractFile(filePath, filename, embeddedFile);
        }
    }

    /**
     * Writes one embedded file below the output directory. Entries without a
     * name or without embedded data, and names that would resolve to a location
     * outside the output directory, are reported on stderr and skipped.
     *
     * @param filePath The output directory prefix.
     * @param filename The attachment name taken from the PDF (untrusted input).
     * @param embeddedFile The embedded file data, may be null.
     * @throws IOException If the target path cannot be resolved or written.
     */
    private static void extractFile(String filePath, String filename, PDEmbeddedFile embeddedFile)
            throws IOException
    {
        if (filename == null || embeddedFile == null)
        {
            System.err.println("Skipping attachment without a name or embedded data: " + filename);
            return;
        }

        // the name comes from the PDF, so make sure something like "../../x"
        // cannot escape the output directory
        File targetDir = new File(filePath).getCanonicalFile();
        File file = new File(targetDir, filename).getCanonicalFile();
        String dirPrefix = targetDir.getPath();
        if (!dirPrefix.endsWith(File.separator))
        {
            dirPrefix += File.separator;
        }
        if (!file.getPath().startsWith(dirPrefix))
        {
            System.err.println("Skipping attachment outside of " + targetDir + ": " + filename);
            return;
        }

        System.out.println("Writing " + file.getPath());
        FileOutputStream fos = null;
        try
        {
            fos = new FileOutputStream(file);
            fos.write(embeddedFile.toByteArray());
        }
        finally
        {
            IOUtils.closeQuietly(fos);
        }
    }

    /**
     * Returns the first embedded-file alternative that is present, trying the
     * Unicode, DOS, Mac, Unix and generic entries in that order.
     *
     * @param fileSpec The file specification, may be null.
     * @return The embedded file, or null if fileSpec is null or has none.
     */
    private static PDEmbeddedFile getEmbeddedFile(PDComplexFileSpecification fileSpec )
    {
        // search for the first available alternative of the embedded file
        PDEmbeddedFile embeddedFile = null;
        if (fileSpec != null)
        {
            embeddedFile = fileSpec.getEmbeddedFileUnicode();
            if (embeddedFile == null)
            {
                embeddedFile = fileSpec.getEmbeddedFileDos();
            }
            if (embeddedFile == null)
            {
                embeddedFile = fileSpec.getEmbeddedFileMac();
            }
            if (embeddedFile == null)
            {
                embeddedFile = fileSpec.getEmbeddedFileUnix();
            }
            if (embeddedFile == null)
            {
                embeddedFile = fileSpec.getEmbeddedFile();
            }
        }
        return embeddedFile;
    }

    /**
     * This will print the usage for this program.
     */
    private static void usage()
    {
        System.err.println( "Usage: java " + ExtractEmbeddedFiles.class.getName() + " <input-pdf>" );
    }
}
终于找到解决办法了。对于遇到同样问题的任何人,以下代码对我有用:
// Lists the names of all files embedded in a PDF by writing them to the log.
PDDocument pd;
File input = new File(inputfile); // Input File
pd = PDDocument.load(input);
try
{
    PDDocumentCatalog catalog = pd.getDocumentCatalog();
    // Each lookup is null-checked explicitly instead of catching the resulting
    // NullPointerException, so genuine failures (e.g. I/O errors) are no longer
    // misreported as "no attachments".
    PDDocumentNameDictionary names = catalog.getNames();
    PDEmbeddedFilesNameTreeNode embeddedFiles = names == null ? null : names.getEmbeddedFiles();
    // NOTE(review): only the names stored directly on the root node are listed;
    // presumably a name tree split into kid nodes yields null here - confirm
    // against the PDFBox version in use.
    Map<String, COSObjectable> embeddedFileNames = embeddedFiles == null ? null : embeddedFiles.getNames();
    if (embeddedFileNames == null || embeddedFileNames.isEmpty())
    {
        System.out.println("Document has no attachments. ");
    }
    else
    {
        // One log line per embedded file
        for (Map.Entry<String, COSObjectable> entry : embeddedFileNames.entrySet())
        {
            // You might need to configure the logger first
            logger.info("Inputfile: " + inputfile +"Found embedded File: " + entry.getKey() + ":");
        }
    }
}
finally
{
    // always release the document, even when reading the name tree fails
    pd.close();
}
我想获取一个 PDF 文档中所有附件(attachments / embedded files,即嵌入文件)的文件名。我已经搜索了很长时间,但我的代码仍然无法正常工作。
我尝试了什么:
// Asker's original attempt: load the PDF, look up the embedded-files name tree
// and print the map of attachment names to the console.
File input = new File(inputfile); // Input File Path, Given as param from args[]
pd = PDDocument.load(input);
PDDocumentNameDictionary names = new PDDocumentNameDictionary(pd.getDocumentCatalog());
// NOTE(review): efTree is dereferenced on the next line without a null check;
// presumably getEmbeddedFiles() returns null when the document has no
// attachments - confirm against the PDFBox version in use.
PDEmbeddedFilesNameTreeNode efTree = names.getEmbeddedFiles();
Map<String, COSObjectable> existedNames = efTree.getNames();
// Printing the map relies on its toString(); iterate over the entries to print only the names.
System.out.println(existedNames);//Print Embedded-Filenames to console
// Not inside a finally block, so the document stays open if any statement above throws.
pd.close();
我不确定是否可以直接把 Map 的内容打印到控制台。我在 Eclipse 中编写代码,它没有给出任何错误。但是当我运行 jar 文件时,总是得到: NullPointerException at org.apache.pdfbox.pdmodel.PDDocument.getDocumentCatalog(PDDocument.java:778)
有什么想法或帮助吗?非常感谢...
下面是 PDFBox 源代码下载包中的 ExtractEmbeddedFiles 示例:
/**
 * Command-line example that writes every file attached to a PDF into the
 * directory containing that PDF. Attachments are collected from the
 * document-level embedded-files name tree and from file-attachment
 * annotations on the pages.
 */
public final class ExtractEmbeddedFiles
{
    private ExtractEmbeddedFiles()
    {
        // utility class, not meant to be instantiated
    }

    /**
     * This is the main method.
     *
     * @param args The command line arguments; exactly one is expected, the path of the input PDF.
     *
     * @throws IOException If there is an error parsing the document or writing an attachment.
     */
    public static void main( String[] args ) throws IOException
    {
        if( args.length != 1 )
        {
            usage();
            System.exit(1);
        }
        else
        {
            PDDocument document = null;
            try
            {
                File pdfFile = new File(args[0]);
                String filePath = outputDirectoryOf(pdfFile);
                document = PDDocument.load(pdfFile );
                PDDocumentNameDictionary namesDictionary =
                        new PDDocumentNameDictionary( document.getDocumentCatalog() );
                PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
                if (efTree != null)
                {
                    extractFilesFromTree(efTree, filePath);
                }

                // extract files from annotations
                for (PDPage page : document.getPages())
                {
                    for (PDAnnotation annotation : page.getAnnotations())
                    {
                        if (annotation instanceof PDAnnotationFileAttachment)
                        {
                            PDAnnotationFileAttachment annotationFileAttachment = (PDAnnotationFileAttachment) annotation;
                            // getFile() may yield null or a specification that is not a
                            // PDComplexFileSpecification; check before casting instead of
                            // failing with a ClassCastException / NullPointerException
                            if (annotationFileAttachment.getFile() instanceof PDComplexFileSpecification)
                            {
                                PDComplexFileSpecification fileSpec =
                                        (PDComplexFileSpecification) annotationFileAttachment.getFile();
                                PDEmbeddedFile embeddedFile = getEmbeddedFile(fileSpec);
                                extractFile(filePath, fileSpec.getFilename(), embeddedFile);
                            }
                        }
                    }
                }
            }
            finally
            {
                if( document != null )
                {
                    document.close();
                }
            }
        }
    }

    /**
     * Returns the directory of the given file as a path prefix ending with the
     * platform separator.
     *
     * @param pdfFile The input PDF.
     * @return The absolute parent directory followed by a separator.
     */
    private static String outputDirectoryOf(File pdfFile)
    {
        // File.getParent() is null for a bare name such as "in.pdf", which used to
        // produce the prefix "null/"; resolve against the working directory first
        File parentDir = pdfFile.getAbsoluteFile().getParentFile();
        String path = parentDir == null ? "" : parentDir.getPath();
        if (path.length() > 0 && !path.endsWith(File.separator))
        {
            path += File.separator;
        }
        return path;
    }

    /**
     * Extracts the files named by a name-tree node, descending into its kids so
     * that trees nested more than one level deep are handled.
     *
     * @param node The name-tree node to process.
     * @param filePath The output directory prefix.
     * @throws IOException If the tree cannot be read or a file cannot be written.
     */
    private static void extractFilesFromTree(PDNameTreeNode<PDComplexFileSpecification> node, String filePath)
            throws IOException
    {
        Map<String, PDComplexFileSpecification> names = node.getNames();
        if (names != null)
        {
            extractFiles(names, filePath);
            return;
        }
        List<PDNameTreeNode<PDComplexFileSpecification>> kids = node.getKids();
        if (kids == null)
        {
            return;
        }
        for (PDNameTreeNode<PDComplexFileSpecification> kid : kids)
        {
            extractFilesFromTree(kid, filePath);
        }
    }

    /**
     * Extracts every entry of a name map.
     *
     * @param names Attachment names mapped to their file specifications.
     * @param filePath The output directory prefix.
     * @throws IOException If a file cannot be written.
     */
    private static void extractFiles(Map<String, PDComplexFileSpecification> names, String filePath)
            throws IOException
    {
        for (Entry<String, PDComplexFileSpecification> entry : names.entrySet())
        {
            String filename = entry.getKey();
            PDComplexFileSpecification fileSpec = entry.getValue();
            PDEmbeddedFile embeddedFile = getEmbeddedFile(fileSpec);
            extractFile(filePath, filename, embeddedFile);
        }
    }

    /**
     * Writes one embedded file below the output directory. Entries without a
     * name or without embedded data, and names that would resolve to a location
     * outside the output directory, are reported on stderr and skipped.
     *
     * @param filePath The output directory prefix.
     * @param filename The attachment name taken from the PDF (untrusted input).
     * @param embeddedFile The embedded file data, may be null.
     * @throws IOException If the target path cannot be resolved or written.
     */
    private static void extractFile(String filePath, String filename, PDEmbeddedFile embeddedFile)
            throws IOException
    {
        if (filename == null || embeddedFile == null)
        {
            System.err.println("Skipping attachment without a name or embedded data: " + filename);
            return;
        }

        // the name comes from the PDF, so make sure something like "../../x"
        // cannot escape the output directory
        File targetDir = new File(filePath).getCanonicalFile();
        File file = new File(targetDir, filename).getCanonicalFile();
        String dirPrefix = targetDir.getPath();
        if (!dirPrefix.endsWith(File.separator))
        {
            dirPrefix += File.separator;
        }
        if (!file.getPath().startsWith(dirPrefix))
        {
            System.err.println("Skipping attachment outside of " + targetDir + ": " + filename);
            return;
        }

        System.out.println("Writing " + file.getPath());
        FileOutputStream fos = null;
        try
        {
            fos = new FileOutputStream(file);
            fos.write(embeddedFile.toByteArray());
        }
        finally
        {
            IOUtils.closeQuietly(fos);
        }
    }

    /**
     * Returns the first embedded-file alternative that is present, trying the
     * Unicode, DOS, Mac, Unix and generic entries in that order.
     *
     * @param fileSpec The file specification, may be null.
     * @return The embedded file, or null if fileSpec is null or has none.
     */
    private static PDEmbeddedFile getEmbeddedFile(PDComplexFileSpecification fileSpec )
    {
        // search for the first available alternative of the embedded file
        PDEmbeddedFile embeddedFile = null;
        if (fileSpec != null)
        {
            embeddedFile = fileSpec.getEmbeddedFileUnicode();
            if (embeddedFile == null)
            {
                embeddedFile = fileSpec.getEmbeddedFileDos();
            }
            if (embeddedFile == null)
            {
                embeddedFile = fileSpec.getEmbeddedFileMac();
            }
            if (embeddedFile == null)
            {
                embeddedFile = fileSpec.getEmbeddedFileUnix();
            }
            if (embeddedFile == null)
            {
                embeddedFile = fileSpec.getEmbeddedFile();
            }
        }
        return embeddedFile;
    }

    /**
     * This will print the usage for this program.
     */
    private static void usage()
    {
        System.err.println( "Usage: java " + ExtractEmbeddedFiles.class.getName() + " <input-pdf>" );
    }
}
终于找到解决办法了。对于遇到同样问题的任何人,以下代码对我有用:
// Lists the names of all files embedded in a PDF by writing them to the log.
PDDocument pd;
File input = new File(inputfile); // Input File
pd = PDDocument.load(input);
try
{
    PDDocumentCatalog catalog = pd.getDocumentCatalog();
    // Each lookup is null-checked explicitly instead of catching the resulting
    // NullPointerException, so genuine failures (e.g. I/O errors) are no longer
    // misreported as "no attachments".
    PDDocumentNameDictionary names = catalog.getNames();
    PDEmbeddedFilesNameTreeNode embeddedFiles = names == null ? null : names.getEmbeddedFiles();
    // NOTE(review): only the names stored directly on the root node are listed;
    // presumably a name tree split into kid nodes yields null here - confirm
    // against the PDFBox version in use.
    Map<String, COSObjectable> embeddedFileNames = embeddedFiles == null ? null : embeddedFiles.getNames();
    if (embeddedFileNames == null || embeddedFileNames.isEmpty())
    {
        System.out.println("Document has no attachments. ");
    }
    else
    {
        // One log line per embedded file
        for (Map.Entry<String, COSObjectable> entry : embeddedFileNames.entrySet())
        {
            // You might need to configure the logger first
            logger.info("Inputfile: " + inputfile +"Found embedded File: " + entry.getKey() + ":");
        }
    }
}
finally
{
    // always release the document, even when reading the name tree fails
    pd.close();
}