在 Java 中使用 Apache Tika 从 PDF 文件中提取文本
Extract text from a PDF file using Apache Tika in Java
// Hand Example.pdf (resolved against the working directory) to the Tika
// facade and print whatever text it extracts.
try {
    final String content = new Tika().parseToString(new File("Example.pdf"));
    System.out.println("The Content: " + content);
} catch (Exception e) {
    // Any checked or runtime exception from parsing is reported on stderr.
    e.printStackTrace();
}
我导入了 java.io.File 和 org.apache.tika.Tika；
但是在运行这段代码时，我得到了这样的错误：
Exception in thread "main" java.lang.NoSuchMethodError: org.slf4j.spi.LocationAwareLogger.log(Lorg/slf4j/Marker;Ljava/lang/String;ILjava/lang/String;Ljava/lang/Throwable;)V
at org.apache.commons.logging.impl.SLF4JLocationAwareLog.warn(SLF4JLocationAwareLog.java:162)
at org.apache.pdfbox.pdmodel.font.FileSystemFontProvider.loadDiskCache(FileSystemFontProvider.java:461)
at org.apache.pdfbox.pdmodel.font.FileSystemFontProvider.(FileSystemFontProvider.java:217)
at org.apache.pdfbox.pdmodel.font.FontMapperImpl$DefaultFontProvider.(FontMapperImpl.java:130)
at org.apache.pdfbox.pdmodel.font.FontMapperImpl.getProvider(FontMapperImpl.java:149)
at org.apache.pdfbox.pdmodel.font.FontMapperImpl.findFont(FontMapperImpl.java:413)
at org.apache.pdfbox.pdmodel.font.FontMapperImpl.findFontBoxFont(FontMapperImpl.java:376)
at org.apache.pdfbox.pdmodel.font.FontMapperImpl.getFontBoxFont(FontMapperImpl.java:350)
at org.apache.pdfbox.pdmodel.font.PDType1Font.(PDType1Font.java:146)
at org.apache.pdfbox.pdmodel.font.PDType1Font.(PDType1Font.java:79)
at org.apache.pdfbox.pdmodel.font.PDFontFactory.createFont(PDFontFactory.java:62)
at org.apache.pdfbox.pdmodel.PDResources.getFont(PDResources.java:143)
at org.apache.pdfbox.contentstream.operator.text.SetFontAndSize.process(SetFontAndSize.java:60)
at org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:838)
at org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:495)
at org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:469)
at org.apache.pdfbox.contentstream.PDFStreamEngine.processPage(PDFStreamEngine.java:150)
at org.apache.pdfbox.text.LegacyPDFStreamEngine.processPage(LegacyPDFStreamEngine.java:139)
at org.apache.pdfbox.text.PDFTextStripper.processPage(PDFTextStripper.java:391)
at org.apache.tika.parser.pdf.PDF2XHTML.processPage(PDF2XHTML.java:147)
at org.apache.pdfbox.text.PDFTextStripper.processPages(PDFTextStripper.java:319)
at org.apache.pdfbox.text.PDFTextStripper.writeText(PDFTextStripper.java:266)
at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:117)
at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:167)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:135)
at org.apache.tika.Tika.parseToString(Tika.java:527)
at org.apache.tika.Tika.parseToString(Tika.java:642)
at java_programs.PdfParse.main(PdfParse.java:22)
以下代码对我来说似乎是可行的。
我得到了想要的字符串，不过控制台中也打印了一些警告。
[在 Windows 上] 我是这样编译并运行它的：
javac -cp .;tika-app-1.16.jar Test.java
java -cp .;tika-app-1.16.jar Test
你用的是哪个 Tika jar 包？
我添加了另一个方法（tikaPdfTest()）来展示从 PDF 中获取文本的另一种方式，这可能对您有所帮助。
import java.io.File;
import org.apache.tika.Tika;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.Tika;
import org.xml.sax.SAXException;
/**
 * Demonstrates two ways of extracting the text of a PDF with Apache Tika:
 * the high-level {@code Tika} facade and the lower-level {@code PDFParser}.
 *
 * <p>Both approaches read {@code Example.pdf} from the working directory.
 */
public class Test {

    /**
     * Extracts and prints the text of {@code Example.pdf} twice, once per approach.
     * Each approach runs in its own try block, so an {@link Exception} thrown by
     * the first is printed and the second still runs.
     *
     * @param args ignored
     */
    public static void main(final String[] args) {
        //Your way
        try {
            File file = new File("Example.pdf");
            // NOTE(review): the Tika facade caps the returned string at its
            // maxStringLength (100,000 characters by default per the Tika docs);
            // call setMaxStringLength(-1) on the facade if full text is needed.
            String content = new Tika().parseToString(file);
            System.out.println("The Content: " + content);
        } catch (final Exception e) {
            e.printStackTrace();
        }
        //Another way
        try {
            System.out.println("The contents:\t[" + Test.tikaPdfTest("Example.pdf") + "]");
        } catch (final Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses the given file with {@code PDFParser} and returns its body text
     * with leading and trailing whitespace removed.
     *
     * @param fileName path of the PDF file to read
     * @return the trimmed text collected by the body content handler
     * @throws IOException if the file cannot be opened or read
     * @throws SAXException propagated from {@code PDFParser.parse}
     * @throws TikaException propagated from {@code PDFParser.parse}
     */
    public static String tikaPdfTest(final String fileName) throws IOException, SAXException, TikaException {
        try (final FileInputStream inputstream = new FileInputStream(new File(fileName))) {
            // A write limit of -1 disables BodyContentHandler's default
            // 100,000-character cap; with the no-arg constructor, parsing a
            // larger document aborts with a SAXException instead of returning text.
            final BodyContentHandler handler = new BodyContentHandler(-1);
            new PDFParser().parse(inputstream, handler, new Metadata(), new ParseContext());
            return handler.toString().trim();
        }
    }
}
// Hand Example.pdf (resolved against the working directory) to the Tika
// facade and print whatever text it extracts.
try {
    final String content = new Tika().parseToString(new File("Example.pdf"));
    System.out.println("The Content: " + content);
} catch (Exception e) {
    // Any checked or runtime exception from parsing is reported on stderr.
    e.printStackTrace();
}
我导入了 java.io.File 和 org.apache.tika.Tika；
但是在运行这段代码时，我得到了这样的错误：
Exception in thread "main" java.lang.NoSuchMethodError: org.slf4j.spi.LocationAwareLogger.log(Lorg/slf4j/Marker;Ljava/lang/String;ILjava/lang/String;Ljava/lang/Throwable;)V at org.apache.commons.logging.impl.SLF4JLocationAwareLog.warn(SLF4JLocationAwareLog.java:162) at org.apache.pdfbox.pdmodel.font.FileSystemFontProvider.loadDiskCache(FileSystemFontProvider.java:461) at org.apache.pdfbox.pdmodel.font.FileSystemFontProvider.(FileSystemFontProvider.java:217) at org.apache.pdfbox.pdmodel.font.FontMapperImpl$DefaultFontProvider.(FontMapperImpl.java:130) at org.apache.pdfbox.pdmodel.font.FontMapperImpl.getProvider(FontMapperImpl.java:149) at org.apache.pdfbox.pdmodel.font.FontMapperImpl.findFont(FontMapperImpl.java:413) at org.apache.pdfbox.pdmodel.font.FontMapperImpl.findFontBoxFont(FontMapperImpl.java:376) at org.apache.pdfbox.pdmodel.font.FontMapperImpl.getFontBoxFont(FontMapperImpl.java:350) at org.apache.pdfbox.pdmodel.font.PDType1Font.(PDType1Font.java:146) at org.apache.pdfbox.pdmodel.font.PDType1Font.(PDType1Font.java:79) at org.apache.pdfbox.pdmodel.font.PDFontFactory.createFont(PDFontFactory.java:62) at org.apache.pdfbox.pdmodel.PDResources.getFont(PDResources.java:143) at org.apache.pdfbox.contentstream.operator.text.SetFontAndSize.process(SetFontAndSize.java:60) at org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:838) at org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:495) at org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:469) at org.apache.pdfbox.contentstream.PDFStreamEngine.processPage(PDFStreamEngine.java:150) at org.apache.pdfbox.text.LegacyPDFStreamEngine.processPage(LegacyPDFStreamEngine.java:139) at org.apache.pdfbox.text.PDFTextStripper.processPage(PDFTextStripper.java:391) at org.apache.tika.parser.pdf.PDF2XHTML.processPage(PDF2XHTML.java:147) at org.apache.pdfbox.text.PDFTextStripper.processPages(PDFTextStripper.java:319) at 
org.apache.pdfbox.text.PDFTextStripper.writeText(PDFTextStripper.java:266) at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:117) at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:167) at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280) at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280) at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:135) at org.apache.tika.Tika.parseToString(Tika.java:527) at org.apache.tika.Tika.parseToString(Tika.java:642) at java_programs.PdfParse.main(PdfParse.java:22)
以下代码对我来说似乎是可行的。我得到了想要的字符串，不过控制台中也打印了一些警告。
[在 Windows 上] 我是这样编译并运行它的：
javac -cp .;tika-app-1.16.jar Test.java
java -cp .;tika-app-1.16.jar Test
你用的是哪个 Tika jar 包？
我添加了另一个方法（tikaPdfTest()）来展示从 PDF 中获取文本的另一种方式，这可能对您有所帮助。
import java.io.File;
import org.apache.tika.Tika;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.Tika;
import org.xml.sax.SAXException;
/**
 * Demonstrates two ways of extracting the text of a PDF with Apache Tika:
 * the high-level {@code Tika} facade and the lower-level {@code PDFParser}.
 *
 * <p>Both approaches read {@code Example.pdf} from the working directory.
 */
public class Test {

    /**
     * Extracts and prints the text of {@code Example.pdf} twice, once per approach.
     * Each approach runs in its own try block, so an {@link Exception} thrown by
     * the first is printed and the second still runs.
     *
     * @param args ignored
     */
    public static void main(final String[] args) {
        //Your way
        try {
            File file = new File("Example.pdf");
            // NOTE(review): the Tika facade caps the returned string at its
            // maxStringLength (100,000 characters by default per the Tika docs);
            // call setMaxStringLength(-1) on the facade if full text is needed.
            String content = new Tika().parseToString(file);
            System.out.println("The Content: " + content);
        } catch (final Exception e) {
            e.printStackTrace();
        }
        //Another way
        try {
            System.out.println("The contents:\t[" + Test.tikaPdfTest("Example.pdf") + "]");
        } catch (final Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses the given file with {@code PDFParser} and returns its body text
     * with leading and trailing whitespace removed.
     *
     * @param fileName path of the PDF file to read
     * @return the trimmed text collected by the body content handler
     * @throws IOException if the file cannot be opened or read
     * @throws SAXException propagated from {@code PDFParser.parse}
     * @throws TikaException propagated from {@code PDFParser.parse}
     */
    public static String tikaPdfTest(final String fileName) throws IOException, SAXException, TikaException {
        try (final FileInputStream inputstream = new FileInputStream(new File(fileName))) {
            // A write limit of -1 disables BodyContentHandler's default
            // 100,000-character cap; with the no-arg constructor, parsing a
            // larger document aborts with a SAXException instead of returning text.
            final BodyContentHandler handler = new BodyContentHandler(-1);
            new PDFParser().parse(inputstream, handler, new Metadata(), new ParseContext());
            return handler.toString().trim();
        }
    }
}