使用 ContentHandler 提取文件内容
Extract contents of a file using ContentHandler
我正在尝试使用 ContentHandler 提取 txt 文件的内容。下面是我的代码,我的文件内容是:
Sample content Sample contentSample contentSample contentSample contentSample contentSample contentSample contentSample contentSample contentSample contentSample contentSample
下面的代码没有显示提取的内容,我在这里遗漏了什么?
class Test {
private OutputStream outputstream;
private ParseContext context;
private Detector detector;
private Parser parser;
private Metadata metadata;
private String extractedText;
public Test() {
context = new ParseContext();
detector = new DefaultDetector();
parser = new AutoDetectParser(detector);
context.set(Parser.class, parser);
outputstream = new ByteArrayOutputStream();
metadata = new Metadata();
}
public void process(String filename) throws Exception {
URL url;
File file = new File(filename);
if (file.isFile()) {
url = file.toURI().toURL();
} else {
url = new URL(filename);
}
InputStream input = TikaInputStream.get(url, metadata);
ContentHandler handler = new BodyContentHandler(outputstream);
parser.parse(input, handler, metadata, context);
input.close();
}
public void getString() {
//Get the text into a String object
extractedText = outputstream.toString();
//Do whatever you want with this String object.
System.out.println("extracted text "+extractedText);
}
public static void main(String args[]) throws Exception {
if (args.length == 1) {
Test textExtractor = new Test();
textExtractor.process("D:\docs\sample.txt");
textExtractor.getString();
} else {
throw new Exception();
}
}
}
除了 Apache tika-core 之外,还需要添加 Apache tika-parsers 依赖项;tika-core 本身不包含具体的解析器实现,只有它时提取结果会是空的。
我正在尝试使用 ContentHandler 提取 txt 文件的内容。下面是我的代码,我的文件内容是:
Sample content Sample contentSample contentSample contentSample contentSample contentSample contentSample contentSample contentSample contentSample contentSample contentSample
下面的代码没有显示提取的内容,我在这里遗漏了什么?
class Test {
private OutputStream outputstream;
private ParseContext context;
private Detector detector;
private Parser parser;
private Metadata metadata;
private String extractedText;
public Test() {
context = new ParseContext();
detector = new DefaultDetector();
parser = new AutoDetectParser(detector);
context.set(Parser.class, parser);
outputstream = new ByteArrayOutputStream();
metadata = new Metadata();
}
public void process(String filename) throws Exception {
URL url;
File file = new File(filename);
if (file.isFile()) {
url = file.toURI().toURL();
} else {
url = new URL(filename);
}
InputStream input = TikaInputStream.get(url, metadata);
ContentHandler handler = new BodyContentHandler(outputstream);
parser.parse(input, handler, metadata, context);
input.close();
}
public void getString() {
//Get the text into a String object
extractedText = outputstream.toString();
//Do whatever you want with this String object.
System.out.println("extracted text "+extractedText);
}
public static void main(String args[]) throws Exception {
if (args.length == 1) {
Test textExtractor = new Test();
textExtractor.process("D:\docs\sample.txt");
textExtractor.getString();
} else {
throw new Exception();
}
}
}
除了 Apache tika-core 之外,还需要添加 Apache tika-parsers 依赖项;tika-core 本身不包含具体的解析器实现,只有它时提取结果会是空的。