tika PackageParser 不适用于目录
tika PackageParser does not work with directories
我正在写一个 class 以递归地从 zip 文件中提取文件并将它们生成到 Kafka 队列以供进一步处理。我的目的是能够从多个级别的 zip 中提取文件。下面的代码是我执行此操作的 tika ContainerExtractor 的实现。
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Stack;
import org.apache.commons.lang.StringUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.EmbeddedResourceHandler;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.pkg.PackageParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * A {@link ContainerExtractor} that walks a container with Tika's
 * {@link PackageParser} and recurses into every embedded entry whose detected
 * type is itself one of {@link #SUPPORTED_TYPES}. Every other entry is treated
 * as a leaf document: it is passed to the {@link EmbeddedResourceHandler},
 * reported on standard output together with its path through the enclosing
 * containers, and counted.
 *
 * <p>The path stack and the counter are plain mutable instance fields, so one
 * instance accumulates state across calls to {@link #extract}. NOTE(review):
 * presumably an instance is meant to be used by a single thread at a time -
 * confirm before sharing one.
 */
public class UberContainerExtractor implements ContainerExtractor {

    private static final long serialVersionUID = -6636138154366178135L;

    /**
     * The supported container types into which we can recurse
     */
    public final static Set<MediaType> SUPPORTED_TYPES;

    // statically populate SUPPORTED_TYPES from whatever PackageParser declares
    static {
        Set<MediaType> supportedTypes = new HashSet<MediaType>();
        ParseContext context = new ParseContext();
        supportedTypes.addAll(new PackageParser().getSupportedTypes(context));
        SUPPORTED_TYPES = Collections.unmodifiableSet(supportedTypes);
    }

    /**
     * A stack that maintains the parent filenames for the recursion
     */
    Stack<String> parentFileNames = new Stack<String>();

    /**
     * The parser used on every container: an auto-detecting parser restricted
     * to {@link PackageParser}
     */
    private final Parser parser;

    /**
     * The detector used to decide whether an entry is a supported container
     */
    private final Detector detector;

    /**
     * The number of documents recursively extracted from the container and its
     * children containers if present
     */
    int extracted;

    /**
     * Creates an extractor based on the default Tika configuration.
     */
    public UberContainerExtractor() {
        this(TikaConfig.getDefaultConfig());
    }

    /**
     * Creates an extractor whose detector is built from the MIME repository of
     * the given configuration.
     *
     * @param config the Tika configuration to take the MIME repository from
     */
    public UberContainerExtractor(TikaConfig config) {
        this(new DefaultDetector(config.getMimeRepository()));
    }

    /**
     * Creates an extractor that uses the given detector for type detection.
     *
     * @param detector the detector used to classify streams and entries
     */
    public UberContainerExtractor(Detector detector) {
        this.parser = new AutoDetectParser(new PackageParser());
        this.detector = detector;
    }

    /**
     * Tells whether the detected type of the stream is one of
     * {@link #SUPPORTED_TYPES}.
     *
     * @param input the stream to classify
     * @return {@code true} if the detected media type is a supported container
     * @throws IOException if the detector fails to read the stream
     */
    public boolean isSupported(TikaInputStream input) throws IOException {
        MediaType type = detector.detect(input, new Metadata());
        return SUPPORTED_TYPES.contains(type);
    }

    /**
     * Parses the given container. A {@link RecursiveParser} is registered in
     * the parse context under {@code Parser.class} so that it receives the
     * embedded entries.
     *
     * @param stream           the container to process
     * @param recurseExtractor the extractor used on embedded containers; when
     *                         {@code null}, embedded entries are ignored
     * @param handler          receives every leaf entry; may be {@code null}
     * @throws IOException   if the stream cannot be read
     * @throws TikaException if parsing fails; a {@link SAXException} raised
     *                       during parsing is wrapped in a TikaException
     */
    @Override
    public void extract(TikaInputStream stream, ContainerExtractor recurseExtractor, EmbeddedResourceHandler handler)
            throws IOException, TikaException {
        ParseContext context = new ParseContext();
        context.set(Parser.class, new RecursiveParser(recurseExtractor, handler));
        try {
            Metadata metadata = new Metadata();
            parser.parse(stream, new DefaultHandler(), metadata, context);
        } catch (SAXException e) {
            throw new TikaException("Unexpected SAX exception", e);
        }
    }

    /**
     * The parser installed for embedded entries. It either recurses into an
     * entry (supported container type) or produces it (anything else).
     */
    private class RecursiveParser extends AbstractParser {

        private static final long serialVersionUID = -7260171956667273262L;

        private final ContainerExtractor extractor;
        private final EmbeddedResourceHandler handler;

        private RecursiveParser(ContainerExtractor extractor, EmbeddedResourceHandler handler) {
            this.extractor = extractor;
            this.handler = handler;
        }

        /**
         * Reports the same supported types as the enclosing extractor's parser.
         */
        @Override
        public Set<MediaType> getSupportedTypes(ParseContext context) {
            return parser.getSupportedTypes(context);
        }

        /**
         * Handles one embedded entry.
         *
         * @param stream   the entry content
         * @param ignored  the content handler; not used
         * @param metadata the entry metadata; the resource name is read from it
         * @param context  the parse context; not used
         */
        @Override
        public void parse(InputStream stream, ContentHandler ignored, Metadata metadata, ParseContext context)
                throws IOException, SAXException, TikaException {
            TemporaryResources tmp = new TemporaryResources();
            try {
                TikaInputStream tis = TikaInputStream.get(stream, tmp);
                // Figure out what we have to process
                String filename = metadata.get(Metadata.RESOURCE_NAME_KEY);
                MediaType type = detector.detect(tis, metadata);
                if (extractor == null) {
                    // No recursing extractor was supplied: nothing to do
                    return;
                }
                // Use a temporary file to process the stream
                File file = tis.getFile();
                System.out.println("file is directory = " + file.isDirectory());
                if (SUPPORTED_TYPES.contains(type)) {
                    // Recurse and extract if the filetype is supported
                    System.out.println("encountered a supported file:" + filename);
                    parentFileNames.push(filename);
                    try {
                        extractor.extract(tis, extractor, handler);
                    } finally {
                        // Always unwind, otherwise a failing nested container
                        // would leave its name on the stack and corrupt the
                        // paths of every entry processed afterwards
                        parentFileNames.pop();
                    }
                } else {
                    produce(filename, type, tis);
                }
            } finally {
                tmp.dispose();
            }
        }

        /**
         * Produces a leaf entry: passes it to the handler (if one was given),
         * reports its full path on standard output and counts it.
         */
        private void produce(String filename, MediaType type, InputStream content) {
            List<String> parentFilenamesList = new ArrayList<String>(parentFileNames);
            parentFilenamesList.add(filename);
            String originalFilepath = StringUtils.join(parentFilenamesList, "/");
            if (handler != null) {
                // The handler was previously accepted but never called
                handler.handle(filename, type, content);
            }
            System.out.println("producing " + filename + " with originalFilepath:" + originalFilepath
                    + " to kafka queue");
            ++extracted;
        }
    }

    /**
     * @return the number of leaf documents produced so far by this instance
     */
    public int getExtracted() {
        return extracted;
    }

    /**
     * Runs the extractor on one container and prints how many files were
     * extracted.
     *
     * @param args optional; {@code args[0]} is the path of the container to
     *             process. Without arguments the original sample path is used.
     */
    public static void main(String[] args) throws IOException, TikaException {
        String filename = (args != null && args.length > 0) ? args[0] : "/Users/rohit/Data/cd.zip";
        UberContainerExtractor recursiveExtractor = new UberContainerExtractor();
        EmbeddedResourceHandler resourceHandler = new EmbeddedResourceHandler() {
            @Override
            public void handle(String filename, MediaType mediaType, InputStream stream) {
                // do nothing
            }
        };
        TikaInputStream stream = TikaInputStream.get(new File(filename));
        try {
            recursiveExtractor.extract(stream, recursiveExtractor, resourceHandler);
        } finally {
            // Close the input even when extraction fails
            stream.close();
        }
        System.out.println("extracted " + recursiveExtractor.getExtracted() + " files");
    }
}
它适用于多级 zip,只要 zip 中的文件是平面结构。例如。
cd.zip
- c.txt
- d.txt
如果 zip 中的文件存在于目录中,则代码不起作用。例如。
ab.zip
- AB /
- a.txt
- b.txt
在调试时,我在 PackageParser 中遇到了以下代码片段:
// Walk the entries of the archive stream one by one.
try {
ArchiveEntry entry = ais.getNextEntry();
while (entry != null) {
// Directory entries are skipped; only non-directory entries are handed to parseEntry.
if (!entry.isDirectory()) {
parseEntry(ais, entry, extractor, xhtml);
}
entry = ais.getNextEntry();
}
} finally {
// The archive stream is closed whether or not the loop completed normally.
ais.close();
}
我试图注释掉 if 条件,但它不起作用。加上这个 if 条件有什么原因吗?有什么办法可以解决这个问题吗?
我使用的是tika 1.6版本
以相反的顺序解决您的问题:
Is there a reason why this is commented?
zip 文件中的条目是目录或文件。如果是文件,它们包括它们来自的目录的名称。因此,Tika 不需要对目录做任何事情,它需要做的就是在嵌入文件出现时处理它们
The code does not work if the files in the zip are present inside a directory. for ex. ab.zip - ab/ - a.txt - b.txt
那你好像做错了什么。 Tika 的递归和包解析器可以很好地处理其中包含文件夹的 zip!
为了证明这一点,从一个像这样的 zip 文件开始:
$ unzip -l ../tt.zip
Archive: ../tt.zip
Length Date Time Name
--------- ---------- ----- ----
0 2015-02-03 16:42 t/
0 2015-02-03 16:42 t/t2/
0 2015-02-03 16:42 t/t2/t3/
164404 2015-02-03 16:42 t/t2/t3/test.jpg
--------- -------
164404 4 files
现在,让我们使用 Tika 应用程序的 -z 提取标志,它会让 Tika 提取文件中的所有嵌入内容。这样运行后,我们得到
$ java -jar tika-app-1.7.jar -z ../tt.zip
Extracting 't/t2/t3/test.jpg' (image/jpeg) to ./t/t2/t3/test.jpg
然后列出生成的目录,我们看到
$ find . -type f
./t/t2/t3/test.jpg
我一眼看不出你的代码哪里有问题,但遗憾的是,我们已经证明问题出在你的代码里,而不是 Tika……你最好参考一下 Tika 提供的各种递归示例,例如 Tika App 工具和 Recursing Parser Wrapper,然后据此重写你的代码,使其变得更简单。
我正在写一个 class 以递归地从 zip 文件中提取文件并将它们生成到 Kafka 队列以供进一步处理。我的目的是能够从多个级别的 zip 中提取文件。下面的代码是我执行此操作的 tika ContainerExtractor 的实现。
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Stack;
import org.apache.commons.lang.StringUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.EmbeddedResourceHandler;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.pkg.PackageParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * A {@link ContainerExtractor} that walks a container with Tika's
 * {@link PackageParser} and recurses into every embedded entry whose detected
 * type is itself one of {@link #SUPPORTED_TYPES}. Every other entry is treated
 * as a leaf document: it is passed to the {@link EmbeddedResourceHandler},
 * reported on standard output together with its path through the enclosing
 * containers, and counted.
 *
 * <p>The path stack and the counter are plain mutable instance fields, so one
 * instance accumulates state across calls to {@link #extract}. NOTE(review):
 * presumably an instance is meant to be used by a single thread at a time -
 * confirm before sharing one.
 */
public class UberContainerExtractor implements ContainerExtractor {

    private static final long serialVersionUID = -6636138154366178135L;

    /**
     * The supported container types into which we can recurse
     */
    public final static Set<MediaType> SUPPORTED_TYPES;

    // statically populate SUPPORTED_TYPES from whatever PackageParser declares
    static {
        Set<MediaType> supportedTypes = new HashSet<MediaType>();
        ParseContext context = new ParseContext();
        supportedTypes.addAll(new PackageParser().getSupportedTypes(context));
        SUPPORTED_TYPES = Collections.unmodifiableSet(supportedTypes);
    }

    /**
     * A stack that maintains the parent filenames for the recursion
     */
    Stack<String> parentFileNames = new Stack<String>();

    /**
     * The parser used on every container: an auto-detecting parser restricted
     * to {@link PackageParser}
     */
    private final Parser parser;

    /**
     * The detector used to decide whether an entry is a supported container
     */
    private final Detector detector;

    /**
     * The number of documents recursively extracted from the container and its
     * children containers if present
     */
    int extracted;

    /**
     * Creates an extractor based on the default Tika configuration.
     */
    public UberContainerExtractor() {
        this(TikaConfig.getDefaultConfig());
    }

    /**
     * Creates an extractor whose detector is built from the MIME repository of
     * the given configuration.
     *
     * @param config the Tika configuration to take the MIME repository from
     */
    public UberContainerExtractor(TikaConfig config) {
        this(new DefaultDetector(config.getMimeRepository()));
    }

    /**
     * Creates an extractor that uses the given detector for type detection.
     *
     * @param detector the detector used to classify streams and entries
     */
    public UberContainerExtractor(Detector detector) {
        this.parser = new AutoDetectParser(new PackageParser());
        this.detector = detector;
    }

    /**
     * Tells whether the detected type of the stream is one of
     * {@link #SUPPORTED_TYPES}.
     *
     * @param input the stream to classify
     * @return {@code true} if the detected media type is a supported container
     * @throws IOException if the detector fails to read the stream
     */
    public boolean isSupported(TikaInputStream input) throws IOException {
        MediaType type = detector.detect(input, new Metadata());
        return SUPPORTED_TYPES.contains(type);
    }

    /**
     * Parses the given container. A {@link RecursiveParser} is registered in
     * the parse context under {@code Parser.class} so that it receives the
     * embedded entries.
     *
     * @param stream           the container to process
     * @param recurseExtractor the extractor used on embedded containers; when
     *                         {@code null}, embedded entries are ignored
     * @param handler          receives every leaf entry; may be {@code null}
     * @throws IOException   if the stream cannot be read
     * @throws TikaException if parsing fails; a {@link SAXException} raised
     *                       during parsing is wrapped in a TikaException
     */
    @Override
    public void extract(TikaInputStream stream, ContainerExtractor recurseExtractor, EmbeddedResourceHandler handler)
            throws IOException, TikaException {
        ParseContext context = new ParseContext();
        context.set(Parser.class, new RecursiveParser(recurseExtractor, handler));
        try {
            Metadata metadata = new Metadata();
            parser.parse(stream, new DefaultHandler(), metadata, context);
        } catch (SAXException e) {
            throw new TikaException("Unexpected SAX exception", e);
        }
    }

    /**
     * The parser installed for embedded entries. It either recurses into an
     * entry (supported container type) or produces it (anything else).
     */
    private class RecursiveParser extends AbstractParser {

        private static final long serialVersionUID = -7260171956667273262L;

        private final ContainerExtractor extractor;
        private final EmbeddedResourceHandler handler;

        private RecursiveParser(ContainerExtractor extractor, EmbeddedResourceHandler handler) {
            this.extractor = extractor;
            this.handler = handler;
        }

        /**
         * Reports the same supported types as the enclosing extractor's parser.
         */
        @Override
        public Set<MediaType> getSupportedTypes(ParseContext context) {
            return parser.getSupportedTypes(context);
        }

        /**
         * Handles one embedded entry.
         *
         * @param stream   the entry content
         * @param ignored  the content handler; not used
         * @param metadata the entry metadata; the resource name is read from it
         * @param context  the parse context; not used
         */
        @Override
        public void parse(InputStream stream, ContentHandler ignored, Metadata metadata, ParseContext context)
                throws IOException, SAXException, TikaException {
            TemporaryResources tmp = new TemporaryResources();
            try {
                TikaInputStream tis = TikaInputStream.get(stream, tmp);
                // Figure out what we have to process
                String filename = metadata.get(Metadata.RESOURCE_NAME_KEY);
                MediaType type = detector.detect(tis, metadata);
                if (extractor == null) {
                    // No recursing extractor was supplied: nothing to do
                    return;
                }
                // Use a temporary file to process the stream
                File file = tis.getFile();
                System.out.println("file is directory = " + file.isDirectory());
                if (SUPPORTED_TYPES.contains(type)) {
                    // Recurse and extract if the filetype is supported
                    System.out.println("encountered a supported file:" + filename);
                    parentFileNames.push(filename);
                    try {
                        extractor.extract(tis, extractor, handler);
                    } finally {
                        // Always unwind, otherwise a failing nested container
                        // would leave its name on the stack and corrupt the
                        // paths of every entry processed afterwards
                        parentFileNames.pop();
                    }
                } else {
                    produce(filename, type, tis);
                }
            } finally {
                tmp.dispose();
            }
        }

        /**
         * Produces a leaf entry: passes it to the handler (if one was given),
         * reports its full path on standard output and counts it.
         */
        private void produce(String filename, MediaType type, InputStream content) {
            List<String> parentFilenamesList = new ArrayList<String>(parentFileNames);
            parentFilenamesList.add(filename);
            String originalFilepath = StringUtils.join(parentFilenamesList, "/");
            if (handler != null) {
                // The handler was previously accepted but never called
                handler.handle(filename, type, content);
            }
            System.out.println("producing " + filename + " with originalFilepath:" + originalFilepath
                    + " to kafka queue");
            ++extracted;
        }
    }

    /**
     * @return the number of leaf documents produced so far by this instance
     */
    public int getExtracted() {
        return extracted;
    }

    /**
     * Runs the extractor on one container and prints how many files were
     * extracted.
     *
     * @param args optional; {@code args[0]} is the path of the container to
     *             process. Without arguments the original sample path is used.
     */
    public static void main(String[] args) throws IOException, TikaException {
        String filename = (args != null && args.length > 0) ? args[0] : "/Users/rohit/Data/cd.zip";
        UberContainerExtractor recursiveExtractor = new UberContainerExtractor();
        EmbeddedResourceHandler resourceHandler = new EmbeddedResourceHandler() {
            @Override
            public void handle(String filename, MediaType mediaType, InputStream stream) {
                // do nothing
            }
        };
        TikaInputStream stream = TikaInputStream.get(new File(filename));
        try {
            recursiveExtractor.extract(stream, recursiveExtractor, resourceHandler);
        } finally {
            // Close the input even when extraction fails
            stream.close();
        }
        System.out.println("extracted " + recursiveExtractor.getExtracted() + " files");
    }
}
它适用于多级 zip,只要 zip 中的文件是平面结构。例如。 cd.zip - c.txt - d.txt
如果 zip 中的文件存在于目录中,则代码不起作用。例如。 ab.zip - AB / - a.txt - b.txt
在调试时,我在 PackageParser 中遇到了以下代码片段:
try {
// Walk the entries of the archive stream one by one.
ArchiveEntry entry = ais.getNextEntry();
while (entry != null) {
// Directory entries are skipped; only non-directory entries are handed to parseEntry.
if (!entry.isDirectory()) {
parseEntry(ais, entry, extractor, xhtml);
}
entry = ais.getNextEntry();
}
} finally {
// The archive stream is closed whether or not the loop completed normally.
ais.close();
}
我试图注释掉 if 条件,但它不起作用。加上这个 if 条件有什么原因吗?有什么办法可以解决这个问题吗?
我使用的是tika 1.6版本
以相反的顺序解决您的问题:
Is there a reason why this is commented?
zip 文件中的条目是目录或文件。如果是文件,它们包括它们来自的目录的名称。因此,Tika 不需要对目录做任何事情,它需要做的就是在嵌入文件出现时处理它们
The code does not work if the files in the zip are present inside a directory. for ex. ab.zip - ab/ - a.txt - b.txt
那你好像做错了什么。 Tika 的递归和包解析器可以很好地处理其中包含文件夹的 zip!
为了证明这一点,从一个像这样的 zip 文件开始:
$ unzip -l ../tt.zip
Archive: ../tt.zip
Length Date Time Name
--------- ---------- ----- ----
0 2015-02-03 16:42 t/
0 2015-02-03 16:42 t/t2/
0 2015-02-03 16:42 t/t2/t3/
164404 2015-02-03 16:42 t/t2/t3/test.jpg
--------- -------
164404 4 files
现在,让我们使用 Tika 应用程序的 -z 提取标志,它会让 Tika 提取文件中的所有嵌入内容。这样运行后,我们得到
$ java -jar tika-app-1.7.jar -z ../tt.zip
Extracting 't/t2/t3/test.jpg' (image/jpeg) to ./t/t2/t3/test.jpg
然后列出生成的目录,我们看到
$ find . -type f
./t/t2/t3/test.jpg
我一眼看不出你的代码哪里有问题,但遗憾的是,我们已经证明问题出在你的代码里,而不是 Tika……你最好参考一下 Tika 提供的各种递归示例,例如 Tika App 工具和 Recursing Parser Wrapper,然后据此重写你的代码,使其变得更简单。