tika PackageParser 不适用于目录

Question

我正在写一个类，用于递归地从 zip 文件中提取文件，并将它们发送到 Kafka 队列以供进一步处理。我的目的是能够从多层嵌套的 zip 中提取文件。下面的代码是我为此编写的 tika ContainerExtractor 实现。

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Stack;

import org.apache.commons.lang.StringUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.EmbeddedResourceHandler;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.pkg.PackageParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * A {@link ContainerExtractor} that walks package/archive files (the types
 * supported by {@link PackageParser}) and recurses into nested archives,
 * reporting every non-archive entry together with the chain of archive names
 * that contain it.
 *
 * <p>
 * Instances keep mutable state ({@link #parentFileNames} and
 * {@link #extracted}) that is updated while parsing; nothing in this class
 * synchronizes access to it or resets it between calls to
 * {@link #extract(TikaInputStream, ContainerExtractor, EmbeddedResourceHandler)}.
 */
public class UberContainerExtractor implements ContainerExtractor {

  private static final long serialVersionUID = -6636138154366178135L;

  /**
   * The supported container types into which we can recurse: an unmodifiable
   * copy of the types reported by {@link PackageParser}.
   */
  public static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
      new HashSet<MediaType>(new PackageParser().getSupportedTypes(new ParseContext())));

  /**
   * A stack that maintains the parent filenames for the recursion
   */
  Stack<String> parentFileNames = new Stack<String>();
  /**
   * The parser used on containers: an {@link AutoDetectParser} restricted to
   * {@link PackageParser}
   */
  private final Parser parser;
  /**
   * Detector used to decide whether a stream is a supported container
   */
  private final Detector detector;
  /**
   * The number of documents recursively extracted from the container and its
   * children containers if present. Only ever incremented, so it accumulates
   * across calls on the same instance.
   */
  int extracted;

  /**
   * Creates an extractor based on the default Tika configuration.
   */
  public UberContainerExtractor() {
    this(TikaConfig.getDefaultConfig());
  }

  /**
   * Creates an extractor whose detector is built from the MIME repository of
   * the given configuration.
   *
   * @param config the Tika configuration to take the MIME repository from
   */
  public UberContainerExtractor(TikaConfig config) {
    this(new DefaultDetector(config.getMimeRepository()));
  }

  /**
   * Creates an extractor that uses the given detector for type detection.
   *
   * @param detector the detector used to identify container types
   */
  public UberContainerExtractor(Detector detector) {
    this.parser = new AutoDetectParser(new PackageParser());
    this.detector = detector;
  }

  /**
   * Tells whether the detected type of the stream is one of
   * {@link #SUPPORTED_TYPES}.
   *
   * @param input the stream to inspect
   * @return {@code true} if the detected media type is a supported container
   * @throws IOException if detection fails to read the stream
   */
  public boolean isSupported(TikaInputStream input) throws IOException {
    MediaType type = detector.detect(input, new Metadata());
    return SUPPORTED_TYPES.contains(type);
  }

  /**
   * Parses the container stream, routing every embedded document through a
   * {@link RecursiveParser} registered in the parse context.
   *
   * @param stream           the container to process
   * @param recurseExtractor the extractor used for nested containers; when
   *                         {@code null} embedded documents are ignored
   * @param handler          forwarded to nested extractions
   * @throws IOException   if the stream cannot be read
   * @throws TikaException if parsing fails; a {@link SAXException} is wrapped
   */
  @Override
  public void extract(TikaInputStream stream, ContainerExtractor recurseExtractor, EmbeddedResourceHandler handler)
      throws IOException, TikaException {

    ParseContext context = new ParseContext();
    context.set(Parser.class, new RecursiveParser(recurseExtractor, handler));
    try {
      Metadata metadata = new Metadata();
      parser.parse(stream, new DefaultHandler(), metadata, context);
    } catch (SAXException e) {
      throw new TikaException("Unexpected SAX exception", e);
    }
  }

  /**
   * Parser registered for embedded documents: recurses into supported
   * containers and reports everything else as a produced file.
   */
  private class RecursiveParser extends AbstractParser {

    private static final long serialVersionUID = -7260171956667273262L;

    private final ContainerExtractor extractor;

    // NOTE(review): the handler is only forwarded on recursion and is never
    // invoked for leaf files in this class - confirm that this is intended.
    private final EmbeddedResourceHandler handler;

    private RecursiveParser(ContainerExtractor extractor, EmbeddedResourceHandler handler) {
      this.extractor = extractor;
      this.handler = handler;
    }

    /**
     * Delegates to the enclosing extractor's parser.
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
      return parser.getSupportedTypes(context);
    }

    /**
     * Handles one embedded document: recurses if its detected type is a
     * supported container, otherwise counts it and prints its full path
     * (parent archive names joined with {@code /}).
     *
     * @param stream   the embedded document
     * @param ignored  content handler, not used
     * @param metadata metadata of the embedded document; the resource name is
     *                 read from it
     * @param context  parse context, not used
     */
    @Override
    public void parse(InputStream stream, ContentHandler ignored, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
      TemporaryResources tmp = new TemporaryResources();
      try {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);

        // Figure out what we have to process
        String filename = metadata.get(Metadata.RESOURCE_NAME_KEY);
        MediaType type = detector.detect(tis, metadata);

        if (extractor == null) {
          // No recursion extractor supplied: nothing to do for this entry.
          return;
        }

        // Use a temporary file to process the stream
        File file = tis.getFile();
        System.out.println("file is directory = " + file.isDirectory());

        if (SUPPORTED_TYPES.contains(type)) {
          // Recurse and extract if the filetype is supported
          System.out.println("encountered a supported file:" + filename);
          parentFileNames.push(filename);
          try {
            extractor.extract(tis, extractor, handler);
          } finally {
            // Always unwind, otherwise a failure inside a nested archive
            // would leave a stale parent name in every later path.
            parentFileNames.pop();
          }
        } else {
          // produce the file
          List<String> parentFilenamesList = new ArrayList<String>(parentFileNames);
          parentFilenamesList.add(filename);
          String originalFilepath = StringUtils.join(parentFilenamesList, "/");
          System.out.println("producing " + filename + " with originalFilepath:" + originalFilepath
              + " to kafka queue");
          ++extracted;
        }
      } finally {
        tmp.dispose();
      }
    }
  }

  /**
   * @return the number of non-container documents counted so far
   */
  public int getExtracted() {
    return extracted;
  }

  /**
   * Demo entry point: extracts the archive named by the first argument, or a
   * built-in default path when no argument is given, and prints the number of
   * extracted files.
   *
   * @param args optional: path of the archive to process
   */
  public static void main(String[] args) throws IOException, TikaException {
    String filename = args.length > 0 ? args[0] : "/Users/rohit/Data/cd.zip";

    UberContainerExtractor recursiveExtractor = new UberContainerExtractor();

    EmbeddedResourceHandler resourceHandler = new EmbeddedResourceHandler() {
      @Override
      public void handle(String filename, MediaType mediaType, InputStream stream) {
        // do nothing
      }
    };

    TikaInputStream stream = TikaInputStream.get(new File(filename));
    try {
      recursiveExtractor.extract(stream, recursiveExtractor, resourceHandler);
    } finally {
      // Close the input even when extraction fails.
      stream.close();
    }

    System.out.println("extracted " + recursiveExtractor.getExtracted() + " files");
  }
}

只要 zip 中的文件是平面结构，它就适用于多级 zip。例如：cd.zip - c.txt - d.txt

如果 zip 中的文件位于目录中，则代码不起作用。例如：ab.zip - ab/ - a.txt - b.txt

在调试时，我在 PackageParser 中遇到了以下代码片段：

// Iterate over every entry of the archive stream; the finally block closes
// the stream whether or not parsing an entry throws.
try {
  ArchiveEntry entry = ais.getNextEntry();
  while (entry != null) {
    // Directory entries are skipped: only non-directory entries are passed
    // on to parseEntry.
    if (!entry.isDirectory()) {
        parseEntry(ais, entry, extractor, xhtml);
    }
    entry = ais.getNextEntry();
  }
} finally {
  ais.close();
}

我试图注释掉 if 条件，但它不起作用。代码里加这个条件有什么原因吗？有什么办法可以解决这个问题吗？

我使用的是tika 1.6版本

Answer 1

以相反的顺序解决您的问题：

Is there a reason why this is commented?

zip 文件中的条目要么是目录，要么是文件。如果是文件，条目名称中已经包含其所在目录的名称。因此，Tika 不需要对目录条目做任何处理，只需在遇到嵌入文件时处理它们即可。

The code does not work if there the files in the zip are present inside a directory. for ex. ab.zip - ab/ - a.txt - b.txt

那你好像做错了什么。 Tika 的递归和包解析器可以很好地处理其中包含文件夹的 zip！

为了证明这一点，从一个像这样的 zip 文件开始：

$ unzip -l ../tt.zip 
Archive:  ../tt.zip
  Length      Date    Time    Name
---------  ---------- -----   ----
        0  2015-02-03 16:42   t/
        0  2015-02-03 16:42   t/t2/
        0  2015-02-03 16:42   t/t2/t3/
   164404  2015-02-03 16:42   t/t2/t3/test.jpg
---------                     -------
   164404                     4 files

现在，让我们使用 Tika 应用程序的 -z 提取标志，它会让 Tika 提取文件中所有嵌入的内容。这样运行后，我们得到：

$ java -jar tika-app-1.7.jar -z ../tt.zip 
Extracting 't/t2/t3/test.jpg' (image/jpeg) to ./t/t2/t3/test.jpg

然后列出生成的目录，我们看到

$ find . -type f
./t/t2/t3/test.jpg

我看不出你的代码具体哪里有问题，但遗憾的是，我们已经证明问题出在你的代码里，而不是 Tika……你最好回顾一下 Tika 提供的各种递归示例，例如 Tika App 工具和 Recursing Parser Wrapper，然后参照它们重新编写您的代码，使其更简单。

tika PackageParser 不适用于目录

tika PackageParser does not work with directories

apache-tika