使用 Apache POI 提取 excel 文本类型附件编码问题

Using Apache POI to extract excel text type attachment encode issue

我现在正在使用 Apache POI 从 Excel 文件中提取附件,这是我的部分代码。

    // Fetch the sheet by name and its drawing patriarch, the container that is
    // iterated below for embedded objects and pictures.
    Sheet sheetAt = workbook.getSheet(sheetName);
    Drawing<?> drawingPatriarch = sheetAt.getDrawingPatriarch();
    // A null patriarch is skipped entirely: there is nothing to iterate.
    if (drawingPatriarch != null) {
        Iterator<?> iterator = drawingPatriarch.iterator();
        // NOTE(review): this is an `if`, not a `while`, so only the FIRST shape of
        // the sheet is examined; any further attachments are ignored.
        if (iterator.hasNext()) {
            Object next = iterator.next();
            if (next instanceof ObjectData) {
                ObjectData o = (ObjectData) next;
                // Writes the bytes returned by getObjectData() unchanged.
                // NOTE(review): for packaged attachments these bytes appear to be the
                // whole OLE container rather than the original file, which would
                // explain the extra data described in the question - confirm.
                IOUtil.write(o.getObjectData(), outputPath);
            } else if (next instanceof Picture) {
                Picture o = (Picture) next;
                // Picture branch: writes the bytes returned by getData() as-is.
                IOUtil.write(o.getData(), outputPath);
            } 
        }
    }

当附件为二进制类型(如 exe、png 等)时,用这种方式提取的文件是正常的;但如果附件为文本类型(如 txt、pdf 等),则提取出来的文件无法正常打开。查看其二进制内容,除了原文件的内容之外还有很多多余的数据。请问应该如何解析这些数据?

我怀疑你的 ObjectData 是 Objekt-Manager-Shellobjekt 类型的 oleObject。这类对象存储在嵌入的 oleObject*.bin 文件中。这些文件有自己的文件系统,需要先把它读取出来。为此,首先获取 DirectoryEntry 和 DirectoryNode,然后获取 Ole10Native。有了它,就可以通过 byte[] data = ole10Native.getDataBuffer() 获得文件数据。

完整示例:

import java.io.FileInputStream;
import java.io.FileOutputStream;

import org.apache.poi.ss.usermodel.*;
import org.apache.poi.poifs.filesystem.*;

/**
 * Example: extracts files that were embedded into the first sheet of an Excel
 * workbook as packaged OLE objects, then writes the workbook back out.
 *
 * <p>Each packaged object is unwrapped through {@link Ole10Native} so that only the
 * original file content is written, not the surrounding OLE container.
 */
public class ExcelGetObjectData {

  /**
   * Opens the input workbook, extracts every packaged attachment found on the first
   * sheet into the current directory and finally saves the workbook to the output path.
   *
   * @param args unused
   * @throws Exception if the workbook cannot be read, an attachment cannot be
   *     unpacked, or a file cannot be written
   */
  public static void main(String[] args) throws Exception {

    //String inFilePath = "./ExcelExampleIn.xlsx"; String outFilePath = "./ExcelExampleOut.xlsx";
    String inFilePath = "./ExcelExampleIn.xls"; String outFilePath = "./ExcelExampleOut.xls";

    // The input stream is a resource of its own so that it is closed
    // together with the workbook when the block exits.
    try (FileInputStream in = new FileInputStream(inFilePath);
         Workbook workbook = WorkbookFactory.create(in)) {

      Sheet sheet = workbook.getSheetAt(0);
      Drawing<?> drawingPatriarch = sheet.getDrawingPatriarch();
      if (drawingPatriarch != null) {
        int objectIndex = 0;
        for (Shape shape : drawingPatriarch) {
          System.out.println(shape);
          if (shape instanceof ObjectData) {
            objectIndex++;
            extractPackagedFile((ObjectData) shape, objectIndex);
          } else {
            // other shape types (pictures, ...) can be handled here
          }
        }
      }

      // Open the output file only now, so a failure above does not leave an empty file behind.
      try (FileOutputStream out = new FileOutputStream(outFilePath)) {
        workbook.write(out);
      }
    }
  }

  /**
   * Writes the file wrapped inside one embedded object to the current directory.
   * Objects that carry no Ole10Native stream are only logged and otherwise skipped.
   *
   * @param objectData the embedded object taken from the sheet's drawing
   * @param objectIndex 1-based position of the object, used to build a fallback file name
   * @throws java.io.IOException if the embedded storage cannot be read or the file cannot be written
   * @throws Ole10NativeException if the Ole10Native stream is malformed
   */
  private static void extractPackagedFile(ObjectData objectData, int objectIndex)
      throws java.io.IOException, Ole10NativeException {
    System.out.println(objectData.getFileName());
    System.out.println(objectData.getOLE2ClassName());
    System.out.println(objectData.getContentType());

    if (!objectData.hasDirectoryEntry()) {
      return;
    }
    DirectoryEntry directory = objectData.getDirectory();
    if (!(directory instanceof DirectoryNode)) {
      return;
    }
    DirectoryNode directoryNode = (DirectoryNode) directory;

    // Detect packaged files by the presence of the Ole10Native stream instead of
    // comparing the OLE2 class name: that name is a display string (the original
    // example matched "Objekt-Manager-Shellobjekt", which looks locale specific)
    // and it may also be null.
    if (!directoryNode.hasEntry(Ole10Native.OLE10_NATIVE)) {
      // other kinds of embedded objects can be handled here
      return;
    }

    Ole10Native ole10Native = Ole10Native.createFromEmbeddedOleObject(directoryNode);
    System.out.println(ole10Native.getCommand());
    System.out.println(ole10Native.getFileName());
    System.out.println(ole10Native.getLabel());
    byte[] data = ole10Native.getDataBuffer(); //data now contains the embedded data

    String targetName =
        safeFileName(ole10Native.getLabel(), "embedded-object-" + objectIndex + ".bin");
    try (FileOutputStream dataOut = new FileOutputStream("./" + targetName)) {
      dataOut.write(data);
    }
  }

  /**
   * Reduces a name stored inside the workbook to a plain file name, because the
   * workbook content is not trusted to be free of directory components.
   *
   * @param name the name read from the workbook, may be null
   * @param fallback the name to use when nothing usable remains
   * @return the last path segment of {@code name}, or {@code fallback}
   */
  private static String safeFileName(String name, String fallback) {
    if (name == null) {
      return fallback;
    }
    // Strip both separator styles regardless of the platform the code runs on.
    int lastSeparator = Math.max(name.lastIndexOf('/'), name.lastIndexOf('\\'));
    String baseName = name.substring(lastSeparator + 1).trim();
    if (baseName.isEmpty() || baseName.equals(".") || baseName.equals("..")) {
      return fallback;
    }
    return baseName;
  }
}

使用 EmbeddedExtractor 提取所有嵌入对象可以像这样完成:

import java.io.FileInputStream;
import java.io.FileOutputStream;

import org.apache.poi.ss.usermodel.*;
import org.apache.poi.ss.extractor.EmbeddedExtractor;
import org.apache.poi.ss.extractor.EmbeddedData;

/**
 * Example: uses {@link EmbeddedExtractor} to pull every embedded object out of the
 * first sheet of an Excel workbook, then writes the workbook back out.
 */
public class ExcelEmbeddedExtractor {

  /**
   * Opens the input workbook, writes each embedded object of the first sheet into the
   * current directory and finally saves the workbook to the output path.
   *
   * @param args unused
   * @throws Exception if the workbook cannot be read, extraction fails, or a file
   *     cannot be written
   */
  public static void main(String[] args) throws Exception {

    String inFilePath = "./ExcelExampleIn.xlsx"; String outFilePath = "./ExcelExampleOut.xlsx";
    //String inFilePath = "./ExcelExampleIn.xls"; String outFilePath = "./ExcelExampleOut.xls";

    // The input stream is a resource of its own so that it is closed
    // together with the workbook when the block exits.
    try (FileInputStream in = new FileInputStream(inFilePath);
         Workbook workbook = WorkbookFactory.create(in)) {

      Sheet sheet = workbook.getSheetAt(0);
      EmbeddedExtractor extractor = new EmbeddedExtractor();
      int objectIndex = 0;
      for (EmbeddedData embeddedData : extractor.extractAll(sheet)) {
        objectIndex++;
        Shape shape = embeddedData.getShape();
        System.out.println(shape);
        String filename = embeddedData.getFilename();
        System.out.println(filename);
        byte[] data = embeddedData.getEmbeddedData(); //data now contains the embedded data

        String targetName = safeFileName(filename, "embedded-object-" + objectIndex + ".bin");
        try (FileOutputStream dataOut = new FileOutputStream("./" + targetName)) {
          dataOut.write(data);
        }
      }

      // Open the output file only now, so a failure above does not leave an empty file behind.
      try (FileOutputStream out = new FileOutputStream(outFilePath)) {
        workbook.write(out);
      }
    }
  }

  /**
   * Reduces a name stored inside the workbook to a plain file name, because the
   * workbook content is not trusted to be free of directory components.
   *
   * @param name the name read from the workbook, may be null
   * @param fallback the name to use when nothing usable remains
   * @return the last path segment of {@code name}, or {@code fallback}
   */
  private static String safeFileName(String name, String fallback) {
    if (name == null) {
      return fallback;
    }
    // Strip both separator styles regardless of the platform the code runs on.
    int lastSeparator = Math.max(name.lastIndexOf('/'), name.lastIndexOf('\\'));
    String baseName = name.substring(lastSeparator + 1).trim();
    if (baseName.isEmpty() || baseName.equals(".") || baseName.equals("..")) {
      return fallback;
    }
    return baseName;
  }
}