使用 Apache POI 提取 excel 文本类型附件编码问题
Using Apache POI to extract text-type attachments from Excel: encoding issue
我现在正在使用 Apache POI 从 Excel 文件中提取附件,这是我的部分代码。
// Look up the sheet by name and fetch its drawing container (null when the sheet has no drawings).
Sheet sheetAt = workbook.getSheet(sheetName);
Drawing<?> drawingPatriarch = sheetAt.getDrawingPatriarch();
Iterator<?> shapes = (drawingPatriarch != null) ? drawingPatriarch.iterator() : null;
// Only the first shape, if there is one, is inspected.
if (shapes != null && shapes.hasNext()) {
    Object first = shapes.next();
    // ObjectData is tested before Picture; the order of these checks is kept deliberately.
    if (first instanceof ObjectData) {
        // Write the object's data to outputPath exactly as returned.
        IOUtil.write(((ObjectData) first).getObjectData(), outputPath);
    } else if (first instanceof Picture) {
        // Write the picture's data to outputPath exactly as returned.
        IOUtil.write(((Picture) first).getData(), outputPath);
    }
}
当附件为二进制类型(如 exe、png 等)时,用这种方式提取的文件是正常的;但如果附件为文本类型(如 txt、pdf 等),提取出来的文件就无法正常打开。查看其二进制内容,发现除了原文件的数据之外还有很多多余的数据。请问应该如何解析这些数据?
我怀疑你的 `ObjectData` 是 `oleObject` 或 `Objekt-Manager-Shellobjekt` 类型。这些对象存储在嵌入的 `oleObject*.bin` 文件中。这些文件有自己的文件系统,需要先读取该文件系统:首先获取 `DirectoryEntry` 和 `DirectoryNode`,然后由它创建 `Ole10Native`。有了它,就可以通过 `byte[] data = ole10Native.getDataBuffer()` 获得文件数据。
完整示例:
import java.io.FileInputStream;
import java.io.FileOutputStream;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.poifs.filesystem.*;
/**
 * Example program: walks the shapes on the first sheet of an Excel workbook and, for every
 * embedded object whose OLE2 class name is "Objekt-Manager-Shellobjekt", unwraps the
 * {@code Ole10Native} payload and writes it to a file in the current directory.
 * The workbook itself is then written back out to a second file.
 */
public class ExcelGetObjectData {

    /** OLE2 class name of the embedded objects this example knows how to unwrap. */
    private static final String PACKAGE_CLASS_NAME = "Objekt-Manager-Shellobjekt";

    /** File name used when an embedded object carries no usable label. */
    private static final String FALLBACK_FILE_NAME = "embedded.bin";

    /**
     * Opens the input workbook, extracts the supported embedded objects from its first sheet
     * and writes the workbook to the output path.
     *
     * @param args unused
     * @throws Exception if the workbook cannot be read, an embedded object cannot be parsed,
     *                   or a file cannot be written
     */
    public static void main(String[] args) throws Exception {
        // Swap the two lines below to run the example against an .xlsx workbook instead.
        //String inFilePath = "./ExcelExampleIn.xlsx"; String outFilePath = "./ExcelExampleOut.xlsx";
        String inFilePath = "./ExcelExampleIn.xls"; String outFilePath = "./ExcelExampleOut.xls";

        // The input stream is a managed resource of its own so that its file handle is
        // released even if WorkbookFactory.create fails or does not close the stream itself.
        try (FileInputStream in = new FileInputStream(inFilePath);
                Workbook workbook = WorkbookFactory.create(in);
                FileOutputStream out = new FileOutputStream(outFilePath)) {
            Sheet sheet = workbook.getSheetAt(0);
            Drawing<?> drawingPatriarch = sheet.getDrawingPatriarch();
            if (drawingPatriarch != null) {
                for (Shape shape : drawingPatriarch) {
                    System.out.println(shape);
                    if (shape instanceof ObjectData) {
                        extractObjectData((ObjectData) shape);
                    }
                    // Other shape types could be handled here.
                }
            }
            workbook.write(out);
        }
    }

    /**
     * Prints the object's metadata and, when it is a supported embedded package,
     * writes the embedded payload to {@code ./<label>}.
     */
    private static void extractObjectData(ObjectData objectData) throws Exception {
        String className = objectData.getOLE2ClassName();
        System.out.println(objectData.getFileName());
        System.out.println(className);
        System.out.println(objectData.getContentType());

        // Constant-first comparison: a null class name is simply "not supported" instead of an NPE.
        if (!PACKAGE_CLASS_NAME.equals(className)) {
            // Other OLE2 class names could be handled here.
            return;
        }
        if (!objectData.hasDirectoryEntry()) {
            return;
        }
        DirectoryEntry directory = objectData.getDirectory();
        if (!(directory instanceof DirectoryNode)) {
            return;
        }

        Ole10Native ole10Native = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) directory);
        System.out.println(ole10Native.getCommand());
        System.out.println(ole10Native.getFileName());
        System.out.println(ole10Native.getLabel());

        byte[] data = ole10Native.getDataBuffer(); // data now contains the embedded data
        // The label comes from inside the workbook, so it is reduced to a bare file name
        // before being used as a path: it must not be able to point outside "./".
        String targetName = baseName(ole10Native.getLabel());
        try (FileOutputStream dataOut = new FileOutputStream("./" + targetName)) {
            dataOut.write(data);
        }
    }

    /**
     * Strips any directory part ('/' or '\\' separated) from {@code name}; returns a fallback
     * name when nothing usable remains.
     */
    private static String baseName(String name) {
        if (name == null) {
            return FALLBACK_FILE_NAME;
        }
        int lastSeparator = Math.max(name.lastIndexOf('/'), name.lastIndexOf('\\'));
        String base = name.substring(lastSeparator + 1).trim();
        if (base.isEmpty() || base.equals(".") || base.equals("..")) {
            return FALLBACK_FILE_NAME;
        }
        return base;
    }
}
使用 EmbeddedExtractor 提取所有嵌入对象可以像这样完成:
import java.io.FileInputStream;
import java.io.FileOutputStream;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.ss.extractor.EmbeddedExtractor;
import org.apache.poi.ss.extractor.EmbeddedData;
/**
 * Example program: uses {@code EmbeddedExtractor} to pull every embedded object out of the
 * first sheet of an Excel workbook and writes each one to a file in the current directory.
 * The workbook itself is then written back out to a second file.
 */
public class ExcelEmbeddedExtractor {

    /** File name used when an embedded object carries no usable file name. */
    private static final String FALLBACK_FILE_NAME = "embedded.bin";

    /**
     * Opens the input workbook, extracts all embedded objects from its first sheet
     * and writes the workbook to the output path.
     *
     * @param args unused
     * @throws Exception if the workbook cannot be read, extraction fails,
     *                   or a file cannot be written
     */
    public static void main(String[] args) throws Exception {
        // Swap the two lines below to run the example against an .xls workbook instead.
        String inFilePath = "./ExcelExampleIn.xlsx"; String outFilePath = "./ExcelExampleOut.xlsx";
        //String inFilePath = "./ExcelExampleIn.xls"; String outFilePath = "./ExcelExampleOut.xls";

        // The input stream is a managed resource of its own so that its file handle is
        // released even if WorkbookFactory.create fails or does not close the stream itself.
        try (FileInputStream in = new FileInputStream(inFilePath);
                Workbook workbook = WorkbookFactory.create(in);
                FileOutputStream out = new FileOutputStream(outFilePath)) {
            Sheet sheet = workbook.getSheetAt(0);
            EmbeddedExtractor extractor = new EmbeddedExtractor();
            for (EmbeddedData embeddedData : extractor.extractAll(sheet)) {
                Shape shape = embeddedData.getShape();
                System.out.println(shape);
                String filename = embeddedData.getFilename();
                System.out.println(filename);
                byte[] data = embeddedData.getEmbeddedData(); // data now contains the embedded data
                // The file name comes from inside the workbook, so it is reduced to a bare
                // file name before being used as a path: it must not point outside "./".
                try (FileOutputStream dataOut = new FileOutputStream("./" + baseName(filename))) {
                    dataOut.write(data);
                }
            }
            workbook.write(out);
        }
    }

    /**
     * Strips any directory part ('/' or '\\' separated) from {@code name}; returns a fallback
     * name when nothing usable remains.
     */
    private static String baseName(String name) {
        if (name == null) {
            return FALLBACK_FILE_NAME;
        }
        int lastSeparator = Math.max(name.lastIndexOf('/'), name.lastIndexOf('\\'));
        String base = name.substring(lastSeparator + 1).trim();
        if (base.isEmpty() || base.equals(".") || base.equals("..")) {
            return FALLBACK_FILE_NAME;
        }
        return base;
    }
}
我现在正在使用 Apache POI 从 Excel 文件中提取附件,这是我的部分代码。
// Look up the sheet by name and fetch its drawing container (null when the sheet has no drawings).
Sheet sheetAt = workbook.getSheet(sheetName);
Drawing<?> drawingPatriarch = sheetAt.getDrawingPatriarch();
Iterator<?> shapes = (drawingPatriarch != null) ? drawingPatriarch.iterator() : null;
// Only the first shape, if there is one, is inspected.
if (shapes != null && shapes.hasNext()) {
    Object first = shapes.next();
    // ObjectData is tested before Picture; the order of these checks is kept deliberately.
    if (first instanceof ObjectData) {
        // Write the object's data to outputPath exactly as returned.
        IOUtil.write(((ObjectData) first).getObjectData(), outputPath);
    } else if (first instanceof Picture) {
        // Write the picture's data to outputPath exactly as returned.
        IOUtil.write(((Picture) first).getData(), outputPath);
    }
}
当附件为二进制类型(如 exe、png 等)时,用这种方式提取的文件是正常的;但如果附件为文本类型(如 txt、pdf 等),提取出来的文件就无法正常打开。查看其二进制内容,发现除了原文件的数据之外还有很多多余的数据。请问应该如何解析这些数据?
我怀疑你的 `ObjectData` 是 `oleObject` 或 `Objekt-Manager-Shellobjekt` 类型。这些对象存储在嵌入的 `oleObject*.bin` 文件中。这些文件有自己的文件系统,需要先读取该文件系统:首先获取 `DirectoryEntry` 和 `DirectoryNode`,然后由它创建 `Ole10Native`。有了它,就可以通过 `byte[] data = ole10Native.getDataBuffer()` 获得文件数据。
完整示例:
import java.io.FileInputStream;
import java.io.FileOutputStream;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.poifs.filesystem.*;
/**
 * Example program: walks the shapes on the first sheet of an Excel workbook and, for every
 * embedded object whose OLE2 class name is "Objekt-Manager-Shellobjekt", unwraps the
 * {@code Ole10Native} payload and writes it to a file in the current directory.
 * The workbook itself is then written back out to a second file.
 */
public class ExcelGetObjectData {

    /** OLE2 class name of the embedded objects this example knows how to unwrap. */
    private static final String PACKAGE_CLASS_NAME = "Objekt-Manager-Shellobjekt";

    /** File name used when an embedded object carries no usable label. */
    private static final String FALLBACK_FILE_NAME = "embedded.bin";

    /**
     * Opens the input workbook, extracts the supported embedded objects from its first sheet
     * and writes the workbook to the output path.
     *
     * @param args unused
     * @throws Exception if the workbook cannot be read, an embedded object cannot be parsed,
     *                   or a file cannot be written
     */
    public static void main(String[] args) throws Exception {
        // Swap the two lines below to run the example against an .xlsx workbook instead.
        //String inFilePath = "./ExcelExampleIn.xlsx"; String outFilePath = "./ExcelExampleOut.xlsx";
        String inFilePath = "./ExcelExampleIn.xls"; String outFilePath = "./ExcelExampleOut.xls";

        // The input stream is a managed resource of its own so that its file handle is
        // released even if WorkbookFactory.create fails or does not close the stream itself.
        try (FileInputStream in = new FileInputStream(inFilePath);
                Workbook workbook = WorkbookFactory.create(in);
                FileOutputStream out = new FileOutputStream(outFilePath)) {
            Sheet sheet = workbook.getSheetAt(0);
            Drawing<?> drawingPatriarch = sheet.getDrawingPatriarch();
            if (drawingPatriarch != null) {
                for (Shape shape : drawingPatriarch) {
                    System.out.println(shape);
                    if (shape instanceof ObjectData) {
                        extractObjectData((ObjectData) shape);
                    }
                    // Other shape types could be handled here.
                }
            }
            workbook.write(out);
        }
    }

    /**
     * Prints the object's metadata and, when it is a supported embedded package,
     * writes the embedded payload to {@code ./<label>}.
     */
    private static void extractObjectData(ObjectData objectData) throws Exception {
        String className = objectData.getOLE2ClassName();
        System.out.println(objectData.getFileName());
        System.out.println(className);
        System.out.println(objectData.getContentType());

        // Constant-first comparison: a null class name is simply "not supported" instead of an NPE.
        if (!PACKAGE_CLASS_NAME.equals(className)) {
            // Other OLE2 class names could be handled here.
            return;
        }
        if (!objectData.hasDirectoryEntry()) {
            return;
        }
        DirectoryEntry directory = objectData.getDirectory();
        if (!(directory instanceof DirectoryNode)) {
            return;
        }

        Ole10Native ole10Native = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) directory);
        System.out.println(ole10Native.getCommand());
        System.out.println(ole10Native.getFileName());
        System.out.println(ole10Native.getLabel());

        byte[] data = ole10Native.getDataBuffer(); // data now contains the embedded data
        // The label comes from inside the workbook, so it is reduced to a bare file name
        // before being used as a path: it must not be able to point outside "./".
        String targetName = baseName(ole10Native.getLabel());
        try (FileOutputStream dataOut = new FileOutputStream("./" + targetName)) {
            dataOut.write(data);
        }
    }

    /**
     * Strips any directory part ('/' or '\\' separated) from {@code name}; returns a fallback
     * name when nothing usable remains.
     */
    private static String baseName(String name) {
        if (name == null) {
            return FALLBACK_FILE_NAME;
        }
        int lastSeparator = Math.max(name.lastIndexOf('/'), name.lastIndexOf('\\'));
        String base = name.substring(lastSeparator + 1).trim();
        if (base.isEmpty() || base.equals(".") || base.equals("..")) {
            return FALLBACK_FILE_NAME;
        }
        return base;
    }
}
使用 EmbeddedExtractor 提取所有嵌入对象可以像这样完成:
import java.io.FileInputStream;
import java.io.FileOutputStream;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.ss.extractor.EmbeddedExtractor;
import org.apache.poi.ss.extractor.EmbeddedData;
/**
 * Example program: uses {@code EmbeddedExtractor} to pull every embedded object out of the
 * first sheet of an Excel workbook and writes each one to a file in the current directory.
 * The workbook itself is then written back out to a second file.
 */
public class ExcelEmbeddedExtractor {

    /** File name used when an embedded object carries no usable file name. */
    private static final String FALLBACK_FILE_NAME = "embedded.bin";

    /**
     * Opens the input workbook, extracts all embedded objects from its first sheet
     * and writes the workbook to the output path.
     *
     * @param args unused
     * @throws Exception if the workbook cannot be read, extraction fails,
     *                   or a file cannot be written
     */
    public static void main(String[] args) throws Exception {
        // Swap the two lines below to run the example against an .xls workbook instead.
        String inFilePath = "./ExcelExampleIn.xlsx"; String outFilePath = "./ExcelExampleOut.xlsx";
        //String inFilePath = "./ExcelExampleIn.xls"; String outFilePath = "./ExcelExampleOut.xls";

        // The input stream is a managed resource of its own so that its file handle is
        // released even if WorkbookFactory.create fails or does not close the stream itself.
        try (FileInputStream in = new FileInputStream(inFilePath);
                Workbook workbook = WorkbookFactory.create(in);
                FileOutputStream out = new FileOutputStream(outFilePath)) {
            Sheet sheet = workbook.getSheetAt(0);
            EmbeddedExtractor extractor = new EmbeddedExtractor();
            for (EmbeddedData embeddedData : extractor.extractAll(sheet)) {
                Shape shape = embeddedData.getShape();
                System.out.println(shape);
                String filename = embeddedData.getFilename();
                System.out.println(filename);
                byte[] data = embeddedData.getEmbeddedData(); // data now contains the embedded data
                // The file name comes from inside the workbook, so it is reduced to a bare
                // file name before being used as a path: it must not point outside "./".
                try (FileOutputStream dataOut = new FileOutputStream("./" + baseName(filename))) {
                    dataOut.write(data);
                }
            }
            workbook.write(out);
        }
    }

    /**
     * Strips any directory part ('/' or '\\' separated) from {@code name}; returns a fallback
     * name when nothing usable remains.
     */
    private static String baseName(String name) {
        if (name == null) {
            return FALLBACK_FILE_NAME;
        }
        int lastSeparator = Math.max(name.lastIndexOf('/'), name.lastIndexOf('\\'));
        String base = name.substring(lastSeparator + 1).trim();
        if (base.isEmpty() || base.equals(".") || base.equals("..")) {
            return FALLBACK_FILE_NAME;
        }
        return base;
    }
}