无法计算从非空 XHTML 生成的 docx 文件中的字符数
Cannot count the number of characters in a docx file generated from non-empty XHTML
我使用 DocX4J 实现了一个到 DocX 的 XHTML 转换器。它可以毫无问题地创建 DocX 文件。
为了完成我的任务,我决定编写一个简单的测试。测试的内容是计算所生成 DocX 中的字符数,然后将其与 XHTML 中已知的字符数进行比较(参见下面的源代码)。
我的测试代码基于 DocX4J 站点的示例,但对我不起作用。虽然我可以看到转换器创建的 DocX 的内容与 XHTML 文件的内容一致,但我的测试代码得到的 DocX 文件字符数总是零。 :-\
有谁能帮我找出这个意外结果的原因吗?
提前致谢!
package main;
import java.io.File;
import java.io.IOException;
import java.io.StringWriter;
import org.docx4j.TextUtils;
import org.docx4j.jaxb.Context;
import org.docx4j.openpackaging.contenttype.ContentType;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.exceptions.InvalidFormatException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.PartName;
import org.docx4j.openpackaging.parts.WordprocessingML.AlternativeFormatInputPart;
import org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart;
import org.docx4j.relationships.Relationship;
import org.docx4j.wml.CTAltChunk;
import org.docx4j.wml.Document;
/**
 * Counts the characters of a docx package that was generated from XHTML with
 * docx4j and compares the result with a known expected value.
 *
 * @author Cláudio
 */
public class CountChars {

    /**
     * Builds a docx from a small XHTML table, saves it as {@code test.docx} and
     * prints "Success" when the extracted character count equals the expected
     * one, "Fail" otherwise.
     *
     * @param args
     *            unused
     */
    public static void main(String[] args) {
        String xhtml = "<html><body><table border=\"1\"><tr><td>Propriedade</td><td>Amostra 1</td><td>Amostra 2</td></tr><tr><td>Prop1</td><td>10.0</td><td>111.0</td></tr><tr><td>Prop2</td><td>20.0</td><td>222.0</td></tr></table></body></html>";
        // Sum of the lengths of the nine cell texts in the table above.
        int expectedNChars = 57;
        WordprocessingMLPackage docx = export(xhtml);
        try {
            // Proves that docx is successfully created
            docx.save(new File("test.docx"));
        } catch (Docx4JException e) {
            // Saving is only a visual aid; keep going so the count is still
            // checked.
            e.printStackTrace();
        }
        if (countCharacters(docx) == expectedNChars) {
            System.out.println("Success");
        } else {
            System.out.println("Fail");
        }
    }

    /**
     * Creates a new docx package that embeds the given XHTML as an altChunk.
     *
     * The XHTML bytes are stored in an {@link AlternativeFormatInputPart}; the
     * main document part only receives a {@link CTAltChunk} element that refers
     * to that part through its relationship id.
     *
     * @param xhtml
     *            XHTML markup to embed
     *
     * @return the package containing the altChunk
     *
     * @throws IllegalStateException
     *             if the package or the alternative format part cannot be
     *             created
     */
    private static WordprocessingMLPackage export(String xhtml) {
        try {
            WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.createPackage();
            MainDocumentPart mainPart = wordMLPackage.getMainDocumentPart();

            AlternativeFormatInputPart afiPart = new AlternativeFormatInputPart(new PartName("/hw.html"));
            afiPart.setBinaryData(xhtml.getBytes());
            afiPart.setContentType(new ContentType("text/html"));
            Relationship altChunkRel = mainPart.addTargetPart(afiPart);

            // .. the bit in document body
            CTAltChunk ac = Context.getWmlObjectFactory().createCTAltChunk();
            ac.setId(altChunkRel.getId());
            mainPart.addObject(ac);

            // .. content type
            wordMLPackage.getContentTypeManager().addDefaultContentType("html", "text/html");
            return wordMLPackage;
        } catch (InvalidFormatException e) {
            // Fail here with the real cause instead of continuing with null
            // references and hitting a NullPointerException further down.
            throw new IllegalStateException("Could not build the docx package from the XHTML", e);
        }
    }

    /**
     * Counts chars (even whitespaces) in a docx.
     *
     * NOTE: only the text held by the main document part is extracted. Content
     * that was added as an altChunk lives in a separate part and is therefore
     * not part of the count.
     *
     * Reference:
     * http://www.docx4java.org/forums/docx-java-f6/how-to-count-number-of-characters-in-a-docx-file-t767.html
     *
     * @param docx
     *            Document
     *
     * @return Number of chars in the document
     *
     * @throws IllegalStateException
     *             if the text cannot be extracted
     */
    private static int countCharacters(WordprocessingMLPackage docx) {
        Document wmlDocument = docx.getMainDocumentPart().getJaxbElement();
        // StringWriter is purely in-memory, so there is nothing to close.
        StringWriter strWriter = new StringWriter();
        try {
            TextUtils.extractText(wmlDocument, strWriter);
        } catch (Exception e) {
            throw new IllegalStateException("Could not extract text from the main document part", e);
        }
        return strWriter.toString().length();
    }
}
您是把 XHTML 作为 AlternativeFormatInputPart (AFIP) 添加进去的,这种方式通常依赖 Word 来把 XHTML 转换为真正的 docx 内容。
在此之前,XHTML 内容并不在 MainDocumentPart documentPart 中,而是在 AFIP 中。所以统计 documentPart 中的字符数当然得不到您期望的结果...
使用 docx4j 2.8.1 时,export 方法的正确实现应该如下:
/**
 * Creates a new docx package whose main document part contains the given
 * XHTML converted to docx content.
 *
 * The objects returned by {@code XHTMLImporter.convert} are appended to the
 * content list of the main document part, so the text is part of the main
 * document itself.
 *
 * @param xhtml
 *            XHTML markup to convert
 *
 * @return the package holding the converted content
 *
 * @throws IllegalStateException
 *             if the package cannot be created or the XHTML cannot be
 *             converted
 */
private static WordprocessingMLPackage export(String xhtml) {
    try {
        WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.createPackage();
        List<Object> content = XHTMLImporter.convert(xhtml, null, wordMLPackage);
        wordMLPackage.getMainDocumentPart().getContent().addAll(content);
        return wordMLPackage;
    } catch (Docx4JException e) {
        // Surface the failure instead of handing back a null or half-built
        // package that would only break later in the caller.
        throw new IllegalStateException("Could not convert the XHTML into docx content", e);
    }
}
我使用 DocX4J 实现了一个到 DocX 的 XHTML 转换器。它可以毫无问题地创建 DocX 文件。
为了完成我的任务,我决定编写一个简单的测试。测试的内容是计算所生成 DocX 中的字符数,然后将其与 XHTML 中已知的字符数进行比较(参见下面的源代码)。
我的测试代码基于 DocX4J 站点的示例,但对我不起作用。虽然我可以看到转换器创建的 DocX 的内容与 XHTML 文件的内容一致,但我的测试代码得到的 DocX 文件字符数总是零。 :-\
有谁能帮我找出这个意外结果的原因吗?
提前致谢!
package main;
import java.io.File;
import java.io.IOException;
import java.io.StringWriter;
import org.docx4j.TextUtils;
import org.docx4j.jaxb.Context;
import org.docx4j.openpackaging.contenttype.ContentType;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.exceptions.InvalidFormatException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.PartName;
import org.docx4j.openpackaging.parts.WordprocessingML.AlternativeFormatInputPart;
import org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart;
import org.docx4j.relationships.Relationship;
import org.docx4j.wml.CTAltChunk;
import org.docx4j.wml.Document;
/**
 * Counts the characters of a docx package that was generated from XHTML with
 * docx4j and compares the result with a known expected value.
 *
 * @author Cláudio
 */
public class CountChars {

    /**
     * Builds a docx from a small XHTML table, saves it as {@code test.docx} and
     * prints "Success" when the extracted character count equals the expected
     * one, "Fail" otherwise.
     *
     * @param args
     *            unused
     */
    public static void main(String[] args) {
        String xhtml = "<html><body><table border=\"1\"><tr><td>Propriedade</td><td>Amostra 1</td><td>Amostra 2</td></tr><tr><td>Prop1</td><td>10.0</td><td>111.0</td></tr><tr><td>Prop2</td><td>20.0</td><td>222.0</td></tr></table></body></html>";
        // Sum of the lengths of the nine cell texts in the table above.
        int expectedNChars = 57;
        WordprocessingMLPackage docx = export(xhtml);
        try {
            // Proves that docx is successfully created
            docx.save(new File("test.docx"));
        } catch (Docx4JException e) {
            // Saving is only a visual aid; keep going so the count is still
            // checked.
            e.printStackTrace();
        }
        if (countCharacters(docx) == expectedNChars) {
            System.out.println("Success");
        } else {
            System.out.println("Fail");
        }
    }

    /**
     * Creates a new docx package that embeds the given XHTML as an altChunk.
     *
     * The XHTML bytes are stored in an {@link AlternativeFormatInputPart}; the
     * main document part only receives a {@link CTAltChunk} element that refers
     * to that part through its relationship id.
     *
     * @param xhtml
     *            XHTML markup to embed
     *
     * @return the package containing the altChunk
     *
     * @throws IllegalStateException
     *             if the package or the alternative format part cannot be
     *             created
     */
    private static WordprocessingMLPackage export(String xhtml) {
        try {
            WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.createPackage();
            MainDocumentPart mainPart = wordMLPackage.getMainDocumentPart();

            AlternativeFormatInputPart afiPart = new AlternativeFormatInputPart(new PartName("/hw.html"));
            afiPart.setBinaryData(xhtml.getBytes());
            afiPart.setContentType(new ContentType("text/html"));
            Relationship altChunkRel = mainPart.addTargetPart(afiPart);

            // .. the bit in document body
            CTAltChunk ac = Context.getWmlObjectFactory().createCTAltChunk();
            ac.setId(altChunkRel.getId());
            mainPart.addObject(ac);

            // .. content type
            wordMLPackage.getContentTypeManager().addDefaultContentType("html", "text/html");
            return wordMLPackage;
        } catch (InvalidFormatException e) {
            // Fail here with the real cause instead of continuing with null
            // references and hitting a NullPointerException further down.
            throw new IllegalStateException("Could not build the docx package from the XHTML", e);
        }
    }

    /**
     * Counts chars (even whitespaces) in a docx.
     *
     * NOTE: only the text held by the main document part is extracted. Content
     * that was added as an altChunk lives in a separate part and is therefore
     * not part of the count.
     *
     * Reference:
     * http://www.docx4java.org/forums/docx-java-f6/how-to-count-number-of-characters-in-a-docx-file-t767.html
     *
     * @param docx
     *            Document
     *
     * @return Number of chars in the document
     *
     * @throws IllegalStateException
     *             if the text cannot be extracted
     */
    private static int countCharacters(WordprocessingMLPackage docx) {
        Document wmlDocument = docx.getMainDocumentPart().getJaxbElement();
        // StringWriter is purely in-memory, so there is nothing to close.
        StringWriter strWriter = new StringWriter();
        try {
            TextUtils.extractText(wmlDocument, strWriter);
        } catch (Exception e) {
            throw new IllegalStateException("Could not extract text from the main document part", e);
        }
        return strWriter.toString().length();
    }
}
您是把 XHTML 作为 AlternativeFormatInputPart (AFIP) 添加进去的,这种方式通常依赖 Word 来把 XHTML 转换为真正的 docx 内容。
在此之前,XHTML 内容并不在 MainDocumentPart documentPart 中,而是在 AFIP 中。所以统计 documentPart 中的字符数当然得不到您期望的结果...
使用 docx4j 2.8.1 时,export 方法的正确实现应该如下:
/**
 * Creates a new docx package whose main document part contains the given
 * XHTML converted to docx content.
 *
 * The objects returned by {@code XHTMLImporter.convert} are appended to the
 * content list of the main document part, so the text is part of the main
 * document itself.
 *
 * @param xhtml
 *            XHTML markup to convert
 *
 * @return the package holding the converted content
 *
 * @throws IllegalStateException
 *             if the package cannot be created or the XHTML cannot be
 *             converted
 */
private static WordprocessingMLPackage export(String xhtml) {
    try {
        WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.createPackage();
        List<Object> content = XHTMLImporter.convert(xhtml, null, wordMLPackage);
        wordMLPackage.getMainDocumentPart().getContent().addAll(content);
        return wordMLPackage;
    } catch (Docx4JException e) {
        // Surface the failure instead of handing back a null or half-built
        // package that would only break later in the caller.
        throw new IllegalStateException("Could not convert the XHTML into docx content", e);
    }
}