将文档转换为 JSON 表示的 Docx4j 功能?
Docx4j functionality to turn a document into JSON representation?
有没有好的方法可以将文档转换成 JSON 表示形式,然后显示在网页上? (要求文档转为JSON)
如果没有内置的方法,我的想法是直接将 Run/Paragraph 结构表示为 JSON 对象,但我觉得一旦开始处理更复杂的 Word 文档,这种做法就行不通了。
如果您添加:
<dependency>
<groupId>com.fasterxml.jackson.dataformat</groupId>
<artifactId>jackson-dataformat-xml</artifactId>
<version>2.11.3</version>
</dependency>
您可以尝试类似的方法:
import org.docx4j.Docx4J;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.dataformat.xml.XmlMapper;
/**
 * Command-line sample that prints the main document part of a docx file as pretty-printed JSON.
 *
 * <p>The part's XML is parsed into a Jackson tree with {@link XmlMapper}, and that tree is then
 * serialized with a plain {@link ObjectMapper}.
 *
 * <p>NOTE(review): in a quick test some {@code w:p} elements were missing from the JSON output;
 * whether they are lost while reading the XML or while writing the JSON has not been verified.
 */
public class ConvertOutJSON {

    /** Path of the docx to convert, built from the {@code user.dir} system property. */
    static String inputfilepath = System.getProperty("user.dir") + "/sample-docs/sample-docxv2.docx";

    /**
     * Loads the docx at {@link #inputfilepath} and writes its JSON form to standard output.
     *
     * @param args unused
     * @throws Exception if loading, XML parsing, or JSON serialization fails
     */
    public static void main(String[] args) throws Exception {
        String documentXml = readMainDocumentXml(new java.io.File(inputfilepath));
        System.out.println(toPrettyJson(documentXml));
    }

    /** Opens the package and returns the XML of its main document part. */
    private static String readMainDocumentXml(java.io.File docx) throws Exception {
        WordprocessingMLPackage pkg = Docx4J.load(docx);
        return pkg.getMainDocumentPart().getXML();
    }

    /** Parses the XML into a Jackson tree and renders that tree as indented JSON. */
    private static String toPrettyJson(String xml) throws Exception {
        JsonNode tree = new XmlMapper().readTree(xml);
        // For compact output, call writeValueAsString(tree) directly on the ObjectMapper instead.
        return new ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(tree);
    }
}
但是在快速测试中,我注意到一些 w:p 节点没有出现在生成的 JSON 中。我没有查明它们是在 readTree 步骤还是在 ObjectMapper 写出输出时被 Jackson 丢弃的;您需要深入研究 Jackson 才能解决这个问题。
它当前正在生成如下输出:
{
"Ignorable" : "w14 wp14",
"body" : {
"p" : {
"rsidR" : "00D15781",
"rsidRDefault" : "00D15781",
"pPr" : {
"ind" : {
"left" : "0"
}
}
},
"tbl" : {
"tblPr" : {
"tblStyle" : {
"val" : "TableGrid"
},
"tblW" : {
"w" : "0",
"type" : "auto"
},
"tblLook" : {
"firstRow" : "1",
"lastRow" : "0",
"firstColumn" : "1",
"lastColumn" : "0",
"noHBand" : "0",
"noVBand" : "1",
"val" : "04A0"
}
},
"tblGrid" : {
"gridCol" : {
"w" : "3561"
}
},
"tr" : {
"rsidR" : "00D15781",
"tc" : {
"tcPr" : {
"tcW" : {
"w" : "7122",
"type" : "dxa"
},
"gridSpan" : {
"val" : "2"
}
},
"p" : {
"rsidR" : "00D15781",
"rsidRDefault" : "00945132",
"pPr" : {
"ind" : {
"left" : "0"
}
},
"r" : {
"t" : "Horizontal merge"
}
}
}
}
},
"sectPr" : {
"rsidR" : "00D15781",
"headerReference" : {
"type" : "default",
"id" : "rId12"
},
"pgSz" : {
"w" : "11907",
"h" : "16839",
"code" : "9"
},
"pgMar" : {
"top" : "720",
"right" : "720",
"bottom" : "720",
"left" : "720",
"header" : "720",
"footer" : "720",
"gutter" : "0"
},
"cols" : {
"space" : "720"
},
"docGrid" : {
"linePitch" : "360"
}
}
}
}
有没有好的方法可以将文档转换成 JSON 表示形式,然后显示在网页上? (要求文档转为JSON)
如果没有内置的方法,我的想法是直接将 Run/Paragraph 结构表示为 JSON 对象,但我觉得一旦开始处理更复杂的 Word 文档,这种做法就行不通了。
如果您添加:
<dependency>
<groupId>com.fasterxml.jackson.dataformat</groupId>
<artifactId>jackson-dataformat-xml</artifactId>
<version>2.11.3</version>
</dependency>
您可以尝试类似的方法:
import org.docx4j.Docx4J;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.dataformat.xml.XmlMapper;
/**
 * Command-line sample that prints the main document part of a docx file as pretty-printed JSON.
 *
 * <p>The part's XML is parsed into a Jackson tree with {@link XmlMapper}, and that tree is then
 * serialized with a plain {@link ObjectMapper}.
 *
 * <p>NOTE(review): in a quick test some {@code w:p} elements were missing from the JSON output;
 * whether they are lost while reading the XML or while writing the JSON has not been verified.
 */
public class ConvertOutJSON {

    /** Path of the docx to convert, built from the {@code user.dir} system property. */
    static String inputfilepath = System.getProperty("user.dir") + "/sample-docs/sample-docxv2.docx";

    /**
     * Loads the docx at {@link #inputfilepath} and writes its JSON form to standard output.
     *
     * @param args unused
     * @throws Exception if loading, XML parsing, or JSON serialization fails
     */
    public static void main(String[] args) throws Exception {
        String documentXml = readMainDocumentXml(new java.io.File(inputfilepath));
        System.out.println(toPrettyJson(documentXml));
    }

    /** Opens the package and returns the XML of its main document part. */
    private static String readMainDocumentXml(java.io.File docx) throws Exception {
        WordprocessingMLPackage pkg = Docx4J.load(docx);
        return pkg.getMainDocumentPart().getXML();
    }

    /** Parses the XML into a Jackson tree and renders that tree as indented JSON. */
    private static String toPrettyJson(String xml) throws Exception {
        JsonNode tree = new XmlMapper().readTree(xml);
        // For compact output, call writeValueAsString(tree) directly on the ObjectMapper instead.
        return new ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(tree);
    }
}
但是在快速测试中,我注意到一些 w:p 节点没有出现在生成的 JSON 中。我没有查明它们是在 readTree 步骤还是在 ObjectMapper 写出输出时被 Jackson 丢弃的;您需要深入研究 Jackson 才能解决这个问题。
它当前正在生成如下输出:
{
"Ignorable" : "w14 wp14",
"body" : {
"p" : {
"rsidR" : "00D15781",
"rsidRDefault" : "00D15781",
"pPr" : {
"ind" : {
"left" : "0"
}
}
},
"tbl" : {
"tblPr" : {
"tblStyle" : {
"val" : "TableGrid"
},
"tblW" : {
"w" : "0",
"type" : "auto"
},
"tblLook" : {
"firstRow" : "1",
"lastRow" : "0",
"firstColumn" : "1",
"lastColumn" : "0",
"noHBand" : "0",
"noVBand" : "1",
"val" : "04A0"
}
},
"tblGrid" : {
"gridCol" : {
"w" : "3561"
}
},
"tr" : {
"rsidR" : "00D15781",
"tc" : {
"tcPr" : {
"tcW" : {
"w" : "7122",
"type" : "dxa"
},
"gridSpan" : {
"val" : "2"
}
},
"p" : {
"rsidR" : "00D15781",
"rsidRDefault" : "00945132",
"pPr" : {
"ind" : {
"left" : "0"
}
},
"r" : {
"t" : "Horizontal merge"
}
}
}
}
},
"sectPr" : {
"rsidR" : "00D15781",
"headerReference" : {
"type" : "default",
"id" : "rId12"
},
"pgSz" : {
"w" : "11907",
"h" : "16839",
"code" : "9"
},
"pgMar" : {
"top" : "720",
"right" : "720",
"bottom" : "720",
"left" : "720",
"header" : "720",
"footer" : "720",
"gutter" : "0"
},
"cols" : {
"space" : "720"
},
"docGrid" : {
"linePitch" : "360"
}
}
}
}