Apache POI - 将 Word 文档 (docx) 拆分为页面
Apache POI - Split Word document (docx) to pages
我一直在尝试根据预定义的标准将一个 docx 文档拆分成多个文档。以下是我按段落进行拆分的方法:
try {
FileInputStream in = new FileInputStream(file);
XWPFDocument doc = new XWPFDocument(in);
List<XWPFParagraph> paragraphs = doc.getParagraphs();
for (int idx = 0; idx < paragraphs.size(); idx++) {
XWPFDocument outputDocument = new XWPFDocument();
createParagraphInAnotherDocument(outputDocument, paragraphs.get(idx).getText());
String fullPath = String.format("./content/output/%1$s_%2$s_%3d.docx", FileUtils.getFileName(file), getName(), idx);
FileOutputStream outputStream = new FileOutputStream(fullPath);
outputDocument.write(outputStream);
outputDocument.close();
doc.close();
}
} catch (IOException e) {
e.printStackTrace();
}
虽然上面的代码可以提取段落,但我找不到按页提取的方法。我的理解是,Word 中的分页属于渲染层面的问题(render concern),是在 Word 应用程序运行时才确定的。
据我所知,唯一的方法是查询 Word 文档的 DOM 模型,然后确定每页上有多少个段落。以下是该问题的一种可能的解决方案(仅当页面由显式分页符分隔时才有效):
/**
 * Splits C:/TestDoc.docx into one output document per page and writes them as
 * C:/TestDocOutput0.docx, C:/TestDocOutput1.docx, and so on.
 *
 * Page boundaries come from getParagraphCountPerPage; one file is written per
 * entry of that list. Only the plain text of each paragraph is copied.
 *
 * @param args unused
 */
public static void main(String[] args) {
    //Input Word Document
    File file = new File("C:/TestDoc.docx");
    // try-with-resources closes the stream and the document even when opening
    // fails half-way; the original finally block dereferenced a null document
    // in that case and never closed the stream.
    try (FileInputStream in = new FileInputStream(file);
         XWPFDocument doc = new XWPFDocument(in)) {
        //Determine how many paragraphs per page
        List<Integer> paragraphCountList = getParagraphCountPerPage(doc);
        List<XWPFParagraph> paragraphs = doc.getParagraphs();
        if (paragraphCountList == null || paragraphCountList.isEmpty()
                || paragraphs == null || paragraphs.isEmpty()) {
            return;
        }
        int totalParagraphs = paragraphs.size();
        int lastPage = paragraphCountList.size() - 1;
        int startIndex = 0;
        //Loop through the paragraph counts for each page
        for (int page = 0; page <= lastPage; page++) {
            // The last page takes every remaining paragraph; earlier pages are
            // clamped to the list size so an over-sized count cannot index past
            // the end of the paragraph list.
            int endIndex = (page == lastPage)
                    ? totalParagraphs
                    : Math.min(totalParagraphs, startIndex + paragraphCountList.get(page));
            //Get the paragraphs from the input Word document
            List<XWPFParagraph> pageParagraphs = new ArrayList<XWPFParagraph>();
            for (int j = startIndex; j < endIndex; j++) {
                if (paragraphs.get(j) != null) {
                    pageParagraphs.add(paragraphs.get(j));
                }
            }
            //Set the start point for the next set of paragraphs
            startIndex = endIndex;
            //Create a new Word Doc with the paragraph subset and write the file
            String outputPath = "C:/TestDocOutput" + page + ".docx";
            try (XWPFDocument outputDocument = new XWPFDocument();
                 FileOutputStream outputStream = new FileOutputStream(outputPath)) {
                createPageInAnotherDocument(outputDocument, pageParagraphs);
                outputDocument.write(outputStream);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Counts how many top-level body paragraphs (w:p) belong to each page, where
 * pages are delimited by explicit page breaks (a w:br with w:type="page"
 * inside a run).
 *
 * The paragraph containing a page break is counted on the page it ends. The
 * returned list always has at least one entry, and its entries sum to the
 * number of top-level w:p elements in the body.
 *
 * @param doc the document whose body XML is inspected
 * @return paragraph counts, one entry per page, in document order
 * @throws Exception if the body XML cannot be parsed
 */
private static List<Integer> getParagraphCountPerPage(XWPFDocument doc) throws Exception {
    List<Integer> paragraphCountList = new ArrayList<>();
    int paragraphCount = 0;
    Document domDoc = convertStringToDOM(doc.getDocument().getBody().toString());
    NodeList bodyChildren = domDoc.getDocumentElement().getChildNodes();
    for (int i = 0; i < bodyChildren.getLength(); i++) {
        Node bodyChild = bodyChildren.item(i);
        if (!"w:p".equals(bodyChild.getNodeName())) {
            continue;
        }
        paragraphCount++;
        NodeList paragraphChildren = bodyChild.getChildNodes();
        for (int k = 0; k < paragraphChildren.getLength(); k++) {
            Node run = paragraphChildren.item(k);
            if (!"w:r".equals(run.getNodeName())) {
                continue;
            }
            NodeList runChildren = run.getChildNodes();
            for (int m = 0; m < runChildren.getLength(); m++) {
                Node runChild = runChildren.item(m);
                if (!"w:br".equals(runChild.getNodeName())) {
                    continue;
                }
                // A w:br without w:type="page" is a line or column break and
                // must not start a new page.
                Node breakType = runChild.getAttributes().getNamedItem("w:type");
                if (breakType != null && "page".equals(breakType.getNodeValue())) {
                    paragraphCountList.add(paragraphCount);
                    paragraphCount = 0;
                }
            }
        }
    }
    // Remaining paragraphs form the last page; no extra +1, so the counts add
    // up to the real number of paragraphs.
    paragraphCountList.add(paragraphCount);
    return paragraphCountList;
}
/**
 * Parses an XML string into a DOM document.
 *
 * The parser is hardened against XXE: secure processing is enabled and any
 * DOCTYPE declaration is rejected, so external entities cannot be resolved.
 * The parser is not namespace-aware, so node names keep their prefixes.
 *
 * @param xmlData the XML text to parse
 * @return the parsed DOM document
 * @throws Exception if the parser cannot be configured, or the text is not
 *         well-formed XML or contains a DOCTYPE declaration
 */
private static Document convertStringToDOM(String xmlData) throws Exception {
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    // String literals are used instead of XMLConstants to avoid a new import.
    factory.setFeature("http://javax.xml.XMLConstants/feature/secure-processing", true);
    factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
    DocumentBuilder builder = factory.newDocumentBuilder();
    return builder.parse(new InputSource(new StringReader(xmlData)));
}
/**
 * Appends the text of each given paragraph to the output document, in list
 * order. Each paragraph is handed to addParagraphToDocument as the string
 * returned by getText().
 *
 * @param outputDocument the document that receives the paragraphs
 * @param pageParagraphs the source paragraphs making up one page
 * @throws IOException declared for callers; propagated from addParagraphToDocument
 */
private static void createPageInAnotherDocument(XWPFDocument outputDocument, List<XWPFParagraph> pageParagraphs) throws IOException {
    for (XWPFParagraph sourceParagraph : pageParagraphs) {
        String paragraphText = sourceParagraph.getText();
        addParagraphToDocument(outputDocument, paragraphText);
    }
}
/**
 * Creates a new paragraph in the output document, adds a single run to it and
 * sets that run's text.
 *
 * @param outputDocument the document that receives the new paragraph
 * @param text the text placed in the new paragraph's run
 * @throws IOException declared in the signature for callers
 */
private static void addParagraphToDocument(XWPFDocument outputDocument, String text) throws IOException {
    outputDocument.createParagraph().createRun().setText(text);
}
我一直在尝试根据预定义的标准将一个 docx 文档拆分成多个文档。以下是我按段落进行拆分的方法:
try {
FileInputStream in = new FileInputStream(file);
XWPFDocument doc = new XWPFDocument(in);
List<XWPFParagraph> paragraphs = doc.getParagraphs();
for (int idx = 0; idx < paragraphs.size(); idx++) {
XWPFDocument outputDocument = new XWPFDocument();
createParagraphInAnotherDocument(outputDocument, paragraphs.get(idx).getText());
String fullPath = String.format("./content/output/%1$s_%2$s_%3d.docx", FileUtils.getFileName(file), getName(), idx);
FileOutputStream outputStream = new FileOutputStream(fullPath);
outputDocument.write(outputStream);
outputDocument.close();
doc.close();
}
} catch (IOException e) {
e.printStackTrace();
}
虽然上面的代码可以提取段落,但我找不到按页提取的方法。我的理解是,Word 中的分页属于渲染层面的问题(render concern),是在 Word 应用程序运行时才确定的。
据我所知,唯一的方法是查询 Word 文档的 DOM 模型,然后确定每页上有多少个段落。以下是该问题的一种可能的解决方案(仅当页面由显式分页符分隔时才有效):
/**
 * Splits C:/TestDoc.docx into one output document per page and writes them as
 * C:/TestDocOutput0.docx, C:/TestDocOutput1.docx, and so on.
 *
 * Page boundaries come from getParagraphCountPerPage; one file is written per
 * entry of that list. Only the plain text of each paragraph is copied.
 *
 * @param args unused
 */
public static void main(String[] args) {
    //Input Word Document
    File file = new File("C:/TestDoc.docx");
    // try-with-resources closes the stream and the document even when opening
    // fails half-way; the original finally block dereferenced a null document
    // in that case and never closed the stream.
    try (FileInputStream in = new FileInputStream(file);
         XWPFDocument doc = new XWPFDocument(in)) {
        //Determine how many paragraphs per page
        List<Integer> paragraphCountList = getParagraphCountPerPage(doc);
        List<XWPFParagraph> paragraphs = doc.getParagraphs();
        if (paragraphCountList == null || paragraphCountList.isEmpty()
                || paragraphs == null || paragraphs.isEmpty()) {
            return;
        }
        int totalParagraphs = paragraphs.size();
        int lastPage = paragraphCountList.size() - 1;
        int startIndex = 0;
        //Loop through the paragraph counts for each page
        for (int page = 0; page <= lastPage; page++) {
            // The last page takes every remaining paragraph; earlier pages are
            // clamped to the list size so an over-sized count cannot index past
            // the end of the paragraph list.
            int endIndex = (page == lastPage)
                    ? totalParagraphs
                    : Math.min(totalParagraphs, startIndex + paragraphCountList.get(page));
            //Get the paragraphs from the input Word document
            List<XWPFParagraph> pageParagraphs = new ArrayList<XWPFParagraph>();
            for (int j = startIndex; j < endIndex; j++) {
                if (paragraphs.get(j) != null) {
                    pageParagraphs.add(paragraphs.get(j));
                }
            }
            //Set the start point for the next set of paragraphs
            startIndex = endIndex;
            //Create a new Word Doc with the paragraph subset and write the file
            String outputPath = "C:/TestDocOutput" + page + ".docx";
            try (XWPFDocument outputDocument = new XWPFDocument();
                 FileOutputStream outputStream = new FileOutputStream(outputPath)) {
                createPageInAnotherDocument(outputDocument, pageParagraphs);
                outputDocument.write(outputStream);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Counts how many top-level body paragraphs (w:p) belong to each page, where
 * pages are delimited by explicit page breaks (a w:br with w:type="page"
 * inside a run).
 *
 * The paragraph containing a page break is counted on the page it ends. The
 * returned list always has at least one entry, and its entries sum to the
 * number of top-level w:p elements in the body.
 *
 * @param doc the document whose body XML is inspected
 * @return paragraph counts, one entry per page, in document order
 * @throws Exception if the body XML cannot be parsed
 */
private static List<Integer> getParagraphCountPerPage(XWPFDocument doc) throws Exception {
    List<Integer> paragraphCountList = new ArrayList<>();
    int paragraphCount = 0;
    Document domDoc = convertStringToDOM(doc.getDocument().getBody().toString());
    NodeList bodyChildren = domDoc.getDocumentElement().getChildNodes();
    for (int i = 0; i < bodyChildren.getLength(); i++) {
        Node bodyChild = bodyChildren.item(i);
        if (!"w:p".equals(bodyChild.getNodeName())) {
            continue;
        }
        paragraphCount++;
        NodeList paragraphChildren = bodyChild.getChildNodes();
        for (int k = 0; k < paragraphChildren.getLength(); k++) {
            Node run = paragraphChildren.item(k);
            if (!"w:r".equals(run.getNodeName())) {
                continue;
            }
            NodeList runChildren = run.getChildNodes();
            for (int m = 0; m < runChildren.getLength(); m++) {
                Node runChild = runChildren.item(m);
                if (!"w:br".equals(runChild.getNodeName())) {
                    continue;
                }
                // A w:br without w:type="page" is a line or column break and
                // must not start a new page.
                Node breakType = runChild.getAttributes().getNamedItem("w:type");
                if (breakType != null && "page".equals(breakType.getNodeValue())) {
                    paragraphCountList.add(paragraphCount);
                    paragraphCount = 0;
                }
            }
        }
    }
    // Remaining paragraphs form the last page; no extra +1, so the counts add
    // up to the real number of paragraphs.
    paragraphCountList.add(paragraphCount);
    return paragraphCountList;
}
/**
 * Parses an XML string into a DOM document.
 *
 * The parser is hardened against XXE: secure processing is enabled and any
 * DOCTYPE declaration is rejected, so external entities cannot be resolved.
 * The parser is not namespace-aware, so node names keep their prefixes.
 *
 * @param xmlData the XML text to parse
 * @return the parsed DOM document
 * @throws Exception if the parser cannot be configured, or the text is not
 *         well-formed XML or contains a DOCTYPE declaration
 */
private static Document convertStringToDOM(String xmlData) throws Exception {
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    // String literals are used instead of XMLConstants to avoid a new import.
    factory.setFeature("http://javax.xml.XMLConstants/feature/secure-processing", true);
    factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
    DocumentBuilder builder = factory.newDocumentBuilder();
    return builder.parse(new InputSource(new StringReader(xmlData)));
}
/**
 * Appends the text of each given paragraph to the output document, in list
 * order. Each paragraph is handed to addParagraphToDocument as the string
 * returned by getText().
 *
 * @param outputDocument the document that receives the paragraphs
 * @param pageParagraphs the source paragraphs making up one page
 * @throws IOException declared for callers; propagated from addParagraphToDocument
 */
private static void createPageInAnotherDocument(XWPFDocument outputDocument, List<XWPFParagraph> pageParagraphs) throws IOException {
    for (XWPFParagraph sourceParagraph : pageParagraphs) {
        String paragraphText = sourceParagraph.getText();
        addParagraphToDocument(outputDocument, paragraphText);
    }
}
/**
 * Creates a new paragraph in the output document, adds a single run to it and
 * sets that run's text.
 *
 * @param outputDocument the document that receives the new paragraph
 * @param text the text placed in the new paragraph's run
 * @throws IOException declared in the signature for callers
 */
private static void addParagraphToDocument(XWPFDocument outputDocument, String text) throws IOException {
    outputDocument.createParagraph().createRun().setText(text);
}