如何使用 docx4j（Java）按指定字符串拆分 docx 文件？

Question

我是 Docx4j 的新手，需要帮助使用 docx4j Java 根据字符串拆分 docx 文件，以便将输出写入多个文件。

我曾尝试用 Apache POI 实现同样的功能，并且得到了输出；但在将输出转换为 HTML 时遇到了样式丢失的问题。之后我补充了复制样式的代码，问题依然存在。

下面是使用 apache poi 的代码：

// Index of the element most recently appended to a destination document;
// it is recomputed right before every setParagraph/setTable call.
public static int pos = 0;
    // Not referenced by any code visible in this snippet.
    public static int posc = 0;
    // Split marker: stays "n" until a paragraph containing "CONSTRUCTION DETAILS:"
    // is encountered, after which it is "y" for the rest of the run.
    public static String ind = "n";
    // Not referenced by any code visible in this snippet.
    final static int DEFAULT_FONT_SIZE = 10;

    /**
     * Splits one source .docx into a "-Product" and a "-Component" document.
     *
     * <p>Body elements are copied into the product document until a paragraph
     * containing {@code "CONSTRUCTION DETAILS:"} is found; that paragraph and
     * everything after it go into the component document.
     *
     * <p>The source is looked up as {@code INPUT_DIR + name + "/" + name + ".docx"}
     * for each entry of {@code INPUT_DIR}. As in the original code, every entry
     * overwrites the previously computed paths, so only the last listed entry is
     * actually split.
     *
     * @param args unused
     * @throws FileNotFoundException if the source file cannot be opened or an
     *         output file cannot be created
     * @throws IOException if reading the source or writing an output fails
     * @throws XmlException declared for compatibility with existing callers
     */
    public static void main(String[] args) throws FileNotFoundException,
            IOException, XmlException {

        String inputDir = PropertyUtils.getProperty("INPUT_DIR");
        String[] files = new File(inputDir).list();

        // File.list() returns null when the path is missing or not a directory;
        // an empty listing would otherwise leave the source file unset below.
        if (files == null || files.length == 0) {
            System.out.println("The directory is empty");
            return;
        }

        File file = null;
        File outfilep = null;
        File outfilec = null;

        for (String aFile : files) {
            System.out.println(aFile);
            String base = inputDir + aFile + "/" + aFile;
            file = new File(base + ".docx");
            outfilep = new File(base + "-Product.docx");
            outfilec = new File(base + "-Component.docx");
        }

        XWPFDocument destDoc = new XWPFDocument();
        XWPFDocument destDocc = new XWPFDocument();

        // Close the input stream even when parsing or copying fails.
        try (FileInputStream in = new FileInputStream(file)) {
            XWPFDocument doc = new XWPFDocument(in);

            // NOTE(review): the page layout is only copied to the product document,
            // as in the original code - confirm whether the component document
            // should receive it as well.
            copyLayout(doc, destDoc);

            for (IBodyElement bodyElement : doc.getBodyElements()) {
                BodyElementType elementType = bodyElement.getElementType();

                if (elementType == BodyElementType.PARAGRAPH) {
                    XWPFParagraph pr = (XWPFParagraph) bodyElement;

                    if (pr.getText().contains("CONSTRUCTION DETAILS:")) {
                        ind = "y";
                        System.out.println("ind is Y++++++++++++");
                    }

                    // equals() instead of ==: reference comparison only worked
                    // because both operands happened to be interned literals.
                    appendParagraph(doc, "n".equals(ind) ? destDoc : destDocc, pr);
                } else if (elementType == BodyElementType.TABLE) {
                    appendTable(doc, "n".equals(ind) ? destDoc : destDocc,
                            (XWPFTable) bodyElement);
                }
            }
        }

        // Open the outputs only once both documents are complete, so a failure
        // while copying does not leave truncated files behind.
        try (OutputStream out = new FileOutputStream(outfilep);
                OutputStream outc = new FileOutputStream(outfilec)) {
            destDoc.write(out);
            destDocc.write(outc);
        }
    }

    // Resolves a style id against the source document; null when the document
    // has no styles or the element carries no style id.
    private static XWPFStyle findStyle(XWPFDocument src, String styleId) {
        if (styleId == null || src.getStyles() == null) {
            return null;
        }
        return src.getStyles().getStyle(styleId);
    }

    // Appends the paragraph (and the styles it uses) to the target document
    // after normalizing its spacing to single-line with no space before/after.
    private static void appendParagraph(XWPFDocument src, XWPFDocument target,
            XWPFParagraph pr) {
        copyStyle(src, target, findStyle(src, pr.getStyleID()));

        // Create a placeholder paragraph, then overwrite it with the source one.
        target.createParagraph().createRun();
        pos = target.getParagraphs().size() - 1;

        CTPPr ppr = pr.getCTP().getPPr();
        if (ppr == null) {
            ppr = pr.getCTP().addNewPPr();
        }
        CTSpacing spacing = ppr.isSetSpacing() ? ppr.getSpacing() : ppr.addNewSpacing();
        spacing.setAfter(BigInteger.valueOf(0));
        spacing.setBefore(BigInteger.valueOf(0));
        spacing.setLineRule(STLineSpacingRule.AUTO);
        spacing.setLine(BigInteger.valueOf(240));

        target.setParagraph(pr, pos);
    }

    // Appends the table (and the styles it uses) to the target document.
    private static void appendTable(XWPFDocument src, XWPFDocument target,
            XWPFTable table) {
        copyStyle(src, target, findStyle(src, table.getStyleID()));

        // Create a placeholder table, then overwrite it with the source one.
        target.createTable();
        pos = target.getTables().size() - 1;
        target.setTable(pos, table);
    }

    /**
     * Copies a paragraph or table style, together with every style returned by
     * {@code getUsedStyleList} for it, from the source document into the
     * destination document. Does nothing when the destination document or the
     * style is {@code null}.
     *
     * @param srcDoc  document the style belongs to
     * @param destDoc document that receives the styles
     * @param style   style to copy; may be {@code null}
     */
    private static void copyStyle(XWPFDocument srcDoc, XWPFDocument destDoc,
            XWPFStyle style) {
        if (destDoc != null && style != null) {
            // Create the styles container on demand when the destination has none.
            if (destDoc.getStyles() == null) {
                destDoc.createStyles();
            }

            for (XWPFStyle related : srcDoc.getStyles().getUsedStyleList(style)) {
                destDoc.getStyles().addStyle(related);
            }
        }
    }

      /**
       * Copies the page margins ({@code pgMar}) and page size ({@code pgSz}) from
       * the body-level section properties of the source document into the
       * destination document.
       *
       * <p>Both settings are written into a single destination {@code sectPr},
       * which is reused if it already exists. Settings that the source does not
       * define are skipped, and nothing is changed when the source has no
       * section properties at all.
       *
       * @param srcDoc  document to read the layout from
       * @param destDoc document to apply the layout to
       */
      private static void copyLayout(XWPFDocument srcDoc, XWPFDocument destDoc)
        {
            if (srcDoc.getDocument().getBody().getSectPr() == null) {
                return;
            }

            CTPageMar srcMargins = srcDoc.getDocument().getBody().getSectPr().getPgMar();
            CTPageSz srcPageSize = srcDoc.getDocument().getBody().getSectPr().getPgSz();
            if (srcMargins == null && srcPageSize == null) {
                return;
            }

            // Create the destination sectPr at most once: every addNewSectPr()
            // call appends another <w:sectPr> element to the body, so calling it
            // for the margins and again for the page size yields two of them.
            if (!destDoc.getDocument().getBody().isSetSectPr()) {
                destDoc.getDocument().getBody().addNewSectPr();
            }

            // The generated setters copy the whole element, i.e. all margin
            // attributes (top/right/bottom/left/header/footer/gutter) and all
            // page-size attributes (w/h/orient/code) that the source defines.
            if (srcMargins != null) {
                destDoc.getDocument().getBody().getSectPr().setPgMar(srcMargins);
            }
            if (srcPageSize != null) {
                destDoc.getDocument().getBody().getSectPr().setPgSz(srcPageSize);
            }
        }

Answer 1

以蛮力方式拆分 docx 很容易：您可以删除不需要的内容（段落等），然后保存结果。

这样，原始关系将保持不变，但您的 docx 容器可能会比需要的更大，因为它可能包含不再使用的图像等。

话虽如此，仍有一些需要注意的地方：

拆分点落在书签的开始标记和结束标记之间的情况（批注同理）
自动编号可能会给出错误的起始编号，除非您显式设置起始值

显然您可以编写代码来解决此类问题。

或者，如果使用我们的 docx4j 商业企业版，您可以通过它的 "merge" 代码指定所需的内容（例如第 X 到 Y 段），它会给您一个仅包含这些内容的 docx（即 docx 容器中没有无关的图像，被拆开的书签也已处理好，等等）。

Answer 2

希望这能解决问题。

/**
 * Splits a .docx file into two documents with docx4j.
 *
 * <p>Top-level content is copied into the first document up to and including
 * the element whose text equals {@code "CONSTRUCTION DETAILS:"} (ignoring case);
 * everything after that element goes into the second document.
 */
public class SplitUsingDocx4j {

    /** Text of the element that ends the first output document. */
    private static final String SPLIT_MARKER = "CONSTRUCTION DETAILS:";

    /**
     * Splits the document found under {@code INPUT_DIR} and saves the two parts
     * as {@code <name>-1.docx} and {@code <name>-2.docx} in the working directory.
     *
     * <p>The source is looked up as {@code INPUT_DIR + name + "/" + name + ".docx"}
     * for each entry of {@code INPUT_DIR}; every entry overwrites the previous
     * path, so only the last listed entry is actually split.
     *
     * @param args unused
     * @throws Docx4JException if loading the source or saving an output fails
     * @throws FileNotFoundException declared for compatibility with existing callers
     */
    public static void main(String[] args) throws Docx4JException,
            FileNotFoundException {
        String inputDir = PropertyUtils.getProperty("INPUT_DIR");
        String[] files = new File(inputDir).list();

        // File.list() returns null when the path is missing or not a directory;
        // an empty listing would otherwise leave the source file unset below.
        if (files == null || files.length == 0) {
            System.out.println("The directory is empty");
            return;
        }

        File file = null;
        for (String aFile : files) {
            System.out.println(aFile);
            file = new File(inputDir + aFile + "/" + aFile + ".docx");
        }

        // Creating new documents
        WordprocessingMLPackage doc1 = WordprocessingMLPackage.createPackage();
        WordprocessingMLPackage doc2 = WordprocessingMLPackage.createPackage();

        // Loading the existing document
        WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.load(file);
        MainDocumentPart sourcePart = wordMLPackage.getMainDocumentPart();

        // Copy the style definitions to both outputs; skip this when the source
        // carries no styles part so the outputs keep their default styles.
        StyleDefinitionsPart sdp = sourcePart.getStyleDefinitionsPart();
        if (sdp != null) {
            Styles sourceStyles = sdp.getJaxbElement();
            doc1.getMainDocumentPart().getStyleDefinitionsPart()
                    .setJaxbElement(sourceStyles);
            doc2.getMainDocumentPart().getStyleDefinitionsPart()
                    .setJaxbElement(sourceStyles);
        }

        boolean markerSeen = false;
        for (Object object : sourcePart.getContent()) {
            if (markerSeen) {
                doc2.getMainDocumentPart().addObject(object);
            } else {
                // The marker element itself still belongs to the first document.
                if (object.toString().equalsIgnoreCase(SPLIT_MARKER)) {
                    markerSeen = true;
                }
                doc1.getMainDocumentPart().addObject(object);
            }
        }

        String fileName = file.getName().replace(".docx", "");
        doc1.save(new File(fileName + "-1.docx"));
        doc2.save(new File(fileName + "-2.docx"));
    }
}

需要使用 docx4j Java 基于字符串拆分 docx 文件？

Need to split docx file based on string using docx4j Java?

java

apache-poi

docx4j