需要使用 docx4j Java 基于字符串拆分 docx 文件?

Need to split docx file based on string using docx4j Java?

我是 Docx4j 的新手,需要帮助使用 docx4j Java 根据字符串拆分 docx 文件,以便将输出写入多个文件。

我尝试使用 Apache POI 做同样的事情并得到了输出,但是当试图将其转换为 HTML 时,遇到了样式丢失的问题,后来又添加了样式,仍然面临同样的问题。

下面是使用 apache poi 的代码:

// Index of the element most recently appended to a destination document;
// the splitting code overwrites it for every copied paragraph and table.
public static int pos = 0;
    // Not referenced by the code visible in this snippet.
    public static int posc = 0;
    // Split flag: stays "n" until a paragraph containing "CONSTRUCTION DETAILS:"
    // is seen, then becomes "y" and later elements go to the component document.
    public static String ind = "n";
    // Not referenced by the code visible in this snippet.
    final static int DEFAULT_FONT_SIZE = 10;

    /**
     * Splits every input document into a "-Product" and a "-Component" file.
     *
     * <p>Each entry {@code X} of the directory named by the {@code INPUT_DIR}
     * property is expected to hold {@code X/X.docx}. Body elements that come
     * before the first paragraph containing "CONSTRUCTION DETAILS:" are written
     * to {@code X/X-Product.docx}; that paragraph and everything after it are
     * written to {@code X/X-Component.docx}.
     *
     * @param args unused
     * @throws FileNotFoundException if an output file cannot be created
     * @throws IOException           if reading or writing a document fails
     * @throws XmlException          kept for signature compatibility
     */
    public static void main(String[] args) throws FileNotFoundException,
            IOException, XmlException {

        File dir = new File(PropertyUtils.getProperty("INPUT_DIR"));
        String[] files = dir.list();

        // File.list() returns null when the path is not a readable directory.
        if (files == null || files.length == 0) {
            System.out.println("The directory is empty");
            return;
        }

        for (String aFile : files) {
            System.out.println(aFile);
            String base = PropertyUtils.getProperty("INPUT_DIR") + aFile + "/"
                    + aFile;
            File file = new File(base + ".docx");

            // Stray entries without the expected document are skipped so that
            // they do not abort the remaining entries.
            if (!file.isFile()) {
                System.out.println("Skipping " + aFile + ": " + file.getPath()
                        + " not found");
                continue;
            }

            splitDocument(file, new File(base + "-Product.docx"), new File(
                    base + "-Component.docx"));
        }
    }

    // Splits one source document at the "CONSTRUCTION DETAILS:" paragraph and writes both halves.
    private static void splitDocument(File file, File outfilep, File outfilec)
            throws IOException, XmlException {

        XWPFDocument doc;
        try (FileInputStream in = new FileInputStream(file)) {
            doc = new XWPFDocument(in);
        }

        XWPFDocument destDoc = new XWPFDocument();
        // NOTE(review): only the product document receives the page layout,
        // as in the original code - confirm whether the component document
        // should get it too.
        copyLayout(doc, destDoc);
        XWPFDocument destDocc = new XWPFDocument();

        // The flag is static, so it must be reset for every document.
        ind = "n";

        for (IBodyElement bodyElement : doc.getBodyElements()) {
            BodyElementType elementType = bodyElement.getElementType();

            if (elementType == BodyElementType.PARAGRAPH) {
                XWPFParagraph pr = (XWPFParagraph) bodyElement;

                // The marker paragraph itself already belongs to the second half.
                if (pr.getText().contains("CONSTRUCTION DETAILS:")) {
                    ind = "y";
                    System.out.println("ind is Y++++++++++++");
                }
                appendParagraph(doc, "n".equals(ind) ? destDoc : destDocc, pr);

            } else if (elementType == BodyElementType.TABLE) {
                appendTable(doc, "n".equals(ind) ? destDoc : destDocc,
                        (XWPFTable) bodyElement);
            }
        }

        // Streams are opened only when there is something to write and are
        // always closed, even if write() fails.
        try (OutputStream out = new FileOutputStream(outfilep)) {
            destDoc.write(out);
        }
        try (OutputStream outc = new FileOutputStream(outfilec)) {
            destDocc.write(outc);
        }
    }

    // Normalises the spacing of a source paragraph and appends it (with its style) to dest.
    private static void appendParagraph(XWPFDocument doc, XWPFDocument dest,
            XWPFParagraph pr) {
        copyStyle(doc, dest, doc.getStyles().getStyle(pr.getStyleID()));

        // Create a placeholder paragraph, then overwrite it with the source one.
        dest.createParagraph().createRun();
        pos = dest.getParagraphs().size() - 1;

        CTPPr ppr = pr.getCTP().getPPr();
        if (ppr == null) {
            ppr = pr.getCTP().addNewPPr();
        }
        CTSpacing spacing = ppr.isSetSpacing() ? ppr.getSpacing() : ppr
                .addNewSpacing();
        spacing.setAfter(BigInteger.valueOf(0));
        spacing.setBefore(BigInteger.valueOf(0));
        spacing.setLineRule(STLineSpacingRule.AUTO);
        spacing.setLine(BigInteger.valueOf(240));

        dest.setParagraph(pr, pos);
    }

    // Appends a source table (with its style) to dest.
    private static void appendTable(XWPFDocument doc, XWPFDocument dest,
            XWPFTable table) {
        copyStyle(doc, dest, doc.getStyles().getStyle(table.getStyleID()));

        // Create a placeholder table, then overwrite it with the source one.
        dest.createTable();
        pos = dest.getTables().size() - 1;
        dest.setTable(pos, table);
    }

    /**
     * Adds to {@code destDoc} every style that the source document reports as
     * used by {@code style} (via {@code getUsedStyleList}). Used for both
     * paragraph and table styles.
     *
     * <p>Does nothing when {@code destDoc} or {@code style} is {@code null}.
     * A styles part is created on the destination first if it has none.
     */
    private static void copyStyle(XWPFDocument srcDoc, XWPFDocument destDoc,
            XWPFStyle style) {
        boolean nothingToCopy = (destDoc == null) || (style == null);
        if (nothingToCopy) {
            return;
        }

        // Make sure the destination can hold styles before adding any.
        if (null == destDoc.getStyles()) {
            destDoc.createStyles();
        }

        List<XWPFStyle> related = srcDoc.getStyles().getUsedStyleList(style);
        for (int i = 0; i < related.size(); i++) {
            destDoc.getStyles().addStyle(related.get(i));
        }
    }

      /**
       * Copies the page margins and page size of the source document's body
       * section properties into the destination document.
       *
       * <p>Both settings are written into one destination {@code <w:sectPr>}
       * element, which is created only if the destination has none. Optional
       * page-size attributes are copied only when the source sets them.
       * Nothing is copied when the source has no section properties.
       *
       * @param srcDoc  document to read the layout from
       * @param destDoc document to write the layout to
       */
      private static void copyLayout(XWPFDocument srcDoc, XWPFDocument destDoc)
        {
            // A source without section properties has no layout to copy.
            if (!srcDoc.getDocument().getBody().isSetSectPr()) {
                return;
            }

            // Every addNewSectPr() call appends another element, so create
            // one only if the destination body does not have one yet.
            if (!destDoc.getDocument().getBody().isSetSectPr()) {
                destDoc.getDocument().getBody().addNewSectPr();
            }

            CTPageMar pgMar = srcDoc.getDocument().getBody().getSectPr().getPgMar();
            if (pgMar != null) {
                CTPageMar destPgMar = destDoc.getDocument().getBody().getSectPr().isSetPgMar()
                        ? destDoc.getDocument().getBody().getSectPr().getPgMar()
                        : destDoc.getDocument().getBody().getSectPr().addNewPgMar();

                destPgMar.setBottom(pgMar.getBottom());
                destPgMar.setFooter(pgMar.getFooter());
                destPgMar.setGutter(pgMar.getGutter());
                destPgMar.setHeader(pgMar.getHeader());
                destPgMar.setLeft(pgMar.getLeft());
                destPgMar.setRight(pgMar.getRight());
                destPgMar.setTop(pgMar.getTop());
            }

            CTPageSz pgSzSrc = srcDoc.getDocument().getBody().getSectPr().getPgSz();
            if (pgSzSrc != null) {
                CTPageSz destPgSz = destDoc.getDocument().getBody().getSectPr().isSetPgSz()
                        ? destDoc.getDocument().getBody().getSectPr().getPgSz()
                        : destDoc.getDocument().getBody().getSectPr().addNewPgSz();

                // The page-size attributes are optional; an unset one is left
                // unset instead of passing null to the setter.
                if (pgSzSrc.isSetCode()) {
                    destPgSz.setCode(pgSzSrc.getCode());
                }
                if (pgSzSrc.isSetH()) {
                    destPgSz.setH(pgSzSrc.getH());
                }
                if (pgSzSrc.isSetOrient()) {
                    destPgSz.setOrient(pgSzSrc.getOrient());
                }
                if (pgSzSrc.isSetW()) {
                    destPgSz.setW(pgSzSrc.getW());
                }
            }
        }

以蛮力方式拆分 docx 很容易:您可以删除不需要的内容(段落等),然后保存结果。

这样,原始关系将保持不变,但您的 docx 容器可能会比需要的更大,因为它可能包含不再使用的图像等。

话虽如此,仍有几点需要注意:

  • 如果拆分点落在书签的开始标记和结束标记之间,书签会被拆断(批注同理)
  • 自动编号可能会从错误的编号开始,除非您显式设置起始值

显然您可以编写代码来解决此类问题。

或者,如果使用我们的商业版 docx4j Enterprise,您可以借助其 "merge" 代码指定所需的范围(例如第 X 段到第 Y 段),它会生成一个仅包含这些内容的 docx(即 docx 容器中不会残留无关的图像,被拆断的书签等问题也会得到处理)。

希望这能解决问题。

/**
 * Splits input .docx files in two around the "CONSTRUCTION DETAILS:" marker
 * using docx4j.
 */
public class SplitUsingDocx4j {

    /**
     * Processes every entry {@code X} of the directory named by the
     * {@code INPUT_DIR} property, reading {@code X/X.docx} and writing
     * {@code X-1.docx} and {@code X-2.docx} to the working directory.
     *
     * @param args unused
     * @throws Docx4JException       if a document cannot be loaded or saved
     * @throws FileNotFoundException kept for signature compatibility
     */
    public static void main(String[] args) throws Docx4JException,
            FileNotFoundException {
        File dir = new File(PropertyUtils.getProperty("INPUT_DIR"));
        String[] files = dir.list();

        // File.list() returns null when the path is not a readable directory.
        if (files == null || files.length == 0) {
            System.out.println("The directory is empty");
            return;
        }

        for (String aFile : files) {
            System.out.println(aFile);

            File file = new File(PropertyUtils.getProperty("INPUT_DIR") + aFile
                    + "/" + aFile + ".docx");

            // Stray entries without the expected document are skipped so that
            // they do not abort the remaining entries.
            if (!file.isFile()) {
                System.out.println("Skipping " + aFile + ": " + file.getPath()
                        + " not found");
                continue;
            }
            split(file);
        }
    }

    // Splits one document at the marker paragraph and saves both halves.
    private static void split(File file) throws Docx4JException {
        // Creating new documents
        WordprocessingMLPackage doc1 = WordprocessingMLPackage.createPackage();
        WordprocessingMLPackage doc2 = WordprocessingMLPackage.createPackage();

        // loading existing document
        WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage
                .load(file);
        MainDocumentPart tempDocPart = wordMLPackage.getMainDocumentPart();
        List<Object> obj = tempDocPart.getContent();

        // for copying styles from existing doc to new docs; a source without
        // a styles part leaves the default styles of the new packages in place
        StyleDefinitionsPart sdp = tempDocPart.getStyleDefinitionsPart();
        if (sdp != null) {
            Styles tempStyle = sdp.getJaxbElement();
            doc1.getMainDocumentPart().getStyleDefinitionsPart()
                    .setJaxbElement(tempStyle);
            doc2.getMainDocumentPart().getStyleDefinitionsPart()
                    .setJaxbElement(tempStyle);
        }

        // Everything up to and including the marker goes to doc1, the rest to
        // doc2. NOTE(review): the match relies on the content object's
        // toString() yielding the paragraph text - confirm for the docx4j
        // version in use.
        boolean flag = false;
        for (Object object : obj) {
            if (!flag) {
                if (object.toString().equalsIgnoreCase("CONSTRUCTION DETAILS:")) {
                    flag = true;
                }
                doc1.getMainDocumentPart().addObject(object);
            } else {
                doc2.getMainDocumentPart().addObject(object);
            }
        }

        String fileName = file.getName().replace(".docx", "");
        doc1.save(new File(fileName + "-1.docx"));
        doc2.save(new File(fileName + "-2.docx"));
    }
}