如何使用 XPATH 或 Apache POI 从 XML 过滤水印文本?

How can I Filter watermark text from XML using XPATH or Apache POI?

以下这些代码行会打印出其后的 XML:
// Question snippet (not a complete compilation unit): obtains the default header and
// converts part of its underlying XML to a string.
private File file; // path to local docx file
// Creates a POI text extractor for the file declared above.
private POITextExtractor textExtractor = ExtractorFactory.createExtractor(file);
// NOTE(review): `d` is not declared anywhere in this snippet — presumably the
// XWPFDocument being inspected; confirm against the full program.
XWPFHeader defaultHeader = d.getHeaderFooterPolicy().getDefaultHeader();

// Takes the first object selected by the "*" path on the header's underlying XML bean
// and converts it to its string form.
String raw_xml = defaultHeader._getHdrFtr().selectPath("*")[0].toString()
<?xml version="1.0" encoding="UTF-8"?>
<xml-fragment xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" 
xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape">
   <w:sdtPr>
      <w:id w:val="-1126775779" />
      <w:docPartObj>
         <w:docPartGallery w:val="Watermarks" />
         <w:docPartUnique />
      </w:docPartObj>
   </w:sdtPr>
   <w:sdtContent>
      <w:p w14:paraId="41319DAD" w14:textId="4534348F" w:rsidR="006868D8" w:rsidRDefault="006868D8">
         <w:pPr>
            <w:pStyle w:val="Header" />
         </w:pPr>
         <w:r>
            <w:rPr>
               <w:noProof />
            </w:rPr>
            <w:pict w14:anchorId="63C3AA3C">
               <v:shapetype id="_x0000_t136" coordsize="21600,21600" o:spt="136" adj="10800" path="m@7,l@8,m@5,21600l@6,21600e">
                  <v:formulas>
                     <v:f eqn="sum #0 0 10800" />
                     <v:f eqn="prod #0 2 1" />
                     <v:f eqn="sum 21600 0 @1" />
                     <v:f eqn="sum 0 0 @2" />
                     <v:f eqn="sum 21600 0 @3" />
                     <v:f eqn="if @0 @3 0" />
                     <v:f eqn="if @0 21600 @1" />
                     <v:f eqn="if @0 0 @2" />
                     <v:f eqn="if @0 @4 21600" />
                     <v:f eqn="mid @5 @6" />
                     <v:f eqn="mid @8 @5" />
                     <v:f eqn="mid @7 @8" />
                     <v:f eqn="mid @6 @7" />
                     <v:f eqn="sum @6 0 @5" />
                  </v:formulas>
                  <v:path textpathok="t" o:connecttype="custom" o:connectlocs="@9,0;@10,10800;@11,21600;@12,10800" o:connectangles="270,180,90,0" />
                  <v:textpath on="t" fitshape="t" />
                  <v:handles>
                     <v:h position="#0,bottomRight" xrange="6629,14971" />
                  </v:handles>
                  <o:lock v:ext="edit" text="t" shapetype="t" />
               </v:shapetype>
               <v:shape id="PowerPlusWaterMarkObject357476642" o:spid="_x0000_s1025" type="#_x0000_t136" style="position:absolute;margin-left:0;margin-top:0;width:527.85pt;height:131.95pt;rotation:315;z-index:-251657216;mso-position-horizontal:center;mso-position-horizontal-relative:margin;mso-position-vertical:center;mso-position-vertical-relative:margin" o:allowincell="f" fillcolor="silver" stroked="f">
                  <v:fill opacity=".5" />
                  <v:textpath style="font-family:&quot;Calibri&quot;;font-size:1pt" string="CONFIDENTIAL" />
                  <w10:wrap anchorx="margin" anchory="margin" />
               </v:shape>
            </w:pict>
         </w:r>
      </w:p>
   </w:sdtContent>
</xml-fragment>

以下 XPATH 显示 Confidential

string(//v:shape[contains(@id,'PowerPlusWaterMarkObject')]/v:textpath/@string)

我该如何使用此 XPATH 获取水印(Watermark)的确切值?或者,在 Apache POI 中是否有其他获取水印的方法?

您已经找到 org.apache.xmlbeans.XmlObject.selectPath。这允许通过 XPATH 选择 XmlObjects。问题是所使用的 XPATH 的可能复杂性受到 JRE 可以使用的 XPATH 计算器类型的限制。

对我来说(Windows 10,JRE 12.0.2),需要将 Saxon-HE-10.6.jar 放在类路径(classpath)中才能启用谓词过滤。否则,路径 $this//v:shape[@id] 会导致找不到类的异常:java.lang.ClassNotFoundException: net.sf.saxon.sxpath.XPathStaticContext。

完整示例:

import java.io.FileInputStream;

import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFHeader;

import java.util.StringJoiner;

/**
 * Reads the watermark text of a Word document by evaluating an XPath expression with a
 * predicate against the XML of every header in the document.
 *
 * <p>The expression filters {@code v:shape} elements by id inside the XPath itself. Such
 * predicates may need an additional XPath engine (for example Saxon-HE) on the classpath;
 * without it XMLBeans can fail with a {@code ClassNotFoundException}.
 */
public class ReadWordWatermarkXWPFXPATH {

 /**
  * Collects the {@code string} attribute of every {@code v:textpath} that belongs to a
  * {@code v:shape} whose id contains {@code PowerPlusWaterMarkObject}.
  *
  * @param document the document whose headers are searched
  * @return the matching texts joined by a single space; empty if nothing matched
  */
 static String getWatermarkText(XWPFDocument document) {
  StringJoiner stringJoiner = new StringJoiner(" ");
  // The query does not depend on the header, so it is built once outside the loop.
  String declareNameSpaces = "declare namespace v='urn:schemas-microsoft-com:vml'; ";
  String watermarkPath = declareNameSpaces
    + "$this//v:shape[contains(@id,'PowerPlusWaterMarkObject')]/v:textpath/@string";
  for (XWPFHeader header : document.getHeaderList()) {
   org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHdrFtr ctHdrFtr = header._getHdrFtr();
   org.apache.xmlbeans.XmlObject[] selectedObjects = ctHdrFtr.selectPath(watermarkPath);
   for (org.apache.xmlbeans.XmlObject object : selectedObjects) {
    // Only string-typed results carry the attribute value we are after.
    if (object instanceof org.apache.xmlbeans.XmlString) {
     org.apache.xmlbeans.XmlString xmlString = (org.apache.xmlbeans.XmlString) object;
     stringJoiner.add(xmlString.getStringValue());
    }
   }
  }
  return stringJoiner.toString();
 }

 /**
  * Opens {@code ./WordDocument.docx} and prints its watermark text.
  *
  * @param args unused
  * @throws Exception if the file cannot be opened or parsed
  */
 public static void main(String[] args) throws Exception {
  // try-with-resources closes both the document and the underlying stream,
  // even when reading fails part-way.
  try (FileInputStream in = new FileInputStream("./WordDocument.docx");
       XWPFDocument document = new XWPFDocument(in)) {
   String watermarkText = getWatermarkText(document);
   System.out.println(watermarkText);
  }
 }
}

像 $this//v:shape 这样的简单路径可以直接使用,无需在类路径中额外加入约 5 MB 的 Saxon-HE-10.6.jar。

知道了这一点,我们可以做这样的事情:

import java.io.FileInputStream;

import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFHeader;

import java.util.StringJoiner;

/**
 * Reads the watermark text of a Word document using only a simple XPath
 * ({@code $this//v:shape}) and doing the id filtering in Java code, so no predicate
 * support is needed from the XPath engine.
 */
public class ReadWordWatermarkXWPF {

 /**
  * Collects the text of every {@code v:textpath} that belongs to a {@code v:shape}
  * whose id contains {@code PowerPlusWaterMarkObject}.
  *
  * @param document the document whose headers are searched
  * @return the matching texts joined by a single space; empty if nothing matched
  */
 static String getWatermarkText(XWPFDocument document) {
  StringJoiner stringJoiner = new StringJoiner(" ");
  // The query does not depend on the header, so it is built once outside the loop.
  String declareNameSpaces = "declare namespace v='urn:schemas-microsoft-com:vml'; ";
  String shapePath = declareNameSpaces + "$this//v:shape";
  for (XWPFHeader header : document.getHeaderList()) {
   org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHdrFtr ctHdrFtr = header._getHdrFtr();
   org.apache.xmlbeans.XmlObject[] selectedObjects = ctHdrFtr.selectPath(shapePath);
   for (org.apache.xmlbeans.XmlObject object : selectedObjects) {
    if (!(object instanceof com.microsoft.schemas.vml.CTShape)) {
     continue;
    }
    com.microsoft.schemas.vml.CTShape shape = (com.microsoft.schemas.vml.CTShape) object;
    String id = shape.getId();
    if (id == null || !id.contains("PowerPlusWaterMarkObject")) {
     continue;
    }
    for (com.microsoft.schemas.vml.CTTextPath textPath : shape.getTextpathList()) {
     String text = textPath.getString();
     // A textpath without a string attribute would otherwise be joined as "null".
     if (text != null) {
      stringJoiner.add(text);
     }
    }
   }
  }
  return stringJoiner.toString();
 }

 /**
  * Opens {@code ./WordDocument.docx} and prints its watermark text.
  *
  * @param args unused
  * @throws Exception if the file cannot be opened or parsed
  */
 public static void main(String[] args) throws Exception {
  // try-with-resources closes both the document and the underlying stream,
  // even when reading fails part-way.
  try (FileInputStream in = new FileInputStream("./WordDocument.docx");
       XWPFDocument document = new XWPFDocument(in)) {
   String watermarkText = getWatermarkText(document);
   System.out.println(watermarkText);
  }
 }
}

使用 XPATH 解决了我的问题

*//v:shape/v:textpath/@string