如何使用 XPATH 或 Apache POI 从 XML 过滤水印文本?
How can I Filter watermark text from XML using XPATH or Apache POI?
以下代码行会打印出后面的 XML:
private File file; // path to local docx file
private POITextExtractor textExtractor = ExtractorFactory.createExtractor(file);
XWPFHeader defaultHeader = d.getHeaderFooterPolicy().getDefaultHeader();
String raw_xml = defaultHeader._getHdrFtr().selectPath("*")[0].toString()
<?xml version="1.0" encoding="UTF-8"?>
<xml-fragment xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" 
xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape">
<w:sdtPr>
<w:id w:val="-1126775779" />
<w:docPartObj>
<w:docPartGallery w:val="Watermarks" />
<w:docPartUnique />
</w:docPartObj>
</w:sdtPr>
<w:sdtContent>
<w:p w14:paraId="41319DAD" w14:textId="4534348F" w:rsidR="006868D8" w:rsidRDefault="006868D8">
<w:pPr>
<w:pStyle w:val="Header" />
</w:pPr>
<w:r>
<w:rPr>
<w:noProof />
</w:rPr>
<w:pict w14:anchorId="63C3AA3C">
<v:shapetype id="_x0000_t136" coordsize="21600,21600" o:spt="136" adj="10800" path="m@7,l@8,m@5,21600l@6,21600e">
<v:formulas>
<v:f eqn="sum #0 0 10800" />
<v:f eqn="prod #0 2 1" />
<v:f eqn="sum 21600 0 @1" />
<v:f eqn="sum 0 0 @2" />
<v:f eqn="sum 21600 0 @3" />
<v:f eqn="if @0 @3 0" />
<v:f eqn="if @0 21600 @1" />
<v:f eqn="if @0 0 @2" />
<v:f eqn="if @0 @4 21600" />
<v:f eqn="mid @5 @6" />
<v:f eqn="mid @8 @5" />
<v:f eqn="mid @7 @8" />
<v:f eqn="mid @6 @7" />
<v:f eqn="sum @6 0 @5" />
</v:formulas>
<v:path textpathok="t" o:connecttype="custom" o:connectlocs="@9,0;@10,10800;@11,21600;@12,10800" o:connectangles="270,180,90,0" />
<v:textpath on="t" fitshape="t" />
<v:handles>
<v:h position="#0,bottomRight" xrange="6629,14971" />
</v:handles>
<o:lock v:ext="edit" text="t" shapetype="t" />
</v:shapetype>
<v:shape id="PowerPlusWaterMarkObject357476642" o:spid="_x0000_s1025" type="#_x0000_t136" style="position:absolute;margin-left:0;margin-top:0;width:527.85pt;height:131.95pt;rotation:315;z-index:-251657216;mso-position-horizontal:center;mso-position-horizontal-relative:margin;mso-position-vertical:center;mso-position-vertical-relative:margin" o:allowincell="f" fillcolor="silver" stroked="f">
<v:fill opacity=".5" />
<v:textpath style="font-family:&quot;Calibri&quot;;font-size:1pt" string="CONFIDENTIAL" />
<w10:wrap anchorx="margin" anchory="margin" />
</v:shape>
</w:pict>
</w:r>
</w:p>
</w:sdtContent>
</xml-fragment>
以下 XPATH 显示 Confidential
string(//v:shape[contains(@id,'PowerPlusWaterMarkObject')]/v:textpath/@string)
我该如何使用此 XPATH 获取水印的确切值?或者,在 Apache POI 中是否有其他获取水印的方法?
您已经找到了 org.apache.xmlbeans.XmlObject.selectPath。它允许通过 XPATH 选择 XmlObject。问题在于,可用的 XPATH 的复杂程度受限于 JRE 能够使用的 XPATH 求值器类型。
对我来说(Windows 10,JRE 12.0.2),需要将 Saxon-HE-10.6.jar 放在类路径(class path)中才能启用谓词过滤。否则,路径 $this//v:shape[@id] 会导致类未找到异常:java.lang.ClassNotFoundException: net.sf.saxon.sxpath.XPathStaticContext。
完整示例:
import java.io.FileInputStream;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFHeader;
import java.util.StringJoiner;
/**
 * Prints the watermark text of a Word document by running an XPath query, including a
 * predicate on the shape id, against the XML of every header.
 *
 * <p>NOTE(review): according to the surrounding discussion, XPath predicates in
 * {@code XmlObject.selectPath} required Saxon-HE on the class path in the author's
 * environment; without it a {@code ClassNotFoundException} was reported. Verify for your setup.
 */
public class ReadWordWatermarkXWPFXPATH {

    /** XPath (with the VML namespace declaration) selecting the {@code string} attribute of watermark text paths. */
    private static final String WATERMARK_TEXT_PATH =
            "declare namespace v='urn:schemas-microsoft-com:vml'; "
            + "$this//v:shape[contains(@id,'PowerPlusWaterMarkObject')]/v:textpath/@string";

    /**
     * Collects the watermark strings found in all headers of the document.
     *
     * @param document the document whose headers are searched
     * @return the selected {@code string} attribute values joined by a single space;
     *         an empty string when nothing was selected
     */
    static String getWatermarkText(XWPFDocument document) {
        StringJoiner stringJoiner = new StringJoiner(" ");
        for (XWPFHeader header : document.getHeaderList()) {
            org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHdrFtr ctHdrFtr = header._getHdrFtr();
            org.apache.xmlbeans.XmlObject[] selectedObjects = ctHdrFtr.selectPath(WATERMARK_TEXT_PATH);
            for (org.apache.xmlbeans.XmlObject object : selectedObjects) {
                // Only string-typed results carry the text; anything else is skipped.
                if (object instanceof org.apache.xmlbeans.XmlString) {
                    org.apache.xmlbeans.XmlString xmlString = (org.apache.xmlbeans.XmlString) object;
                    stringJoiner.add(xmlString.getStringValue());
                }
            }
        }
        return stringJoiner.toString();
    }

    /**
     * Opens {@code ./WordDocument.docx}, prints its watermark text and closes the file.
     *
     * @param args unused
     * @throws Exception if the file cannot be opened or parsed
     */
    public static void main(String[] args) throws Exception {
        // try-with-resources closes both the document and the underlying stream,
        // even when reading fails part-way through.
        try (FileInputStream in = new FileInputStream("./WordDocument.docx");
                XWPFDocument document = new XWPFDocument(in)) {
            String watermarkText = getWatermarkText(document);
            System.out.println(watermarkText);
        }
    }
}
像 $this//v:shape 这样的简单路径无需在类路径中额外加入 5 MByte 的 Saxon-HE-10.6.jar 即可使用。
知道了这一点,我们可以做这样的事情:
import java.io.FileInputStream;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFHeader;
import java.util.StringJoiner;
/**
 * Prints the watermark text of a Word document using only a simple XPath ({@code $this//v:shape})
 * and filtering the selected shapes in Java instead of with an XPath predicate.
 */
public class ReadWordWatermarkXWPF {

    /** XPath (with the VML namespace declaration) selecting every {@code v:shape} below the header. */
    private static final String SHAPE_PATH =
            "declare namespace v='urn:schemas-microsoft-com:vml'; "
            + "$this//v:shape";

    /** Substring of the shape id used to recognise watermark shapes. */
    private static final String WATERMARK_ID_MARKER = "PowerPlusWaterMarkObject";

    /**
     * Collects the text-path strings of all watermark shapes found in the document's headers.
     *
     * @param document the document whose headers are searched
     * @return the text-path strings joined by a single space; an empty string when none were found
     */
    static String getWatermarkText(XWPFDocument document) {
        StringJoiner stringJoiner = new StringJoiner(" ");
        for (XWPFHeader header : document.getHeaderList()) {
            org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHdrFtr ctHdrFtr = header._getHdrFtr();
            for (org.apache.xmlbeans.XmlObject object : ctHdrFtr.selectPath(SHAPE_PATH)) {
                if (!(object instanceof com.microsoft.schemas.vml.CTShape)) {
                    continue;
                }
                com.microsoft.schemas.vml.CTShape shape = (com.microsoft.schemas.vml.CTShape) object;
                String id = shape.getId();
                if (id == null || !id.contains(WATERMARK_ID_MARKER)) {
                    continue;
                }
                for (com.microsoft.schemas.vml.CTTextPath textPath : shape.getTextpathList()) {
                    String text = textPath.getString();
                    // A text path without a string would otherwise be joined as the literal "null".
                    if (text != null) {
                        stringJoiner.add(text);
                    }
                }
            }
        }
        return stringJoiner.toString();
    }

    /**
     * Opens {@code ./WordDocument.docx}, prints its watermark text and closes the file.
     *
     * @param args unused
     * @throws Exception if the file cannot be opened or parsed
     */
    public static void main(String[] args) throws Exception {
        // try-with-resources closes both the document and the underlying stream,
        // even when reading fails part-way through.
        try (FileInputStream in = new FileInputStream("./WordDocument.docx");
                XWPFDocument document = new XWPFDocument(in)) {
            String watermarkText = getWatermarkText(document);
            System.out.println(watermarkText);
        }
    }
}
使用 XPATH 解决了我的问题
*//v:shape/v:textpath/@string
以下代码行会打印出后面的 XML:
private File file; // path to local docx file
private POITextExtractor textExtractor = ExtractorFactory.createExtractor(file);
XWPFHeader defaultHeader = d.getHeaderFooterPolicy().getDefaultHeader();
String raw_xml = defaultHeader._getHdrFtr().selectPath("*")[0].toString()
<?xml version="1.0" encoding="UTF-8"?>
<xml-fragment xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml" xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml" xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex" xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid" xmlns:w16sdtdh="http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash" xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" 
xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape">
<w:sdtPr>
<w:id w:val="-1126775779" />
<w:docPartObj>
<w:docPartGallery w:val="Watermarks" />
<w:docPartUnique />
</w:docPartObj>
</w:sdtPr>
<w:sdtContent>
<w:p w14:paraId="41319DAD" w14:textId="4534348F" w:rsidR="006868D8" w:rsidRDefault="006868D8">
<w:pPr>
<w:pStyle w:val="Header" />
</w:pPr>
<w:r>
<w:rPr>
<w:noProof />
</w:rPr>
<w:pict w14:anchorId="63C3AA3C">
<v:shapetype id="_x0000_t136" coordsize="21600,21600" o:spt="136" adj="10800" path="m@7,l@8,m@5,21600l@6,21600e">
<v:formulas>
<v:f eqn="sum #0 0 10800" />
<v:f eqn="prod #0 2 1" />
<v:f eqn="sum 21600 0 @1" />
<v:f eqn="sum 0 0 @2" />
<v:f eqn="sum 21600 0 @3" />
<v:f eqn="if @0 @3 0" />
<v:f eqn="if @0 21600 @1" />
<v:f eqn="if @0 0 @2" />
<v:f eqn="if @0 @4 21600" />
<v:f eqn="mid @5 @6" />
<v:f eqn="mid @8 @5" />
<v:f eqn="mid @7 @8" />
<v:f eqn="mid @6 @7" />
<v:f eqn="sum @6 0 @5" />
</v:formulas>
<v:path textpathok="t" o:connecttype="custom" o:connectlocs="@9,0;@10,10800;@11,21600;@12,10800" o:connectangles="270,180,90,0" />
<v:textpath on="t" fitshape="t" />
<v:handles>
<v:h position="#0,bottomRight" xrange="6629,14971" />
</v:handles>
<o:lock v:ext="edit" text="t" shapetype="t" />
</v:shapetype>
<v:shape id="PowerPlusWaterMarkObject357476642" o:spid="_x0000_s1025" type="#_x0000_t136" style="position:absolute;margin-left:0;margin-top:0;width:527.85pt;height:131.95pt;rotation:315;z-index:-251657216;mso-position-horizontal:center;mso-position-horizontal-relative:margin;mso-position-vertical:center;mso-position-vertical-relative:margin" o:allowincell="f" fillcolor="silver" stroked="f">
<v:fill opacity=".5" />
<v:textpath style="font-family:&quot;Calibri&quot;;font-size:1pt" string="CONFIDENTIAL" />
<w10:wrap anchorx="margin" anchory="margin" />
</v:shape>
</w:pict>
</w:r>
</w:p>
</w:sdtContent>
</xml-fragment>
以下 XPATH 显示 Confidential
string(//v:shape[contains(@id,'PowerPlusWaterMarkObject')]/v:textpath/@string)
我该如何使用此 XPATH 获取水印的确切值?或者,在 Apache POI 中是否有其他获取水印的方法?
您已经找到了 org.apache.xmlbeans.XmlObject.selectPath。它允许通过 XPATH 选择 XmlObject。问题在于,可用的 XPATH 的复杂程度受限于 JRE 能够使用的 XPATH 求值器类型。
对我来说(Windows 10,JRE 12.0.2),需要将 Saxon-HE-10.6.jar 放在类路径(class path)中才能启用谓词过滤。否则,路径 $this//v:shape[@id] 会导致类未找到异常:java.lang.ClassNotFoundException: net.sf.saxon.sxpath.XPathStaticContext。
完整示例:
import java.io.FileInputStream;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFHeader;
import java.util.StringJoiner;
/**
 * Prints the watermark text of a Word document by running an XPath query, including a
 * predicate on the shape id, against the XML of every header.
 *
 * <p>NOTE(review): according to the surrounding discussion, XPath predicates in
 * {@code XmlObject.selectPath} required Saxon-HE on the class path in the author's
 * environment; without it a {@code ClassNotFoundException} was reported. Verify for your setup.
 */
public class ReadWordWatermarkXWPFXPATH {

    /** XPath (with the VML namespace declaration) selecting the {@code string} attribute of watermark text paths. */
    private static final String WATERMARK_TEXT_PATH =
            "declare namespace v='urn:schemas-microsoft-com:vml'; "
            + "$this//v:shape[contains(@id,'PowerPlusWaterMarkObject')]/v:textpath/@string";

    /**
     * Collects the watermark strings found in all headers of the document.
     *
     * @param document the document whose headers are searched
     * @return the selected {@code string} attribute values joined by a single space;
     *         an empty string when nothing was selected
     */
    static String getWatermarkText(XWPFDocument document) {
        StringJoiner stringJoiner = new StringJoiner(" ");
        for (XWPFHeader header : document.getHeaderList()) {
            org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHdrFtr ctHdrFtr = header._getHdrFtr();
            org.apache.xmlbeans.XmlObject[] selectedObjects = ctHdrFtr.selectPath(WATERMARK_TEXT_PATH);
            for (org.apache.xmlbeans.XmlObject object : selectedObjects) {
                // Only string-typed results carry the text; anything else is skipped.
                if (object instanceof org.apache.xmlbeans.XmlString) {
                    org.apache.xmlbeans.XmlString xmlString = (org.apache.xmlbeans.XmlString) object;
                    stringJoiner.add(xmlString.getStringValue());
                }
            }
        }
        return stringJoiner.toString();
    }

    /**
     * Opens {@code ./WordDocument.docx}, prints its watermark text and closes the file.
     *
     * @param args unused
     * @throws Exception if the file cannot be opened or parsed
     */
    public static void main(String[] args) throws Exception {
        // try-with-resources closes both the document and the underlying stream,
        // even when reading fails part-way through.
        try (FileInputStream in = new FileInputStream("./WordDocument.docx");
                XWPFDocument document = new XWPFDocument(in)) {
            String watermarkText = getWatermarkText(document);
            System.out.println(watermarkText);
        }
    }
}
像 $this//v:shape 这样的简单路径无需在类路径中额外加入 5 MByte 的 Saxon-HE-10.6.jar 即可使用。
知道了这一点,我们可以做这样的事情:
import java.io.FileInputStream;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFHeader;
import java.util.StringJoiner;
/**
 * Prints the watermark text of a Word document using only a simple XPath ({@code $this//v:shape})
 * and filtering the selected shapes in Java instead of with an XPath predicate.
 */
public class ReadWordWatermarkXWPF {

    /** XPath (with the VML namespace declaration) selecting every {@code v:shape} below the header. */
    private static final String SHAPE_PATH =
            "declare namespace v='urn:schemas-microsoft-com:vml'; "
            + "$this//v:shape";

    /** Substring of the shape id used to recognise watermark shapes. */
    private static final String WATERMARK_ID_MARKER = "PowerPlusWaterMarkObject";

    /**
     * Collects the text-path strings of all watermark shapes found in the document's headers.
     *
     * @param document the document whose headers are searched
     * @return the text-path strings joined by a single space; an empty string when none were found
     */
    static String getWatermarkText(XWPFDocument document) {
        StringJoiner stringJoiner = new StringJoiner(" ");
        for (XWPFHeader header : document.getHeaderList()) {
            org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHdrFtr ctHdrFtr = header._getHdrFtr();
            for (org.apache.xmlbeans.XmlObject object : ctHdrFtr.selectPath(SHAPE_PATH)) {
                if (!(object instanceof com.microsoft.schemas.vml.CTShape)) {
                    continue;
                }
                com.microsoft.schemas.vml.CTShape shape = (com.microsoft.schemas.vml.CTShape) object;
                String id = shape.getId();
                if (id == null || !id.contains(WATERMARK_ID_MARKER)) {
                    continue;
                }
                for (com.microsoft.schemas.vml.CTTextPath textPath : shape.getTextpathList()) {
                    String text = textPath.getString();
                    // A text path without a string would otherwise be joined as the literal "null".
                    if (text != null) {
                        stringJoiner.add(text);
                    }
                }
            }
        }
        return stringJoiner.toString();
    }

    /**
     * Opens {@code ./WordDocument.docx}, prints its watermark text and closes the file.
     *
     * @param args unused
     * @throws Exception if the file cannot be opened or parsed
     */
    public static void main(String[] args) throws Exception {
        // try-with-resources closes both the document and the underlying stream,
        // even when reading fails part-way through.
        try (FileInputStream in = new FileInputStream("./WordDocument.docx");
                XWPFDocument document = new XWPFDocument(in)) {
            String watermarkText = getWatermarkText(document);
            System.out.println(watermarkText);
        }
    }
}
使用 XPATH 解决了我的问题
*//v:shape/v:textpath/@string