VTD-XML - 无法在 span 标记后获取文本
VTD-XML - Not able to get text after span tag
<?xml version="1.0"?>
<catalog>
<book id="bk001" type='fiction'>
<author>Gambardella, Matthew</author>
<author>Doe, John</author>
<title>XML IN-DEPT Developer's Guide</title>
<genre>Computer</genre>
<price>44.95</price>
<snippet>
<inlineXML contenttype="application/xhtml+xml" >
<html lang="en-US" >
<head>
<title>XML IN-DEPT Developer's Guide</title>
</head>
<body>
<p>This is an example book for developers want to gain knowledge on <span class="boldcls" type="xml" >XML</span> Marshalling and UnMarshalling. Need to know all about <span class="boldcls" type="tech" >XML parsing and editing</span>, Grab this Book!</p>
</body>
</html>
</inlineXML>
</snippet>
</book>
</catalog>
以上是 XML 示例。我想对 XPath 表达式“/catalog/book/snippet”求值,遍历其下的所有元素并获取文本。我正在使用 VTD-XML 库,代码是在一个示例的基础上修改而来的(见下面的“更新”),但问题是:遇到 span 标记之后,它就无法获取后面的文本。所以目前我得到的段落标签输出是:
Level [6] Tag [p]
This is an example book for developers want to gain knowledge on
Level [7] Tag [span] @class=boldcls
XML
Level [8] Tag [span] @class=boldcls
XML parsing and editing
错了,应该是:
Level [6] Tag [p]
This is an example book for developers want to gain knowledge on XML Marshalling and UnMarshalling. Need to know all about XML parsing and editing, Grab this Book!
Level [7] Tag [span] @class=boldcls
XML
Level [8] Tag [span] @class=boldcls
XML parsing and editing
更新:
我稍微修改了示例代码:
package com.vtd.test;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import com.ximpleware.AutoPilot;
import com.ximpleware.NavException;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
import com.ximpleware.XPathEvalException;
import com.ximpleware.XPathParseException;
/**
 * Evaluates an XPath expression over a DOM document using VTD-XML and
 * collects, for every match and each of its descendant elements, the
 * attribute values (optional) and the text token reported by
 * {@code VTDNav.getText()}.
 *
 * NOTE(review): only the single token returned by {@code getText()} is
 * recorded per element; according to the discussion around this code, text
 * that follows a child element (mixed content) is therefore not captured.
 */
public class VTDXMLReader {
    private VTDNav vtdNav;
    private AutoPilot autoPilot;
    private boolean includeAttributes;
    /** Name taken from a trailing {@code @name} XPath step; stays null otherwise. */
    private String attribute;

    /**
     * Serializes the DOM document to bytes, parses it with VTD-XML and
     * selects the given XPath expression on a fresh AutoPilot.
     *
     * Exceptions raised while serializing/parsing or while compiling the
     * XPath are printed to standard error and not rethrown.
     *
     * @param storyDoc          DOM document to read
     * @param includeAttributes whether attribute values are collected as well
     * @param xpathExpression   XPath expression evaluated by {@link #readXML()}
     */
    public VTDXMLReader(final Document storyDoc, final boolean includeAttributes, final String xpathExpression) {
        this.includeAttributes = includeAttributes;
        final VTDGen vtdGen = new VTDGen();
        try {
            // VTDGen is fed a byte array, so the DOM tree is serialized first.
            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.transform(new DOMSource(storyDoc), new StreamResult(buffer));
            vtdGen.setDoc(buffer.toByteArray());
            vtdGen.parse(true);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        vtdNav = vtdGen.getNav();
        autoPilot = new AutoPilot(vtdNav);

        // Remember the attribute name when the last XPath step is "@name".
        String[] steps = xpathExpression.split("/");
        String lastStep = steps[steps.length - 1];
        if (lastStep.startsWith("@")) {
            attribute = lastStep.replaceAll("@", "");
        }

        try {
            autoPilot.selectXPath(xpathExpression);
        } catch (XPathParseException e) {
            e.printStackTrace();
        }
    }

    /**
     * Walks every XPath match and gathers its values in encounter order:
     * attribute values (when enabled), the value of the attribute named by
     * the XPath's trailing step (when present), the element's text token,
     * and then the same for all descendant elements.
     *
     * Any exception raised during the walk is printed and the values
     * gathered so far are returned.
     *
     * @return the collected strings
     * @throws IOException declared by the signature; not thrown by this body
     */
    public List<String> readXML() throws IOException {
        List<String> values = new ArrayList<>();
        try {
            while (autoPilot.evalXPath() != -1) {
                if (includeAttributes) {
                    addAttributeValues(vtdNav, values);
                }
                if (attribute != null && !attribute.isEmpty()) {
                    int attrIdx = vtdNav.getAttrVal(attribute);
                    if (attrIdx != -1) {
                        values.add(vtdNav.toNormalizedString(attrIdx));
                    }
                }
                addText(vtdNav, values);
                navigateToChildren(vtdNav, includeAttributes, values);
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return values;
    }

    /**
     * Recursively visits the child elements of the cursor element, adding
     * each one's attribute values (optional) and text token to
     * {@code values}. The cursor is saved with push() on entry and restored
     * with pop() on normal exit; exceptions are printed, not rethrown.
     *
     * @param vn                navigator positioned on the parent element
     * @param includeAttributes whether attribute values are collected
     * @param values            list that receives the collected strings
     */
    public static void navigateToChildren(final VTDNav vn, final boolean includeAttributes, List<String> values) {
        try {
            vn.push();
            for (boolean more = vn.toElement(VTDNav.FIRST_CHILD); more; more = vn.toElement(VTDNav.NEXT_SIBLING)) {
                if (includeAttributes) {
                    addAttributeValues(vn, values);
                }
                addText(vn, values);
                navigateToChildren(vn, includeAttributes, values);
            }
            vn.toElement(VTDNav.PARENT);
            vn.pop();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Appends the cursor element's attribute values, in map order, to {@code sink}. */
    private static void addAttributeValues(VTDNav nav, List<String> sink) {
        Map<String, String> attrs = new LinkedHashMap<>();
        loadAttributeMap(nav, attrs);
        for (String attrValue : attrs.values()) {
            sink.add(attrValue);
        }
    }

    /** Appends the normalized string of the token returned by getText(), if there is one. */
    private static void addText(VTDNav nav, List<String> sink) throws NavException {
        int textIdx = nav.getText();
        if (textIdx != -1) {
            sink.add(nav.toNormalizedString(textIdx));
        }
    }

    /**
     * Fills {@code amap} with the results of evaluating {@code @*} at the
     * cursor: the token at each returned index is used as the key and the
     * token right after it as the value. The cursor is saved and restored
     * around the evaluation; XPath/navigation exceptions are printed.
     */
    private static void loadAttributeMap(VTDNav nav, Map<String, String> amap) {
        nav.push();
        try {
            AutoPilot attrPilot = new AutoPilot(nav);
            attrPilot.selectXPath("@*");
            for (int nameIdx = attrPilot.evalXPath(); nameIdx != -1; nameIdx = attrPilot.evalXPath()) {
                String attrName = nav.toString(nameIdx);
                String attrValue = nav.toString(nameIdx + 1);
                amap.put(attrName, attrValue);
            }
        } catch (XPathParseException | XPathEvalException | NavException e) {
            e.printStackTrace();
        }
        nav.pop();
    }

    /** Reads books.xml, evaluates /catalog/book/snippet and prints each collected value on its own line. */
    public static void main(String[] args) {
        try {
            DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            Document document = builder.parse(new File("books.xml"));
            VTDXMLReader reader = new VTDXMLReader(document, false, "/catalog/book/snippet");
            for (String fragment : reader.readXML()) {
                System.out.println(fragment);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
以上代码的输出为:
XML IN-DEPT Developer's Guide
This is an example book for developers want to gain knowledge on
XML
XML parsing and editing
应该是:
XML IN-DEPT Developer's Guide
This is an example book for developers want to gain knowledge on
XML
Marshalling and UnMarshalling. Need to know all about
XML parsing and editing
, Grab this Book!
有什么想法吗?
我想做的事情:
如果以下是 html 文档中的段落标记:
<p>This is an example book for developers want to gain knowledge on <span class="boldcls" type="xml" >XML</span> Marshalling and UnMarshalling. Need to know all about <span class="boldcls" type="tech" >XML parsing and editing</span>, Grab this Book!</p>
我想写一个 Reader,从左到右读取它(包括属性值),并像下面这样逐行输出:
==> This is an example book for developers want to gain knowledge on
==> boldcls xml XML
==> Marshalling and UnMarshalling. Need to know all about
==> boldcls tech XML parsing and editing
==> , Grab this Book!
目前我正在使用 XMLEventReader 执行此操作,我想将其替换为 VTD-XML 库代码。
我对您的 navigateToChildren 子例程做了少量修改:调用 VTDNav 的 getXPathStringVal() 来获取所有文本节点。基本上,问题在于 getText() 对以数据为中心的 XML 文档工作正常;而对于以文档为中心的用例,应该调用 getXPathStringVal() 方法直接提取文本节点。此方法在较新版本的 vtd-xml 中可用。这是您要找的吗?
/**
 * Recursively visits the child elements of the cursor element. For each
 * child it optionally records and prints its attributes, and, when
 * getText() reports a text token, records and prints the element's
 * getXPathStringVal() result. The cursor is saved with push() on entry and
 * restored with pop() on normal exit; exceptions are printed, not rethrown.
 *
 * @param vn                navigator positioned on the parent element
 * @param includeAttributes whether attribute values are collected and printed
 * @param values            list that receives the collected strings
 */
public static void navigateToChildren(final VTDNav vn, final boolean includeAttributes, List<String> values) {
    try {
        vn.push();
        for (boolean more = vn.toElement(VTDNav.FIRST_CHILD); more; more = vn.toElement(VTDNav.NEXT_SIBLING)) {
            if (includeAttributes) {
                Map<String, String> attrs = new LinkedHashMap<String, String>();
                loadAttributeMap(vn, attrs);
                for (Map.Entry<String, String> attr : attrs.entrySet()) {
                    values.add(attr.getValue());
                    System.out.print(" ==>@" + attr.getKey() + "=" + attr.getValue());
                }
            }
            // getText() only decides whether the element is reported; the
            // recorded string comes from getXPathStringVal().
            if (vn.getText() != -1) {
                String text = vn.getXPathStringVal();
                values.add(text);
                System.out.println("==>\t" + text);
            }
            navigateToChildren(vn, includeAttributes, values);
        }
        vn.toElement(VTDNav.PARENT);
        vn.pop();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
第二次编辑:我写了一个小应用程序,把所有后代文本和属性值(attr val)拼接起来。基本上,它使用索引值直接访问底层 VTD 缓冲区,并按顺序扫描 VTD 记录。如果令牌类型是属性值或字符数据,应用程序就把它追加到字符串缓冲区中。
import com.ximpleware.*;
public class collectTokens {
public static void main(String[] s) throws VTDException{
VTDGen vg = new VTDGen();
if (!vg.parseFile("d:\xml\books.xml", true)){
return;
}
VTDNav vn = vg.getNav();
AutoPilot ap = new AutoPilot(vn);
ap.selectXPath("/catalog/book/snippet/inlineXML/html/body/p");
int i=ap.evalXPath();
// i points to the p element node
if (i!=-1){
int j = vn.getCurrentIndex();// get the token index of p
int d = vn.getTokenDepth(j);
int count = vn.getTokenCount();
int index=j+1;
// collect the text of all text and attr vals sequentially
StringBuilder sb = new StringBuilder(50);
while((index<count)){
if (vn.getTokenDepth(index)==d
&& vn.getTokenDepth(index)== VTDNav.TOKEN_STARTING_TAG)
break;
if (vn.getTokenType(index)== VTDNav.TOKEN_CHARACTER_DATA
|| vn.getTokenType(index)==VTDNav.TOKEN_ATTR_VAL){
sb.append(vn.toString(index)+" ");
}
index++;
}
System.out.println(sb);
}
}
}
<?xml version="1.0"?>
<catalog>
<book id="bk001" type='fiction'>
<author>Gambardella, Matthew</author>
<author>Doe, John</author>
<title>XML IN-DEPT Developer's Guide</title>
<genre>Computer</genre>
<price>44.95</price>
<snippet>
<inlineXML contenttype="application/xhtml+xml" >
<html lang="en-US" >
<head>
<title>XML IN-DEPT Developer's Guide</title>
</head>
<body>
<p>This is an example book for developers want to gain knowledge on <span class="boldcls" type="xml" >XML</span> Marshalling and UnMarshalling. Need to know all about <span class="boldcls" type="tech" >XML parsing and editing</span>, Grab this Book!</p>
</body>
</html>
</inlineXML>
</snippet>
</book>
</catalog>
以上是 XML 示例。我想对 XPath 表达式“/catalog/book/snippet”求值,遍历其下的所有元素并获取文本。我正在使用 VTD-XML 库,代码是在一个示例的基础上修改而来的(见下面的“更新”),但问题是:遇到 span 标记之后,它就无法获取后面的文本。所以目前我得到的段落标签输出是:
Level [6] Tag [p]
This is an example book for developers want to gain knowledge on
Level [7] Tag [span] @class=boldcls
XML
Level [8] Tag [span] @class=boldcls
XML parsing and editing
错了,应该是:
Level [6] Tag [p]
This is an example book for developers want to gain knowledge on XML Marshalling and UnMarshalling. Need to know all about XML parsing and editing, Grab this Book!
Level [7] Tag [span] @class=boldcls
XML
Level [8] Tag [span] @class=boldcls
XML parsing and editing
更新: 我稍微修改了示例代码:
package com.vtd.test;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import com.ximpleware.AutoPilot;
import com.ximpleware.NavException;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
import com.ximpleware.XPathEvalException;
import com.ximpleware.XPathParseException;
/**
 * Evaluates an XPath expression over a DOM document using VTD-XML and
 * collects, for every match and each of its descendant elements, the
 * attribute values (optional) and the text token reported by
 * {@code VTDNav.getText()}.
 *
 * NOTE(review): only the single token returned by {@code getText()} is
 * recorded per element; according to the discussion around this code, text
 * that follows a child element (mixed content) is therefore not captured.
 */
public class VTDXMLReader {
    private VTDNav vtdNav;
    private AutoPilot autoPilot;
    private boolean includeAttributes;
    /** Name taken from a trailing {@code @name} XPath step; stays null otherwise. */
    private String attribute;

    /**
     * Serializes the DOM document to bytes, parses it with VTD-XML and
     * selects the given XPath expression on a fresh AutoPilot.
     *
     * Exceptions raised while serializing/parsing or while compiling the
     * XPath are printed to standard error and not rethrown.
     *
     * @param storyDoc          DOM document to read
     * @param includeAttributes whether attribute values are collected as well
     * @param xpathExpression   XPath expression evaluated by {@link #readXML()}
     */
    public VTDXMLReader(final Document storyDoc, final boolean includeAttributes, final String xpathExpression) {
        this.includeAttributes = includeAttributes;
        final VTDGen vtdGen = new VTDGen();
        try {
            // VTDGen is fed a byte array, so the DOM tree is serialized first.
            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.transform(new DOMSource(storyDoc), new StreamResult(buffer));
            vtdGen.setDoc(buffer.toByteArray());
            vtdGen.parse(true);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        vtdNav = vtdGen.getNav();
        autoPilot = new AutoPilot(vtdNav);

        // Remember the attribute name when the last XPath step is "@name".
        String[] steps = xpathExpression.split("/");
        String lastStep = steps[steps.length - 1];
        if (lastStep.startsWith("@")) {
            attribute = lastStep.replaceAll("@", "");
        }

        try {
            autoPilot.selectXPath(xpathExpression);
        } catch (XPathParseException e) {
            e.printStackTrace();
        }
    }

    /**
     * Walks every XPath match and gathers its values in encounter order:
     * attribute values (when enabled), the value of the attribute named by
     * the XPath's trailing step (when present), the element's text token,
     * and then the same for all descendant elements.
     *
     * Any exception raised during the walk is printed and the values
     * gathered so far are returned.
     *
     * @return the collected strings
     * @throws IOException declared by the signature; not thrown by this body
     */
    public List<String> readXML() throws IOException {
        List<String> values = new ArrayList<>();
        try {
            while (autoPilot.evalXPath() != -1) {
                if (includeAttributes) {
                    addAttributeValues(vtdNav, values);
                }
                if (attribute != null && !attribute.isEmpty()) {
                    int attrIdx = vtdNav.getAttrVal(attribute);
                    if (attrIdx != -1) {
                        values.add(vtdNav.toNormalizedString(attrIdx));
                    }
                }
                addText(vtdNav, values);
                navigateToChildren(vtdNav, includeAttributes, values);
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return values;
    }

    /**
     * Recursively visits the child elements of the cursor element, adding
     * each one's attribute values (optional) and text token to
     * {@code values}. The cursor is saved with push() on entry and restored
     * with pop() on normal exit; exceptions are printed, not rethrown.
     *
     * @param vn                navigator positioned on the parent element
     * @param includeAttributes whether attribute values are collected
     * @param values            list that receives the collected strings
     */
    public static void navigateToChildren(final VTDNav vn, final boolean includeAttributes, List<String> values) {
        try {
            vn.push();
            for (boolean more = vn.toElement(VTDNav.FIRST_CHILD); more; more = vn.toElement(VTDNav.NEXT_SIBLING)) {
                if (includeAttributes) {
                    addAttributeValues(vn, values);
                }
                addText(vn, values);
                navigateToChildren(vn, includeAttributes, values);
            }
            vn.toElement(VTDNav.PARENT);
            vn.pop();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Appends the cursor element's attribute values, in map order, to {@code sink}. */
    private static void addAttributeValues(VTDNav nav, List<String> sink) {
        Map<String, String> attrs = new LinkedHashMap<>();
        loadAttributeMap(nav, attrs);
        for (String attrValue : attrs.values()) {
            sink.add(attrValue);
        }
    }

    /** Appends the normalized string of the token returned by getText(), if there is one. */
    private static void addText(VTDNav nav, List<String> sink) throws NavException {
        int textIdx = nav.getText();
        if (textIdx != -1) {
            sink.add(nav.toNormalizedString(textIdx));
        }
    }

    /**
     * Fills {@code amap} with the results of evaluating {@code @*} at the
     * cursor: the token at each returned index is used as the key and the
     * token right after it as the value. The cursor is saved and restored
     * around the evaluation; XPath/navigation exceptions are printed.
     */
    private static void loadAttributeMap(VTDNav nav, Map<String, String> amap) {
        nav.push();
        try {
            AutoPilot attrPilot = new AutoPilot(nav);
            attrPilot.selectXPath("@*");
            for (int nameIdx = attrPilot.evalXPath(); nameIdx != -1; nameIdx = attrPilot.evalXPath()) {
                String attrName = nav.toString(nameIdx);
                String attrValue = nav.toString(nameIdx + 1);
                amap.put(attrName, attrValue);
            }
        } catch (XPathParseException | XPathEvalException | NavException e) {
            e.printStackTrace();
        }
        nav.pop();
    }

    /** Reads books.xml, evaluates /catalog/book/snippet and prints each collected value on its own line. */
    public static void main(String[] args) {
        try {
            DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            Document document = builder.parse(new File("books.xml"));
            VTDXMLReader reader = new VTDXMLReader(document, false, "/catalog/book/snippet");
            for (String fragment : reader.readXML()) {
                System.out.println(fragment);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
以上代码的输出为:
XML IN-DEPT Developer's Guide
This is an example book for developers want to gain knowledge on
XML
XML parsing and editing
应该是:
XML IN-DEPT Developer's Guide
This is an example book for developers want to gain knowledge on
XML
Marshalling and UnMarshalling. Need to know all about
XML parsing and editing
, Grab this Book!
有什么想法吗?
我想做的事情: 如果以下是 html 文档中的段落标记:
<p>This is an example book for developers want to gain knowledge on <span class="boldcls" type="xml" >XML</span> Marshalling and UnMarshalling. Need to know all about <span class="boldcls" type="tech" >XML parsing and editing</span>, Grab this Book!</p>
我想写一个 Reader,从左到右读取它(包括属性值),并像下面这样逐行输出:
==> This is an example book for developers want to gain knowledge on
==> boldcls xml XML
==> Marshalling and UnMarshalling. Need to know all about
==> boldcls tech XML parsing and editing
==> , Grab this Book!
目前我正在使用 XMLEventReader 执行此操作,我想将其替换为 VTD-XML 库代码。
我对您的 navigateToChildren 子例程做了少量修改:调用 VTDNav 的 getXPathStringVal() 来获取所有文本节点。基本上,问题在于 getText() 对以数据为中心的 XML 文档工作正常;而对于以文档为中心的用例,应该调用 getXPathStringVal() 方法直接提取文本节点。此方法在较新版本的 vtd-xml 中可用。这是您要找的吗?
/**
 * Recursively visits the child elements of the cursor element. For each
 * child it optionally records and prints its attributes, and, when
 * getText() reports a text token, records and prints the element's
 * getXPathStringVal() result. The cursor is saved with push() on entry and
 * restored with pop() on normal exit; exceptions are printed, not rethrown.
 *
 * @param vn                navigator positioned on the parent element
 * @param includeAttributes whether attribute values are collected and printed
 * @param values            list that receives the collected strings
 */
public static void navigateToChildren(final VTDNav vn, final boolean includeAttributes, List<String> values) {
    try {
        vn.push();
        for (boolean more = vn.toElement(VTDNav.FIRST_CHILD); more; more = vn.toElement(VTDNav.NEXT_SIBLING)) {
            if (includeAttributes) {
                Map<String, String> attrs = new LinkedHashMap<String, String>();
                loadAttributeMap(vn, attrs);
                for (Map.Entry<String, String> attr : attrs.entrySet()) {
                    values.add(attr.getValue());
                    System.out.print(" ==>@" + attr.getKey() + "=" + attr.getValue());
                }
            }
            // getText() only decides whether the element is reported; the
            // recorded string comes from getXPathStringVal().
            if (vn.getText() != -1) {
                String text = vn.getXPathStringVal();
                values.add(text);
                System.out.println("==>\t" + text);
            }
            navigateToChildren(vn, includeAttributes, values);
        }
        vn.toElement(VTDNav.PARENT);
        vn.pop();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
第二次编辑:我写了一个小应用程序,把所有后代文本和属性值(attr val)拼接起来。基本上,它使用索引值直接访问底层 VTD 缓冲区,并按顺序扫描 VTD 记录。如果令牌类型是属性值或字符数据,应用程序就把它追加到字符串缓冲区中。
import com.ximpleware.*;
public class collectTokens {
public static void main(String[] s) throws VTDException{
VTDGen vg = new VTDGen();
if (!vg.parseFile("d:\xml\books.xml", true)){
return;
}
VTDNav vn = vg.getNav();
AutoPilot ap = new AutoPilot(vn);
ap.selectXPath("/catalog/book/snippet/inlineXML/html/body/p");
int i=ap.evalXPath();
// i points to the p element node
if (i!=-1){
int j = vn.getCurrentIndex();// get the token index of p
int d = vn.getTokenDepth(j);
int count = vn.getTokenCount();
int index=j+1;
// collect the text of all text and attr vals sequentially
StringBuilder sb = new StringBuilder(50);
while((index<count)){
if (vn.getTokenDepth(index)==d
&& vn.getTokenDepth(index)== VTDNav.TOKEN_STARTING_TAG)
break;
if (vn.getTokenType(index)== VTDNav.TOKEN_CHARACTER_DATA
|| vn.getTokenType(index)==VTDNav.TOKEN_ATTR_VAL){
sb.append(vn.toString(index)+" ");
}
index++;
}
System.out.println(sb);
}
}
}