VTD-XML - 无法在 span 标记后获取文本

VTD-XML - Not able to get text after span tag

<?xml version="1.0"?>
<catalog>
    <book id="bk001" type='fiction'>
        <author>Gambardella, Matthew</author>
        <author>Doe, John</author>
        <title>XML IN-DEPT Developer's Guide</title>
        <genre>Computer</genre>
        <price>44.95</price>
        <snippet>
            <inlineXML contenttype="application/xhtml+xml" >
                <html lang="en-US" >
                    <head>
                        <title>XML IN-DEPT Developer's Guide</title>
                    </head>
                    <body>
                        <p>This is an example book for developers want to gain knowledge on  <span class="boldcls" type="xml" >XML</span> Marshalling and UnMarshalling. Need to know all about <span class="boldcls" type="tech" >XML parsing and editing</span>, Grab this Book!</p>
                    </body>
                </html>
            </inlineXML>
        </snippet>
    </book>
</catalog>

以上是 XML 示例,我想计算 XPath 表达式“/book/snippet”并遍历所有元素并获取文本。我正在使用 VTD-XML 库使用此 () 修改后的代码(如下面的更新中所示),但问题是它在遇到 span 标记后无法获取文本。所以我现在得到的段落标签的输出是:

    Level [6] Tag [p]   
            This is an example book for developers want to gain knowledge on
    Level [7] Tag [span] @class=boldcls
            XML
    Level [8] Tag [span] @class=boldcls
            XML parsing and editing

错了,应该是:

    Level [6] Tag [p]   
            This is an example book for developers want to gain knowledge on XML Marshalling and UnMarshalling. Need to know all about XML parsing and editing, Grab this Book!
    Level [7] Tag [span] @class=boldcls
            XML
    Level [8] Tag [span] @class=boldcls
            XML parsing and editing

更新: 我稍微修改了示例代码:

package com.vtd.test;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.w3c.dom.Document;

import com.ximpleware.AutoPilot;
import com.ximpleware.NavException;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
import com.ximpleware.XPathEvalException;
import com.ximpleware.XPathParseException;

public class VTDXMLReader {

    // private String xpathExpression;

    private VTDNav vtdNav;

    private AutoPilot autoPilot;

    private boolean includeAttributes;

    private String attribute;

    public VTDXMLReader(final Document storyDoc, final boolean includeAttributes, final String xpathExpression) {
        this.includeAttributes = includeAttributes;
        // this.xpathExpression = xpathExpression;
        final VTDGen vtdGen = new VTDGen();
        try {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            TransformerFactory transformerFactory = TransformerFactory.newInstance();
            Transformer transformer = transformerFactory.newTransformer();
            DOMSource source = new DOMSource(storyDoc);
            StreamResult result = new StreamResult(baos);
            transformer.transform(source, result);
            byte[] array = baos.toByteArray();

            vtdGen.setDoc(array);
            vtdGen.parse(true);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        vtdNav = vtdGen.getNav();
        autoPilot = new AutoPilot(vtdNav);
        String[] xpathFrags = xpathExpression.split("/");
        if (xpathFrags[xpathFrags.length - 1].startsWith("@")) {
            attribute = xpathFrags[xpathFrags.length - 1].replaceAll("@", "");
        }
        try {
            autoPilot.selectXPath(xpathExpression);
        } catch (XPathParseException e) {
            e.printStackTrace();
        }
    }

    public List<String> readXML() throws IOException {
        List<String> values = new ArrayList<String>();
        try {
            while (autoPilot.evalXPath() != -1) {
                // printTag(vn);
                if (includeAttributes) {
                    Map<String, String> amap = new LinkedHashMap<String, String>();

                    loadAttributeMap(vtdNav, amap);

                    for (String aname : amap.keySet()) {
                        String aval = amap.get(aname);
                        values.add(aval);
                        // System.out.print(" @" + aname + "=" + aval);
                    }
                    // System.out.print("\n");
                }
                int val = 0;
                if (attribute != null && !attribute.isEmpty()) {
                    val = vtdNav.getAttrVal(attribute);
                    if (val != -1) {
                        String id = vtdNav.toNormalizedString(val);
                        values.add(id);
                        // System.out.println("Attribute: " + id);
                    }
                }
                val = vtdNav.getText();
                if (val != -1) {
                    String author = vtdNav.toNormalizedString(val);
                    values.add(author);
                    // System.out.println("\t" + author);
                }
                navigateToChildren(vtdNav, includeAttributes, values);

            }
            // autoPilot.resetXPath();
        } catch (Exception ex) {
            ex.printStackTrace();
        }

        return values;

    }

    public static void navigateToChildren(final VTDNav vn, final boolean includeAttributes, List<String> values) {
        try {
            vn.push();
            if (vn.toElement(VTDNav.FIRST_CHILD)) {
                do {
                    // printTag(vn);

                    if (includeAttributes) {
                        Map<String, String> amap = new LinkedHashMap<String, String>();

                        loadAttributeMap(vn, amap);

                        for (String aname : amap.keySet()) {
                            String aval = amap.get(aname);
                            values.add(aval);
                            // System.out.print(" @" + aname + "=" + aval);
                        }
                        // System.out.print("\n");
                    }

                    int val = vn.getText();
                    if (val != -1) {
                        String author = vn.toNormalizedString(val);
                        values.add(author);
                        // System.out.println("\t" + author);
                    }
                    navigateToChildren(vn, includeAttributes, values);
                } while (vn.toElement(VTDNav.NEXT_SIBLING));
            }
            vn.toElement(VTDNav.PARENT);
            vn.pop();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    private static void loadAttributeMap(VTDNav nav, Map<String, String> amap) {

        nav.push();

        try {
            AutoPilot apAtt = new AutoPilot(nav);
            apAtt.selectXPath("@*");

            int j = -1;
            while ((j = apAtt.evalXPath()) != -1) {
                String name = nav.toString(j);
                String val = nav.toString(j + 1);

                amap.put(name, val);
            }
        } catch (XPathParseException | XPathEvalException | NavException e) {
            e.printStackTrace();
        }

        nav.pop();
    }

    public static void main(String[] args) {
        try {
            DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
            Document document = dBuilder.parse(new File("books.xml"));

            VTDXMLReader vtdxmlReader = new VTDXMLReader(document, false, "/catalog/book/snippet");
            List<String> xmlFrags = vtdxmlReader.readXML();
            for (String xmlFrag : xmlFrags) {
                System.out.println(xmlFrag);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

以上代码的输出为:

XML IN-DEPT Developer's Guide
This is an example book for developers want to gain knowledge on
XML
XML parsing and editing

应该是:

XML IN-DEPT Developer's Guide
This is an example book for developers want to gain knowledge on
XML
Marshalling and UnMarshalling. Need to know all about
XML parsing and editing
, Grab this Book!

有什么想法吗?

我想做的事情: 如果以下是 html 文档中的段落标记:

<p>This is an example book for developers want to gain knowledge on  <span class="boldcls" type="xml" >XML</span> Marshalling and UnMarshalling. Need to know all about <span class="boldcls" type="tech" >XML parsing and editing</span>, Grab this Book!</p>

我想写一个 Reader 从左到右读取它,包括属性值,如下一行一行:

==> This is an example book for developers want to gain knowledge on
==> boldcls xml XML
==> Marshalling and UnMarshalling. Need to know all about
==> boldcls tech XML parsing and editing
==> , Grab this Book!

目前我正在使用 XMLEventReader 执行此操作,我想将其替换为 VTD-XML 库代码。

我对您的 navigateToChildren 子例程进行了轻微的 mod...我调用了 VTDNav 的 getXPathStringVal() 来获取所有文本节点...基本上,问题是 getText() 对于以数据为中心的工作正常xml 文档...对于以文档为中心的用例,您应该调用 getXPathStringVal() 方法直接提取文本节点...此方法在较新版本的 vtd-xml 中可用。这是您要找的吗?

public static void navigateToChildren(final VTDNav vn, final boolean includeAttributes, List<String> values) {
        try {
            vn.push();
            if (vn.toElement(VTDNav.FIRST_CHILD)) {
                do {
                    //printTag(vn);

                    if (includeAttributes) {
                        Map<String, String> amap = new LinkedHashMap<String, String>();

                        loadAttributeMap(vn, amap);

                        for (String aname : amap.keySet()) {
                            String aval = amap.get(aname);
                            values.add(aval);
                            System.out.print(" ==>@" + aname + "=" + aval);
                        }
                        // System.out.print("\n");
                    }

                    int val = vn.getText();

                    if (val != -1) {
                        String author = vn.getXPathStringVal();
                        values.add(author);
                        System.out.println("==>\t" + author);
                    }
                    navigateToChildren(vn, includeAttributes, values);
                } while (vn.toElement(VTDNav.NEXT_SIBLING));
            }
            vn.toElement(VTDNav.PARENT);
            vn.pop();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

第二次编辑:我写了一个小应用程序,它执行所有从属文本和 attr val 连接。基本上它使用索引值直接访问底层 VTD 缓冲区...并按顺序扫描 VTD 记录。如果令牌类型是 attr val 或字符数据,应用程序将其附加到字符串缓冲区...

import com.ximpleware.*;

public class collectTokens {
    public static void main(String[] s) throws VTDException{
        VTDGen vg = new VTDGen();
        if (!vg.parseFile("d:\xml\books.xml", true)){
            return;
        }
        VTDNav vn = vg.getNav();
        AutoPilot ap = new AutoPilot(vn);
        ap.selectXPath("/catalog/book/snippet/inlineXML/html/body/p");
        int i=ap.evalXPath();
        // i points to the p element node
        if (i!=-1){
            int j = vn.getCurrentIndex();// get the token index of p
            int d = vn.getTokenDepth(j);
            int count = vn.getTokenCount();
            int index=j+1;
            // collect the text of all text and attr vals  sequentially
            StringBuilder sb = new StringBuilder(50);
            while((index<count)){
                if (vn.getTokenDepth(index)==d 
                        && vn.getTokenDepth(index)== VTDNav.TOKEN_STARTING_TAG)
                    break;
                if (vn.getTokenType(index)== VTDNav.TOKEN_CHARACTER_DATA
                        || vn.getTokenType(index)==VTDNav.TOKEN_ATTR_VAL){
                            sb.append(vn.toString(index)+" ");
                        }
                index++;
            }
            System.out.println(sb);
        }
    }
}