Java 复杂/大型 Stax XML
Java Stax for Complex / Large XML
我有一个 4.2 GB 的 XML 文件!显然解析整个 DOM 是不切实际的。我一直在研究 SAX 和 STAX 来完成解析这个巨大的 XML 文件。然而,我见过的所有例子都很简单。我正在处理的 XML 文件已嵌套在嵌套上。有些区域超过 10 个级别。
我找到了这个教程,但不确定它是否是一个可行的解决方案。
http://www.javacodegeeks.com/2013/05/parsing-xml-using-dom-sax-and-stax-parser-in-java.html(使用 STAX 的底部示例)
我不太确定如何处理嵌套对象。
我创建了 Java 个对象来模仿 XML 的结构。这里有一些,太多了无法展示。
Record.java
public class Record implements Serializable {
String uid;
StaticData staticData;
DynamicData dynamicData;
}
Summary.java
public class Summary {
EWUID ewuid;
PubInfo pubInfo;
Titles titles;
Names names;
DocTypes docTypes;
Publishers publishers;
}
EWUID.java
public class EWUID {
String collId;
String edition;
}
PubInfo.java
public class PubInfo {
String coverDate;
String hasAbstract;
String issue;
String pubMonth;
String pubType;
String pubYear;
String sortDate;
String volume;
}
这是我到目前为止想出的代码。
public class TRWOSParser {
XMLEventReader eventReader;
XMLInputFactory inputFactory;
InputStream inputStream;
public TRWOSParser(String file) throws FileNotFoundException, XMLStreamException {
inputFactory = XMLInputFactory.newInstance();
inputStream = new FileInputStream(file);
eventReader = inputFactory.createXMLEventReader(inputStream);
}
public void parse() throws XMLStreamException{
while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
if (event.isStartElement()) {
StartElement startElement = event.asStartElement();
if (startElement.getName().getLocalPart().equals("record")) {
Record record = new Record();
Iterator<Attribute> attributes = startElement.getAttributes();
while (attributes.hasNext()) {
Attribute attribute = attributes.next();
if (attribute.getName().toString().equals("UID")) {
System.out.println("UID: " + attribute.getValue());
}
}
}
}
}
}
}
更新:
XML 中的数据已获得许可,因此我无法显示完整文件。这是一个非常非常小的片段,我在其中打乱了数据。
<?xml version="1.0" encoding="UTF-8"?>
<records>
<REC>
<UID>WOS:000310438600004</UID>
<static_data>
<summary>
<EWUID>
<WUID coll_id="WOS" />
<edition value="WOS.SCI" />
</EWUID>
<pub_info coverdate="NOV 2012" has_abstract="N" issue="5" pubmonth="NOV" pubtype="Journal" pubyear="2012" sortdate="2012-11-01" vol="188">
<page begin="1662" end="1663" page_count="2">1662-1663</page>
</pub_info>
<titles count="6">
<title type="source">JOURNAL OF UROLOGY</title>
<title type="source_abbrev">J UROLOGY</title>
<title type="abbrev_iso">J. Urol.</title>
<title type="abbrev_11">J UROL</title>
<title type="abbrev_29">J UROL</title>
<title type="item">Something something</title>
</titles>
<names count="1">
<name addr_no="1 2 3" reprint="Y" role="author" seq_no="1">
<display_name>John Doe</display_name>
<full_name>John Doe</full_name>
<wos_standard>Doe, John</wos_standard>
<first_name>John</first_name>
<last_name>Doe</last_name>
</name>
</names>
<doctypes count="1">
<doctype>Editorial Material</doctype>
</doctypes>
<publishers>
<publisher>
<address_spec addr_no="1">
<full_address>360 PARK AVE SOUTH, NEW YORK, NY 10010-1710 USA</full_address>
<city>NEW YORK</city>
</address_spec>
<names count="1">
<name addr_no="1" role="publisher" seq_no="1">
<display_name>ELSEVIER SCIENCE INC</display_name>
<full_name>ELSEVIER SCIENCE INC</full_name>
</name>
</names>
</publisher>
</publishers>
</summary>
</static_data>
</REC>
</records>
我做了两个假设 1) 存在早期的重复,以及 2) 您可以对部分文档做一些有意义的事情。
假设您可以移动某个级别的嵌套,然后多次处理文档,每次 "handle" 文档时删除工作级别的节点。这意味着在任何给定时间只有一个工作子树将在内存中。
这是一个有效的代码片段:
package bigparse;
import static javax.xml.stream.XMLStreamConstants.CHARACTERS;
import static javax.xml.stream.XMLStreamConstants.END_DOCUMENT;
import static javax.xml.stream.XMLStreamConstants.END_ELEMENT;
import static javax.xml.stream.XMLStreamConstants.START_DOCUMENT;
import static javax.xml.stream.XMLStreamConstants.START_ELEMENT;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.StringWriter;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
public class BigParse {
public static void main(String... args) {
XMLInputFactory factory = XMLInputFactory.newInstance();
DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
try {
XMLStreamReader streamReader = factory.createXMLStreamReader(new FileReader("src/main/resources/test.xml"));
DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder();
Document document = documentBuilder.newDocument();
Element rootElement = null;
Element currentElement = null;
int branchLevel = 0;
int maxBranchLevel = 1;
while (streamReader.hasNext()) {
int event = streamReader.next();
switch (event) {
case START_DOCUMENT:
continue;
case START_ELEMENT:
if (branchLevel < maxBranchLevel) {
Element workingElement = readElementOnly(streamReader, document);
if (rootElement == null) {
document.appendChild(workingElement);
rootElement = document.getDocumentElement();
currentElement = rootElement;
} else {
currentElement.appendChild(workingElement);
currentElement = workingElement;
}
branchLevel++;
} else {
workingLoop(streamReader, document, currentElement);
}
continue;
case CHARACTERS:
currentElement.setTextContent(streamReader.getText());
continue;
case END_ELEMENT:
if (currentElement != rootElement) {
currentElement = (Element) currentElement.getParentNode();
branchLevel--;
}
continue;
case END_DOCUMENT:
break;
}
}
} catch (ParserConfigurationException
| FileNotFoundException
| XMLStreamException e) {
throw new RuntimeException(e);
}
}
private static Element readElementOnly(XMLStreamReader streamReader, Document document) {
Element workingElement = document.createElement(streamReader.getLocalName());
for (int attributeIndex = 0; attributeIndex < streamReader.getAttributeCount(); attributeIndex++) {
workingElement.setAttribute(
streamReader.getAttributeLocalName(attributeIndex),
streamReader.getAttributeValue(attributeIndex));
}
return workingElement;
}
private static void workingLoop(final XMLStreamReader streamReader, final Document document, final Element fragmentRoot)
throws XMLStreamException {
Element startElement = readElementOnly(streamReader, document);
fragmentRoot.appendChild(startElement);
Element currentElement = startElement;
while (streamReader.hasNext()) {
int event = streamReader.next();
switch (event) {
case START_DOCUMENT:
continue;
case START_ELEMENT:
Element workingElement = readElementOnly(streamReader, document);
currentElement.appendChild(workingElement);
currentElement = workingElement;
continue;
case CHARACTERS:
currentElement.setTextContent(streamReader.getText());
continue;
case END_ELEMENT:
if (currentElement != startElement) {
currentElement = (Element) currentElement.getParentNode();
continue;
} else {
handleDocument(document, startElement);
fragmentRoot.removeChild(startElement);
startElement = null;
return;
}
}
}
}
// THIS FUNCTION DOES SOMETHING MEANINFUL
private static void handleDocument(Document document, Element startElement) {
System.out.println(stringify(document));
}
private static String stringify(Document document) {
try {
Transformer transformer = TransformerFactory.newInstance().newTransformer();
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
StreamResult result = new StreamResult(new StringWriter());
DOMSource source = new DOMSource(document);
transformer.transform(source, result);
String xmlString = result.getWriter().toString();
return xmlString;
} catch (Exception e) {
throw new RuntimeException(e);
}
}
}
编辑:我犯了一个非常愚蠢的错误。现在修好了。它有效但不完美 -- 应该足以引导您朝着有用的方向前进。
与 lscoughlin 的答案类似的解决方案是使用 DOM4J,它具有处理这种情况的机制:http://dom4j.sourceforge.net/
在我看来,它更直接也更容易理解。不过,它可能不支持命名空间。
考虑使用以下形式的 XSLT 3.0 流转换:
<xsl:template name="main">
<xsl:stream href="bigInput.xml">
<xsl:for-each select="copy-of(/records/REC)">
<!-- process one record -->
</xsl:for-each>
</xsl:stream>
</xsl:template>
您可以使用 Saxon-EE 9.6 进行处理。
"process one record" 逻辑可以使用 Saxon SQL 扩展,或者它可以调用扩展函数:上下文节点将是一个 REC 元素及其包含的树,在子树中完全可导航,但无法导航到当前正在处理的 REC 元素之外。
我有一个 4.2 GB 的 XML 文件!显然解析整个 DOM 是不切实际的。我一直在研究 SAX 和 STAX 来完成解析这个巨大的 XML 文件。然而,我见过的所有例子都很简单。我正在处理的 XML 文件已嵌套在嵌套上。有些区域超过 10 个级别。
我找到了这个教程,但不确定它是否是一个可行的解决方案。
http://www.javacodegeeks.com/2013/05/parsing-xml-using-dom-sax-and-stax-parser-in-java.html(使用 STAX 的底部示例)
我不太确定如何处理嵌套对象。
我创建了 Java 个对象来模仿 XML 的结构。这里有一些,太多了无法展示。
Record.java
public class Record implements Serializable {
String uid;
StaticData staticData;
DynamicData dynamicData;
}
Summary.java
public class Summary {
EWUID ewuid;
PubInfo pubInfo;
Titles titles;
Names names;
DocTypes docTypes;
Publishers publishers;
}
EWUID.java
public class EWUID {
String collId;
String edition;
}
PubInfo.java
public class PubInfo {
String coverDate;
String hasAbstract;
String issue;
String pubMonth;
String pubType;
String pubYear;
String sortDate;
String volume;
}
这是我到目前为止想出的代码。
public class TRWOSParser {
XMLEventReader eventReader;
XMLInputFactory inputFactory;
InputStream inputStream;
public TRWOSParser(String file) throws FileNotFoundException, XMLStreamException {
inputFactory = XMLInputFactory.newInstance();
inputStream = new FileInputStream(file);
eventReader = inputFactory.createXMLEventReader(inputStream);
}
public void parse() throws XMLStreamException{
while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
if (event.isStartElement()) {
StartElement startElement = event.asStartElement();
if (startElement.getName().getLocalPart().equals("record")) {
Record record = new Record();
Iterator<Attribute> attributes = startElement.getAttributes();
while (attributes.hasNext()) {
Attribute attribute = attributes.next();
if (attribute.getName().toString().equals("UID")) {
System.out.println("UID: " + attribute.getValue());
}
}
}
}
}
}
}
更新:
XML 中的数据已获得许可,因此我无法显示完整文件。这是一个非常非常小的片段,我在其中打乱了数据。
<?xml version="1.0" encoding="UTF-8"?>
<records>
<REC>
<UID>WOS:000310438600004</UID>
<static_data>
<summary>
<EWUID>
<WUID coll_id="WOS" />
<edition value="WOS.SCI" />
</EWUID>
<pub_info coverdate="NOV 2012" has_abstract="N" issue="5" pubmonth="NOV" pubtype="Journal" pubyear="2012" sortdate="2012-11-01" vol="188">
<page begin="1662" end="1663" page_count="2">1662-1663</page>
</pub_info>
<titles count="6">
<title type="source">JOURNAL OF UROLOGY</title>
<title type="source_abbrev">J UROLOGY</title>
<title type="abbrev_iso">J. Urol.</title>
<title type="abbrev_11">J UROL</title>
<title type="abbrev_29">J UROL</title>
<title type="item">Something something</title>
</titles>
<names count="1">
<name addr_no="1 2 3" reprint="Y" role="author" seq_no="1">
<display_name>John Doe</display_name>
<full_name>John Doe</full_name>
<wos_standard>Doe, John</wos_standard>
<first_name>John</first_name>
<last_name>Doe</last_name>
</name>
</names>
<doctypes count="1">
<doctype>Editorial Material</doctype>
</doctypes>
<publishers>
<publisher>
<address_spec addr_no="1">
<full_address>360 PARK AVE SOUTH, NEW YORK, NY 10010-1710 USA</full_address>
<city>NEW YORK</city>
</address_spec>
<names count="1">
<name addr_no="1" role="publisher" seq_no="1">
<display_name>ELSEVIER SCIENCE INC</display_name>
<full_name>ELSEVIER SCIENCE INC</full_name>
</name>
</names>
</publisher>
</publishers>
</summary>
</static_data>
</REC>
</records>
我做了两个假设 1) 存在早期的重复,以及 2) 您可以对部分文档做一些有意义的事情。
假设您可以移动某个级别的嵌套,然后多次处理文档,每次 "handle" 文档时删除工作级别的节点。这意味着在任何给定时间只有一个工作子树将在内存中。
这是一个有效的代码片段:
package bigparse;
import static javax.xml.stream.XMLStreamConstants.CHARACTERS;
import static javax.xml.stream.XMLStreamConstants.END_DOCUMENT;
import static javax.xml.stream.XMLStreamConstants.END_ELEMENT;
import static javax.xml.stream.XMLStreamConstants.START_DOCUMENT;
import static javax.xml.stream.XMLStreamConstants.START_ELEMENT;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.StringWriter;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
public class BigParse {
public static void main(String... args) {
XMLInputFactory factory = XMLInputFactory.newInstance();
DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
try {
XMLStreamReader streamReader = factory.createXMLStreamReader(new FileReader("src/main/resources/test.xml"));
DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder();
Document document = documentBuilder.newDocument();
Element rootElement = null;
Element currentElement = null;
int branchLevel = 0;
int maxBranchLevel = 1;
while (streamReader.hasNext()) {
int event = streamReader.next();
switch (event) {
case START_DOCUMENT:
continue;
case START_ELEMENT:
if (branchLevel < maxBranchLevel) {
Element workingElement = readElementOnly(streamReader, document);
if (rootElement == null) {
document.appendChild(workingElement);
rootElement = document.getDocumentElement();
currentElement = rootElement;
} else {
currentElement.appendChild(workingElement);
currentElement = workingElement;
}
branchLevel++;
} else {
workingLoop(streamReader, document, currentElement);
}
continue;
case CHARACTERS:
currentElement.setTextContent(streamReader.getText());
continue;
case END_ELEMENT:
if (currentElement != rootElement) {
currentElement = (Element) currentElement.getParentNode();
branchLevel--;
}
continue;
case END_DOCUMENT:
break;
}
}
} catch (ParserConfigurationException
| FileNotFoundException
| XMLStreamException e) {
throw new RuntimeException(e);
}
}
private static Element readElementOnly(XMLStreamReader streamReader, Document document) {
Element workingElement = document.createElement(streamReader.getLocalName());
for (int attributeIndex = 0; attributeIndex < streamReader.getAttributeCount(); attributeIndex++) {
workingElement.setAttribute(
streamReader.getAttributeLocalName(attributeIndex),
streamReader.getAttributeValue(attributeIndex));
}
return workingElement;
}
private static void workingLoop(final XMLStreamReader streamReader, final Document document, final Element fragmentRoot)
throws XMLStreamException {
Element startElement = readElementOnly(streamReader, document);
fragmentRoot.appendChild(startElement);
Element currentElement = startElement;
while (streamReader.hasNext()) {
int event = streamReader.next();
switch (event) {
case START_DOCUMENT:
continue;
case START_ELEMENT:
Element workingElement = readElementOnly(streamReader, document);
currentElement.appendChild(workingElement);
currentElement = workingElement;
continue;
case CHARACTERS:
currentElement.setTextContent(streamReader.getText());
continue;
case END_ELEMENT:
if (currentElement != startElement) {
currentElement = (Element) currentElement.getParentNode();
continue;
} else {
handleDocument(document, startElement);
fragmentRoot.removeChild(startElement);
startElement = null;
return;
}
}
}
}
// THIS FUNCTION DOES SOMETHING MEANINFUL
private static void handleDocument(Document document, Element startElement) {
System.out.println(stringify(document));
}
private static String stringify(Document document) {
try {
Transformer transformer = TransformerFactory.newInstance().newTransformer();
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
StreamResult result = new StreamResult(new StringWriter());
DOMSource source = new DOMSource(document);
transformer.transform(source, result);
String xmlString = result.getWriter().toString();
return xmlString;
} catch (Exception e) {
throw new RuntimeException(e);
}
}
}
编辑:我犯了一个非常愚蠢的错误。现在修好了。它有效但不完美 -- 应该足以引导您朝着有用的方向前进。
与 lscoughlin 的答案类似的解决方案是使用 DOM4J,它具有处理这种情况的机制:http://dom4j.sourceforge.net/
在我看来,它更直接也更容易理解。不过,它可能不支持命名空间。
考虑使用以下形式的 XSLT 3.0 流转换:
<xsl:template name="main">
<xsl:stream href="bigInput.xml">
<xsl:for-each select="copy-of(/records/REC)">
<!-- process one record -->
</xsl:for-each>
</xsl:stream>
</xsl:template>
您可以使用 Saxon-EE 9.6 进行处理。
"process one record" 逻辑可以使用 Saxon SQL 扩展,或者它可以调用扩展函数:上下文节点将是一个 REC 元素及其包含的树,在子树中完全可导航,但无法导航到当前正在处理的 REC 元素之外。