尽快将中型 XML 文件转换为 CSV
XML to CSV conversion as fast as possible for medium sized files
所以我查找并尝试了几种方法来将 XML 文件转换为 CSV 文件。我试过的方法是:
XSLT
: 得到给定 XML 的 XSLT
然后形成 CSV。但这太难维护了,因为我们不知道我们将获得什么样的 XML 文件,这使得它不是通用解决方案。
- 使用 Apache
Digester
SAXP
和 JAXP
上述两种方法的问题在于需要事先定义好 Java 对象,而我们并不知道 XML 的模式(schema)会是什么样,而且它每次都会变化,因此创建这么多类又成了一个瓶颈。
- 使用
DocumentBuilderFactory
并一直解析它。这适用于通用 XML 文件,但对于 5MB 到 1GB 范围内的文件来说速度很慢。我的 XML 文件肯定不会超过 1GB。
除了我已经尝试过的这些方法之外,关于如何以编程方式比上面更快地实现它的任何想法?我看过几个在线工具,它们可以在非常短的时间内将任何 XML 文件转换为 CSV 文件,而且它们似乎适用于任何通用 XML 文件。有什么建议吗?
以下是可能出现的不同示例,它们也可能会发生变化:
<?xml version="1.0"?>
<Company>
<Employee id="1">
<Email>tp@xyz.com</Email>
<artist>Bob Dylan</artist>
<country>USA</country>
</Employee>
</Company>
这是最简单的一个。预期输出为:
Company/Employee/Email,Company/Employee/artist,Company/Employee/country,Company/Employee/_id
tp@xyz.com,Bob Dylan,USA,1
另一个例子
<?xml version="1.0"?>
<Company>
<Employee id="1">
<Email>tp@xyz.com</Email>
<UserData id="id32" type="AttributesInContext">
<UserValue value="7in" title="Height"></UserValue>
<UserValue value="" title="Weight"></UserValue></UserData>
</Employee>
<Employee id="2">
<Email>tp@xyz.com</Email>
<UserData id="id33" type="AttributesInContext">
<UserValue value="6in" title="Height"></UserValue>
<UserValue value="" title="Weight"></UserValue></UserData>
</Employee>
<Employee id="3">
<Email>tp@xyz.com</Email>
<UserData id="id34" type="AttributesInContext">
<UserValue value="4in" title="Height"></UserValue>
<UserValue value="" title="Weight"></UserValue></UserData>
</Employee>
</Company>
预期输出为
Email,UserData/UserValue/0/_value,UserData/UserValue/0/_title,UserData/UserValue/1/_value,UserData/UserValue/1/_title,UserData/_id,UserData/_type,_id
tp@xyz.com,7in,Height,,Weight,id32,AttributesInContext,1
tp@xyz.com,6in,Height,,Weight,id33,AttributesInContext,2
tp@xyz.com,4in,Height,,Weight,id34,AttributesInContext,3
这有点复杂。这可能会变得更加复杂和嵌套,最大范围可达 1GB。
建议您尝试使用 Java StAX API。
例如:
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
public class XmlToCSV {
/**
 * Converts an XML document into comma-separated text: one header line followed by one line
 * per record.
 *
 * <p>The root element itself is skipped. Every direct child of the root is one record; the
 * attribute values and non-whitespace text found anywhere inside it are emitted, in the order
 * the parser reports them, as comma-separated fields. Trailing commas are removed from each
 * record line. Values are not CSV-escaped.
 *
 * <p>The header line is produced by {@code serializeElementHeader} from the first record only
 * (element names and {@code _}-prefixed attribute names, each followed by {@code /}) and ends
 * with the marker {@code 0/}. If the document has no records the header line is empty.
 *
 * <p>The whole result is buffered in memory and written to {@code csv} only after the document
 * has been parsed successfully.
 *
 * @param xml the XML document to read; DTDs and external entities are not processed
 * @param csv receives the UTF-8 encoded result and is closed once it has been written
 * @throws Exception if the XML cannot be parsed or the result cannot be written
 */
public static void convert(InputStream xml, OutputStream csv) throws Exception {
    XMLInputFactory factory = XMLInputFactory.newInstance();
    // The input is arbitrary XML, so refuse DTDs and external entities (XXE hardening).
    factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
    factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
    // Report each run of character data as ONE event; otherwise a value such as "AT&amp;T"
    // may arrive in several pieces and be written as several fields.
    factory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);

    try (StringWriter header = new StringWriter(4096);
            StringWriter content = new StringWriter(4096)) {
        XMLEventReader xmlEventReader = factory.createXMLEventReader(xml);
        try {
            long nestingLevel = -1; // -1: before the root, 0: inside the root, 1+: inside a record
            int headerLength = -1;  // length of the finished header; -1 until the first record ends
            StringBuilder line = null;
            while (xmlEventReader.hasNext()) {
                XMLEvent xmlEvent = xmlEventReader.nextEvent();
                switch (xmlEvent.getEventType()) {
                    case XMLEvent.START_ELEMENT:
                        ++nestingLevel;
                        if (nestingLevel == 0) {
                            break; // the root element contributes nothing
                        }
                        if (nestingLevel == 1) {
                            line = new StringBuilder(); // a new record starts
                        }
                        serializeElementHeader(header, line, xmlEvent.asStartElement());
                        break;
                    case XMLEvent.CHARACTERS:
                    case XMLEvent.CDATA:
                        if (nestingLevel < 1) {
                            break; // text directly under the root is ignored
                        }
                        Characters chars = xmlEvent.asCharacters();
                        if (!chars.isWhiteSpace()) {
                            line.append(chars.getData()).append(',');
                        }
                        break;
                    case XMLEvent.END_ELEMENT:
                        if (--nestingLevel == 0) {
                            if (headerLength < 0) {
                                // First record finished: freeze the header here.
                                header.write("0/");
                                headerLength = header.getBuffer().length();
                            } else {
                                // Drop what later records appended so the header is not
                                // repeated once per record.
                                header.getBuffer().setLength(headerLength);
                            }
                            // Trim trailing separators; the bound check keeps a record with no
                            // (or only empty) values from indexing before the start of the buffer.
                            int end = line.length();
                            while (end > 0 && line.charAt(end - 1) == ',') {
                                end--;
                            }
                            line.setLength(end);
                            content.write(line.toString());
                            content.write('\n');
                        }
                        break;
                    default:
                        break;
                }
            }
        } finally {
            // Releases parser resources; the caller stays responsible for closing xml.
            xmlEventReader.close();
        }
        // write csv
        try (Writer csvWriter = new OutputStreamWriter(csv, StandardCharsets.UTF_8)) {
            csvWriter.write(header.toString());
            csvWriter.write('\n');
            csvWriter.write(content.toString());
        }
    }
}
/**
 * Records one start tag in both the header and the current record line.
 *
 * <p>Appends the element's local name followed by {@code /} to {@code header}. Then, for every
 * attribute the start tag reports, appends {@code _} + the attribute's local name + {@code /}
 * to {@code header} and the attribute's value followed by {@code ,} to {@code line}.
 *
 * @param header accumulates the header text
 * @param line accumulates the comma-terminated values of the current record
 * @param startElement the start tag being processed
 */
private static void serializeElementHeader(StringWriter header, StringBuilder line,
        StartElement startElement) {
    String elementName = startElement.getName().getLocalPart();
    header.append(elementName).append('/');

    for (Iterator<Attribute> attributes = startElement.getAttributes(); attributes.hasNext();) {
        Attribute attribute = attributes.next();
        String attributeName = attribute.getName().getLocalPart();
        header.append('_').append(attributeName).append('/');
        line.append(attribute.getValue()).append(',');
    }
}
/**
 * Sample document used by {@code main}: a {@code Company} root holding three {@code Employee}
 * records, each with an {@code Email} and a {@code UserData} element containing two
 * {@code UserValue} entries. Declared {@code final} because it is a constant fixture.
 */
private static final String TEST_XML = "<?xml version='1.0'?>"
        + "<Company>"
        + " <Employee id='1'>"
        + " <Email>tp@xyz.com</Email>"
        + " <UserData id='id32' type='AttributesInContext'>"
        + " <UserValue value='7in' title='Heigh'></UserValue>"
        + " <UserValue value='' title='Weight'></UserValue>"
        + " </UserData>"
        + " </Employee>"
        + " <Employee id='2'>"
        + " <Email>tp@xyz.com</Email>"
        + " <UserData id='id33' type='AttributesInContext'>"
        + " <UserValue value='6in' title='Heigh'></UserValue>"
        + " <UserValue value='' title='Weight'></UserValue>"
        + " </UserData>"
        + " </Employee>"
        + " <Employee id='3'>"
        + " <Email>tp@xyz.com</Email>"
        + " <UserData id='id34' type='AttributesInContext'>"
        + " <UserValue value='4in' title='Heigh'></UserValue>"
        + " <UserValue value='' title='Weight'></UserValue>"
        + " </UserData>"
        + " </Employee>"
        + "</Company>";
/**
 * Demo entry point: converts {@code TEST_XML} in memory and prints the result to standard
 * output.
 *
 * @param args unused
 * @throws Exception if the conversion fails
 */
public static void main(String[] args) throws Exception {
    // TEST_XML's declaration names no encoding, so the parser assumes UTF-8; encode the bytes
    // as UTF-8 instead of relying on the platform default charset.
    try (InputStream in = new ByteArrayInputStream(TEST_XML.getBytes(StandardCharsets.UTF_8));
            ByteArrayOutputStream out = new ByteArrayOutputStream(4096)) {
        convert(in, out);
        // convert() writes UTF-8, so decode with the same charset rather than the default one.
        System.out.print(new String(out.toByteArray(), StandardCharsets.UTF_8));
    }
}
所以我查找并尝试了几种方法来将 XML 文件转换为 CSV 文件。我试过的方法是:
XSLT
: 得到给定 XML 的XSLT
然后形成 CSV。但这太难维护了,因为我们不知道我们将获得什么样的 XML 文件,这使得它不是通用解决方案。- 使用 Apache
Digester
SAXP
和JAXP
上述两种方法的问题在于需要事先定义好 Java 对象,而我们并不知道 XML 的模式(schema)会是什么样,而且它每次都会变化,因此创建这么多类又成了一个瓶颈。- 使用
DocumentBuilderFactory
并一直解析它。这适用于通用 XML 文件,但对于 5MB 到 1GB 范围内的文件来说速度很慢。我的 XML 文件肯定不会超过 1GB。
除了我已经尝试过的这些方法之外,关于如何以编程方式比上面更快地实现它的任何想法?我看过几个在线工具,它们可以在非常短的时间内将任何 XML 文件转换为 CSV 文件,而且它们似乎适用于任何通用 XML 文件。有什么建议吗?
以下是可能出现的不同示例,它们也可能会发生变化:
<?xml version="1.0"?>
<Company>
<Employee id="1">
<Email>tp@xyz.com</Email>
<artist>Bob Dylan</artist>
<country>USA</country>
</Employee>
</Company>
这是最简单的一个。预期输出为:
Company/Employee/Email,Company/Employee/artist,Company/Employee/country,Company/Employee/_id
tp@xyz.com,Bob Dylan,USA,1
另一个例子
<?xml version="1.0"?>
<Company>
<Employee id="1">
<Email>tp@xyz.com</Email>
<UserData id="id32" type="AttributesInContext">
<UserValue value="7in" title="Height"></UserValue>
<UserValue value="" title="Weight"></UserValue></UserData>
</Employee>
<Employee id="2">
<Email>tp@xyz.com</Email>
<UserData id="id33" type="AttributesInContext">
<UserValue value="6in" title="Height"></UserValue>
<UserValue value="" title="Weight"></UserValue></UserData>
</Employee>
<Employee id="3">
<Email>tp@xyz.com</Email>
<UserData id="id34" type="AttributesInContext">
<UserValue value="4in" title="Height"></UserValue>
<UserValue value="" title="Weight"></UserValue></UserData>
</Employee>
</Company>
预期输出为
Email,UserData/UserValue/0/_value,UserData/UserValue/0/_title,UserData/UserValue/1/_value,UserData/UserValue/1/_title,UserData/_id,UserData/_type,_id
tp@xyz.com,7in,Height,,Weight,id32,AttributesInContext,1
tp@xyz.com,6in,Height,,Weight,id33,AttributesInContext,2
tp@xyz.com,4in,Height,,Weight,id34,AttributesInContext,3
这有点复杂。这可能会变得更加复杂和嵌套,最大范围可达 1GB。
建议您尝试使用 Java StAX API。
例如:
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
public class XmlToCSV {
/**
 * Converts an XML document into comma-separated text: one header line followed by one line
 * per record.
 *
 * <p>The root element itself is skipped. Every direct child of the root is one record; the
 * attribute values and non-whitespace text found anywhere inside it are emitted, in the order
 * the parser reports them, as comma-separated fields. Trailing commas are removed from each
 * record line. Values are not CSV-escaped.
 *
 * <p>The header line is produced by {@code serializeElementHeader} from the first record only
 * (element names and {@code _}-prefixed attribute names, each followed by {@code /}) and ends
 * with the marker {@code 0/}. If the document has no records the header line is empty.
 *
 * <p>The whole result is buffered in memory and written to {@code csv} only after the document
 * has been parsed successfully.
 *
 * @param xml the XML document to read; DTDs and external entities are not processed
 * @param csv receives the UTF-8 encoded result and is closed once it has been written
 * @throws Exception if the XML cannot be parsed or the result cannot be written
 */
public static void convert(InputStream xml, OutputStream csv) throws Exception {
    XMLInputFactory factory = XMLInputFactory.newInstance();
    // The input is arbitrary XML, so refuse DTDs and external entities (XXE hardening).
    factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
    factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
    // Report each run of character data as ONE event; otherwise a value such as "AT&amp;T"
    // may arrive in several pieces and be written as several fields.
    factory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);

    try (StringWriter header = new StringWriter(4096);
            StringWriter content = new StringWriter(4096)) {
        XMLEventReader xmlEventReader = factory.createXMLEventReader(xml);
        try {
            long nestingLevel = -1; // -1: before the root, 0: inside the root, 1+: inside a record
            int headerLength = -1;  // length of the finished header; -1 until the first record ends
            StringBuilder line = null;
            while (xmlEventReader.hasNext()) {
                XMLEvent xmlEvent = xmlEventReader.nextEvent();
                switch (xmlEvent.getEventType()) {
                    case XMLEvent.START_ELEMENT:
                        ++nestingLevel;
                        if (nestingLevel == 0) {
                            break; // the root element contributes nothing
                        }
                        if (nestingLevel == 1) {
                            line = new StringBuilder(); // a new record starts
                        }
                        serializeElementHeader(header, line, xmlEvent.asStartElement());
                        break;
                    case XMLEvent.CHARACTERS:
                    case XMLEvent.CDATA:
                        if (nestingLevel < 1) {
                            break; // text directly under the root is ignored
                        }
                        Characters chars = xmlEvent.asCharacters();
                        if (!chars.isWhiteSpace()) {
                            line.append(chars.getData()).append(',');
                        }
                        break;
                    case XMLEvent.END_ELEMENT:
                        if (--nestingLevel == 0) {
                            if (headerLength < 0) {
                                // First record finished: freeze the header here.
                                header.write("0/");
                                headerLength = header.getBuffer().length();
                            } else {
                                // Drop what later records appended so the header is not
                                // repeated once per record.
                                header.getBuffer().setLength(headerLength);
                            }
                            // Trim trailing separators; the bound check keeps a record with no
                            // (or only empty) values from indexing before the start of the buffer.
                            int end = line.length();
                            while (end > 0 && line.charAt(end - 1) == ',') {
                                end--;
                            }
                            line.setLength(end);
                            content.write(line.toString());
                            content.write('\n');
                        }
                        break;
                    default:
                        break;
                }
            }
        } finally {
            // Releases parser resources; the caller stays responsible for closing xml.
            xmlEventReader.close();
        }
        // write csv
        try (Writer csvWriter = new OutputStreamWriter(csv, StandardCharsets.UTF_8)) {
            csvWriter.write(header.toString());
            csvWriter.write('\n');
            csvWriter.write(content.toString());
        }
    }
}
/**
 * Records one start tag in both the header and the current record line.
 *
 * <p>Appends the element's local name followed by {@code /} to {@code header}. Then, for every
 * attribute the start tag reports, appends {@code _} + the attribute's local name + {@code /}
 * to {@code header} and the attribute's value followed by {@code ,} to {@code line}.
 *
 * @param header accumulates the header text
 * @param line accumulates the comma-terminated values of the current record
 * @param startElement the start tag being processed
 */
private static void serializeElementHeader(StringWriter header, StringBuilder line,
        StartElement startElement) {
    String elementName = startElement.getName().getLocalPart();
    header.append(elementName).append('/');

    for (Iterator<Attribute> attributes = startElement.getAttributes(); attributes.hasNext();) {
        Attribute attribute = attributes.next();
        String attributeName = attribute.getName().getLocalPart();
        header.append('_').append(attributeName).append('/');
        line.append(attribute.getValue()).append(',');
    }
}
/**
 * Sample document used by {@code main}: a {@code Company} root holding three {@code Employee}
 * records, each with an {@code Email} and a {@code UserData} element containing two
 * {@code UserValue} entries. Declared {@code final} because it is a constant fixture.
 */
private static final String TEST_XML = "<?xml version='1.0'?>"
        + "<Company>"
        + " <Employee id='1'>"
        + " <Email>tp@xyz.com</Email>"
        + " <UserData id='id32' type='AttributesInContext'>"
        + " <UserValue value='7in' title='Heigh'></UserValue>"
        + " <UserValue value='' title='Weight'></UserValue>"
        + " </UserData>"
        + " </Employee>"
        + " <Employee id='2'>"
        + " <Email>tp@xyz.com</Email>"
        + " <UserData id='id33' type='AttributesInContext'>"
        + " <UserValue value='6in' title='Heigh'></UserValue>"
        + " <UserValue value='' title='Weight'></UserValue>"
        + " </UserData>"
        + " </Employee>"
        + " <Employee id='3'>"
        + " <Email>tp@xyz.com</Email>"
        + " <UserData id='id34' type='AttributesInContext'>"
        + " <UserValue value='4in' title='Heigh'></UserValue>"
        + " <UserValue value='' title='Weight'></UserValue>"
        + " </UserData>"
        + " </Employee>"
        + "</Company>";
/**
 * Demo entry point: converts {@code TEST_XML} in memory and prints the result to standard
 * output.
 *
 * @param args unused
 * @throws Exception if the conversion fails
 */
public static void main(String[] args) throws Exception {
    // TEST_XML's declaration names no encoding, so the parser assumes UTF-8; encode the bytes
    // as UTF-8 instead of relying on the platform default charset.
    try (InputStream in = new ByteArrayInputStream(TEST_XML.getBytes(StandardCharsets.UTF_8));
            ByteArrayOutputStream out = new ByteArrayOutputStream(4096)) {
        convert(in, out);
        // convert() writes UTF-8, so decode with the same charset rather than the default one.
        System.out.print(new String(out.toByteArray(), StandardCharsets.UTF_8));
    }
}