合并大量 XML 文件
Merging Lots of XML files
我有很多 XML 文件需要合并。我尝试了 “merging xml files using python's ElementTree” 这个链接中的方法,
其代码如下(已根据我的需要修改):
import os, os.path, sys
import glob
from xml.etree import ElementTree
def run(files):
    """Merge every *.xml file found in directory `files` into myxmlfile.xml.

    The root of the first file that yields a TALLYMESSAGE element becomes the
    merged tree. For every TALLYMESSAGE visited after that, its children are
    appended to the merged tree's first ./BODY/DATA/TALLYMESSAGE element.
    """
    # NOTE(review): indentation was missing from this listing and has been
    # reconstructed; confirm the nesting against the original script.
    xml_files = glob.glob(files +"/*.xml")
    xml_element_tree = None
    for xml_file in xml_files:
        print xml_file
        # Each file is parsed completely into memory before it is scanned.
        data = ElementTree.parse(xml_file).getroot()
        # print ElementTree.tostring(data)
        for result in data.iter('TALLYMESSAGE'):
            if xml_element_tree is None:
                # First TALLYMESSAGE seen: keep this file's root as the merged tree.
                xml_element_tree = data
                insertion_point = xml_element_tree.findall("./BODY/DATA/TALLYMESSAGE")[0]
            else:
                # extend() appends the children of `result`, not `result` itself.
                insertion_point.extend(result)
    if xml_element_tree is not None:
        # The whole merged tree is serialized into one string before it is written.
        f = open("myxmlfile.xml", "wb")
        f.write(ElementTree.tostring(xml_element_tree))
        # NOTE(review): f is never closed explicitly.
run("F:/data/data")
但问题是我有很多 XML 文件,准确地说是 365 个,每个至少 2 MB。将它们全部合并导致我的电脑崩溃。
这是我的 xml 文件的 xml 树的图像:
我的新更新代码是:
import os, os.path, sys
import glob
from lxml import etree
def XSLFILE(files):
    """Generate an XSLT stylesheet for the *.xml files in `files` and apply it.

    The stylesheet text is built in memory with one xsl:copy-of/document()
    line for every globbed file except the first, written to parsingxsl.xsl,
    then applied to the first globbed file. The serialized result is printed
    and written to OutputFile.xml.
    """
    # NOTE(review): indentation was missing from this listing and has been
    # reconstructed; confirm the nesting against the original script.
    xml_files = glob.glob(files +"/*.xml")
    #print xml_files[0]
    # NOTE(review): match="/DATA" only matches when DATA is the document
    # element, while the document() paths below expect DATA under BODY. If the
    # template never fires, the result may have no root element - confirm.
    xslstring = """<?xml version="1.0" ?>
<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<xsl:template match="/DATA">
<DATA>
<xsl:copy>
<xsl:copy-of select="TALLYMESSAGE"/>\n"""
    #print xslstring
    # The first file is skipped here because it is the document the
    # transform is applied to further down.
    for xmlfile in xml_files[1:]:
        # xmlfile[-16:] keeps only the last 16 characters of the path.
        # NOTE(review): presumably the bare file name - verify the name length.
        xslstring = xslstring + '<xsl:copy-of select="document(\'' + xmlfile[-16:] + "')/BODY/DATA/TALLYMESSAGE\"/>\n"
    # NOTE(review): the "+" after </xsl:copy> is literal text inside the template.
    xslstring = xslstring + """</xsl:copy>+
</DATA>
</xsl:template>
</xsl:transform>"""
    #print xslstring
    # NOTE(review): written relative to the current directory but read back
    # from F:\data below; the two are the same file only when cwd is F:\data.
    with open("parsingxsl.xsl", "w") as f:
        f.write(xslstring)
    with open(xml_files[0], "r") as f:
        dom = etree.XML(f.read())
    print etree.tostring(dom)
    # The paths below are not raw strings; "\d", "\p" and "\O" are not
    # recognised escapes, so the backslashes stay literal.
    with open('F:\data\parsingxsl.xsl', "r") as f:
        xslt_tree = etree.XML(f.read())
    print xslt_tree
    transform = etree.XSLT(xslt_tree)
    newdom = transform(dom)
    #print newdom
    # NOTE(review): the reported TypeError means tree_out was None at the
    # write() call below, i.e. tostring() produced no output for newdom.
    tree_out = etree.tostring(newdom, encoding='UTF-8', pretty_print=True, xml_declaration=True)
    print(tree_out)
    xmlfile = open('F:\data\OutputFile.xml','wb')
    xmlfile.write(tree_out)
    xmlfile.close()
XSLFILE("F:\data\data")
运行时同样会产生以下错误:
Traceback (most recent call last):
File "F:\data\xmlmergexsl.py", line 38, in <module>
XSLFILE("F:\data\data")
File "F:\data\xmlmergexsl.py", line 36, in XSLFILE
xmlfile.write(tree_out)
TypeError: must be string or buffer, not None
可以考虑使用 XSLT 及其 document() 函数来合并 XML 文件。Python(和许多面向对象编程语言一样)提供了 XSLT 处理器,例如其 lxml 模块。补充说明:XSLT 是一种声明式编程语言,可以将 XML 文件转换为各种格式和结构。
出于您的目的,XSLT 可能比使用编程代码开发文件更有效,因为除了 XSLT 处理器将使用的内容之外,在处理过程中没有列表或循环或其他对象保存在内存中。
XSLT(在外部保存为 .xsl 文件)
可以先用一个 Python 循环把全部 365 个文档的引用写入文本文件来生成该样式表,以避免手工复制粘贴。另请注意,第一个文档被跳过,因为它是下面 Python 脚本所使用的起始文档:
<?xml version="1.0" ?>
<!-- Copies the TALLYMESSAGE elements of the source document and of each file named in a document() call into one output. -->
<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<!-- Matches DATA elements of the source document handed to the XSLT processor. -->
<xsl:template match="DATA">
<!-- NOTE(review): a literal DATA element wrapping xsl:copy of the matched DATA looks like it emits DATA nested inside DATA; confirm that is intended. -->
<DATA>
<xsl:copy>
<!-- TALLYMESSAGE children of the matched DATA element in the source document. -->
<xsl:copy-of select="TALLYMESSAGE"/>
<!-- One copy-of per additional file: document() loads the named file and the path selects its TALLYMESSAGE elements. -->
<!-- NOTE(review): relative file names are presumably resolved against the stylesheet's own location; verify where the .xsl file is saved. -->
<xsl:copy-of select="document('Document2.xml')/BODY/DATA/TALLYMESSAGE"/>
<xsl:copy-of select="document('Document3.xml')/BODY/DATA/TALLYMESSAGE"/>
<xsl:copy-of select="document('Document4.xml')/BODY/DATA/TALLYMESSAGE"/>
<!-- The "..." line below stands in for the omitted copy-of lines and must be replaced with them before the stylesheet is used. -->
...
<xsl:copy-of select="document('Document365.xml')/BODY/DATA/TALLYMESSAGE"/>
</xsl:copy>
</DATA>
</xsl:template>
</xsl:transform>
Python(将包含在您的整个脚本中)
import lxml.etree as ET

# Raw strings keep the Windows backslashes literal. Without the r prefix the
# "\f" in "\file.xsl" is read as a form-feed character and the path is wrong.
dom = ET.parse(r'C:\Path\To\XML\Document1.xml')
xslt = ET.parse(r'C:\Path\To\XSL\file.xsl')

# Compile the stylesheet once, then apply it to the starting document.
transform = ET.XSLT(xslt)
newdom = transform(dom)

# Serialize the result tree to UTF-8 bytes with an XML declaration.
tree_out = ET.tostring(newdom, encoding='UTF-8', pretty_print=True, xml_declaration=True)
print(tree_out)

# Binary mode because tree_out is already encoded; the with-block closes the
# file even if the write fails.
with open(r'C:\Path\To\XML\OutputFile.xml', 'wb') as xmlfile:
    xmlfile.write(tree_out)
我有很多 XML 文件需要合并。我尝试了 “merging xml files using python's ElementTree” 这个链接中的方法,其代码如下(已根据我的需要修改):
import os, os.path, sys
import glob
from xml.etree import ElementTree
def run(files):
    """Merge every *.xml file found in directory `files` into myxmlfile.xml.

    The root of the first file that yields a TALLYMESSAGE element becomes the
    merged tree. For every TALLYMESSAGE visited after that, its children are
    appended to the merged tree's first ./BODY/DATA/TALLYMESSAGE element.
    """
    # NOTE(review): indentation was missing from this listing and has been
    # reconstructed; confirm the nesting against the original script.
    xml_files = glob.glob(files +"/*.xml")
    xml_element_tree = None
    for xml_file in xml_files:
        print xml_file
        # Each file is parsed completely into memory before it is scanned.
        data = ElementTree.parse(xml_file).getroot()
        # print ElementTree.tostring(data)
        for result in data.iter('TALLYMESSAGE'):
            if xml_element_tree is None:
                # First TALLYMESSAGE seen: keep this file's root as the merged tree.
                xml_element_tree = data
                insertion_point = xml_element_tree.findall("./BODY/DATA/TALLYMESSAGE")[0]
            else:
                # extend() appends the children of `result`, not `result` itself.
                insertion_point.extend(result)
    if xml_element_tree is not None:
        # The whole merged tree is serialized into one string before it is written.
        f = open("myxmlfile.xml", "wb")
        f.write(ElementTree.tostring(xml_element_tree))
        # NOTE(review): f is never closed explicitly.
run("F:/data/data")
但问题是我有很多 XML 文件,准确地说是 365 个,每个至少 2 MB。将它们全部合并导致我的电脑崩溃。
这是我的 xml 文件的 xml 树的图像:
我的新更新代码是:
import os, os.path, sys
import glob
from lxml import etree
def XSLFILE(files):
    """Generate an XSLT stylesheet for the *.xml files in `files` and apply it.

    The stylesheet text is built in memory with one xsl:copy-of/document()
    line for every globbed file except the first, written to parsingxsl.xsl,
    then applied to the first globbed file. The serialized result is printed
    and written to OutputFile.xml.
    """
    # NOTE(review): indentation was missing from this listing and has been
    # reconstructed; confirm the nesting against the original script.
    xml_files = glob.glob(files +"/*.xml")
    #print xml_files[0]
    # NOTE(review): match="/DATA" only matches when DATA is the document
    # element, while the document() paths below expect DATA under BODY. If the
    # template never fires, the result may have no root element - confirm.
    xslstring = """<?xml version="1.0" ?>
<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<xsl:template match="/DATA">
<DATA>
<xsl:copy>
<xsl:copy-of select="TALLYMESSAGE"/>\n"""
    #print xslstring
    # The first file is skipped here because it is the document the
    # transform is applied to further down.
    for xmlfile in xml_files[1:]:
        # xmlfile[-16:] keeps only the last 16 characters of the path.
        # NOTE(review): presumably the bare file name - verify the name length.
        xslstring = xslstring + '<xsl:copy-of select="document(\'' + xmlfile[-16:] + "')/BODY/DATA/TALLYMESSAGE\"/>\n"
    # NOTE(review): the "+" after </xsl:copy> is literal text inside the template.
    xslstring = xslstring + """</xsl:copy>+
</DATA>
</xsl:template>
</xsl:transform>"""
    #print xslstring
    # NOTE(review): written relative to the current directory but read back
    # from F:\data below; the two are the same file only when cwd is F:\data.
    with open("parsingxsl.xsl", "w") as f:
        f.write(xslstring)
    with open(xml_files[0], "r") as f:
        dom = etree.XML(f.read())
    print etree.tostring(dom)
    # The paths below are not raw strings; "\d", "\p" and "\O" are not
    # recognised escapes, so the backslashes stay literal.
    with open('F:\data\parsingxsl.xsl', "r") as f:
        xslt_tree = etree.XML(f.read())
    print xslt_tree
    transform = etree.XSLT(xslt_tree)
    newdom = transform(dom)
    #print newdom
    # NOTE(review): the reported TypeError means tree_out was None at the
    # write() call below, i.e. tostring() produced no output for newdom.
    tree_out = etree.tostring(newdom, encoding='UTF-8', pretty_print=True, xml_declaration=True)
    print(tree_out)
    xmlfile = open('F:\data\OutputFile.xml','wb')
    xmlfile.write(tree_out)
    xmlfile.close()
XSLFILE("F:\data\data")
运行时同样会产生以下错误:
Traceback (most recent call last):
File "F:\data\xmlmergexsl.py", line 38, in <module>
XSLFILE("F:\data\data")
File "F:\data\xmlmergexsl.py", line 36, in XSLFILE
xmlfile.write(tree_out)
TypeError: must be string or buffer, not None
可以考虑使用 XSLT 及其 document() 函数来合并 XML 文件。Python(和许多面向对象编程语言一样)提供了 XSLT 处理器,例如其 lxml 模块。补充说明:XSLT 是一种声明式编程语言,可以将 XML 文件转换为各种格式和结构。
出于您的目的,XSLT 可能比使用编程代码开发文件更有效,因为除了 XSLT 处理器将使用的内容之外,在处理过程中没有列表或循环或其他对象保存在内存中。
XSLT(在外部保存为 .xsl 文件)
可以先用一个 Python 循环把全部 365 个文档的引用写入文本文件来生成该样式表,以避免手工复制粘贴。另请注意,第一个文档被跳过,因为它是下面 Python 脚本所使用的起始文档:
<?xml version="1.0" ?>
<!-- Copies the TALLYMESSAGE elements of the source document and of each file named in a document() call into one output. -->
<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<!-- Matches DATA elements of the source document handed to the XSLT processor. -->
<xsl:template match="DATA">
<!-- NOTE(review): a literal DATA element wrapping xsl:copy of the matched DATA looks like it emits DATA nested inside DATA; confirm that is intended. -->
<DATA>
<xsl:copy>
<!-- TALLYMESSAGE children of the matched DATA element in the source document. -->
<xsl:copy-of select="TALLYMESSAGE"/>
<!-- One copy-of per additional file: document() loads the named file and the path selects its TALLYMESSAGE elements. -->
<!-- NOTE(review): relative file names are presumably resolved against the stylesheet's own location; verify where the .xsl file is saved. -->
<xsl:copy-of select="document('Document2.xml')/BODY/DATA/TALLYMESSAGE"/>
<xsl:copy-of select="document('Document3.xml')/BODY/DATA/TALLYMESSAGE"/>
<xsl:copy-of select="document('Document4.xml')/BODY/DATA/TALLYMESSAGE"/>
<!-- The "..." line below stands in for the omitted copy-of lines and must be replaced with them before the stylesheet is used. -->
...
<xsl:copy-of select="document('Document365.xml')/BODY/DATA/TALLYMESSAGE"/>
</xsl:copy>
</DATA>
</xsl:template>
</xsl:transform>
Python(将包含在您的整个脚本中)
import lxml.etree as ET

# Raw strings keep the Windows backslashes literal. Without the r prefix the
# "\f" in "\file.xsl" is read as a form-feed character and the path is wrong.
dom = ET.parse(r'C:\Path\To\XML\Document1.xml')
xslt = ET.parse(r'C:\Path\To\XSL\file.xsl')

# Compile the stylesheet once, then apply it to the starting document.
transform = ET.XSLT(xslt)
newdom = transform(dom)

# Serialize the result tree to UTF-8 bytes with an XML declaration.
tree_out = ET.tostring(newdom, encoding='UTF-8', pretty_print=True, xml_declaration=True)
print(tree_out)

# Binary mode because tree_out is already encoded; the with-block closes the
# file even if the write fails.
with open(r'C:\Path\To\XML\OutputFile.xml', 'wb') as xmlfile:
    xmlfile.write(tree_out)