把ElementTree直接写成utf-8编码的zip
Write ElementTree directly to zip with utf-8 encoding
我要修改大量XML。它们存储在 ZIP 文件中。 source-XMLs 是 utf-8 编码的(至少根据 Linux 上的 file
工具的猜测)并且有一个正确的 XML 声明:
<?xml version='1.0' encoding='UTF-8'?>
.
目标 ZIP 和其中包含的 XML 也应该有正确的 XML 声明。然而,(至少对我而言)最明显的方法(使用 ElementTree.tostring
)失败了。
这是一个独立的示例,应该开箱即用。
简短演练:
- 导入(import 语句)
- 准备(创建 src.zip,这些 ZIP 是我实际应用中给出的)
- 程序的实际工作(修改 XMLs),从
# read XMLs from zip
开始
请重点关注下半部分,尤其是# APPROACH 1
、APPROACH 2
、APPROACH 3
:
import os
import tempfile
import zipfile
from xml.etree.ElementTree import Element, ElementTree, parse, tostring

# Every scratch file of this example lives in the system temp directory.
src_1 = os.path.join(tempfile.gettempdir(), "one.xml")
src_2 = os.path.join(tempfile.gettempdir(), "two.xml")
src_zip = os.path.join(tempfile.gettempdir(), "src.zip")
trgt_appr1_zip = os.path.join(tempfile.gettempdir(), "trgt_appr1.zip")
trgt_appr2_zip = os.path.join(tempfile.gettempdir(), "trgt_appr2.zip")
trgt_appr3_zip = os.path.join(tempfile.gettempdir(), "trgt_appr3.zip")
# file on hard disk that APPROACH 3 round-trips every document through
tmp_xml_name = os.path.join(tempfile.gettempdir(), "curr_xml.tmp")

# prepare src.zip: two small UTF-8 documents carrying an XML declaration
tree1 = ElementTree(Element('hello', {'beer': 'good'}))
tree1.write(src_1, encoding="UTF-8", xml_declaration=True)
tree2 = ElementTree(Element('scnd', {'äkey': 'a value'}))
tree2.write(src_2, encoding="UTF-8", xml_declaration=True)
# 'w' instead of 'a': re-running the example must not pile up duplicate members
with zipfile.ZipFile(src_zip, 'w') as src:
    for arcname, path in (("one.xml", src_1), ("two.xml", src_2)):
        with open(path, 'r', encoding="utf-8") as xml_file:
            string_representation = xml_file.read()
        # write to zip
        src.writestr(zinfo_or_arcname=arcname, data=string_representation.encode("utf-8"))
os.remove(src_1)
os.remove(src_2)

# read XMLs from zip
updated_trees = []
with zipfile.ZipFile(src_zip, 'r') as zfile:
    for xml_name in zfile.namelist():
        # close each member handle as soon as it has been parsed
        with zfile.open(xml_name, 'r') as curr_file:
            tree = parse(curr_file)
        # modify tree
        updated_tree = tree
        updated_tree.getroot().append(Element('new', {'newkey': 'new value'}))
        updated_trees.append((xml_name, updated_tree))

# write to target files; the archives are opened once instead of once per document
with zipfile.ZipFile(trgt_appr1_zip, 'w') as trgt1_zip, \
        zipfile.ZipFile(trgt_appr2_zip, 'w') as trgt2_zip, \
        zipfile.ZipFile(trgt_appr3_zip, 'w') as trgt3_zip:
    for xml_name, updated_tree in updated_trees:
        #
        # APPROACH 1 [DESIRED, BUT DOES NOT WORK]: write tree to zip-file
        # encoding in XML declaration missing
        #
        # create byte representation of elementtree
        byte_representation = tostring(element=updated_tree.getroot(), encoding='UTF-8', method='xml')
        # write XML directly to zip
        trgt1_zip.writestr(zinfo_or_arcname=xml_name, data=byte_representation)
        #
        # APPROACH 2 [WORKS IN THEORY, BUT DOES NOT WORK]: write tree to zip-file
        # encoding in XML declaration is faulty (is 'utf8', should be 'utf-8' or 'UTF-8')
        #
        # create byte representation of elementtree
        byte_representation = tostring(element=updated_tree.getroot(), encoding='utf8', method='xml')
        # write XML directly to zip
        trgt2_zip.writestr(zinfo_or_arcname=xml_name, data=byte_representation)
        #
        # APPROACH 3 [WORKS, BUT LACKS PERFORMANCE]: write to file, then read from file, then write to zip
        #
        # write to file
        updated_tree.write(tmp_xml_name, encoding="UTF-8", method="xml", xml_declaration=True)
        try:
            # read from file
            with open(tmp_xml_name, 'r', encoding="utf-8") as tmp:
                string_representation = tmp.read()
        finally:
            # the scratch file is removed even if reading it back fails
            os.remove(tmp_xml_name)
        # write to zip
        trgt3_zip.writestr(zinfo_or_arcname=xml_name, data=string_representation.encode("utf-8"))
APPROACH 3
有效,但它比其他两个更占用资源。
APPROACH 2
是我可以使用实际的 XML 声明编写 ElementTree 对象的唯一方法——结果证明该声明无效(utf8
而不是 UTF-8
/utf-8
).
APPROACH 1
是最理想的——但在稍后的管道读取过程中失败,因为缺少 XML 声明。
问题:如何避免先将整个 XML 写入磁盘、再读回、写入 zip、最后在 zip 完成后删除临时文件这一系列步骤?我遗漏了什么?
方法一中唯一真正缺少的是 XML 声明头。对于 ElementTree.write(...)
,您可以使用 xml_declaration 参数;不幸的是,在您使用的 Python 版本中,ElementTree.tostring
尚不支持该参数。
从 Python 3.8 开始,ElementTree.tostring 方法确实有一个 xml_declaration 参数,请参阅:
https://docs.python.org/3.8/library/xml.etree.elementtree.html
即使您在使用 Python 3.6 时无法使用该实现,您也可以轻松地将 3.8 实现复制到您自己的 Python 文件中:
import io
def tostring(element, encoding=None, method=None, *,
             xml_declaration=None, default_namespace=None,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If encoding is "unicode", a string
    is returned.  Otherwise a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output which can
    be one of "xml" (default), "html", "text" or "c14n", *default_namespace*
    sets the default XML namespace (for "xmlns").  *xml_declaration* and
    *short_empty_elements* are forwarded unchanged to ElementTree.write().

    Returns an (optionally) encoded string containing the XML data.
    """
    # Imported locally so this backport works when pasted into a module that
    # has not imported ElementTree itself (the stdlib original relies on
    # ElementTree being defined in the same module).
    from xml.etree.ElementTree import ElementTree

    # Text buffer for "unicode" output, byte buffer for every real encoding.
    stream = io.StringIO() if encoding == 'unicode' else io.BytesIO()
    ElementTree(element).write(stream, encoding,
                               xml_declaration=xml_declaration,
                               default_namespace=default_namespace,
                               method=method,
                               short_empty_elements=short_empty_elements)
    return stream.getvalue()
(见https://github.com/python/cpython/blob/v3.8.0/Lib/xml/etree/ElementTree.py#L1116)
在这种情况下,您可以简单地使用方法一:
# Serialize the modified root to bytes; xml_declaration=True requests the
# XML declaration header that plain APPROACH 1 was missing.
byte_representation = tostring(
    updated_tree.getroot(),
    encoding='UTF-8',
    method='xml',
    xml_declaration=True,
)
# Store the bytes in the archive under xml_name, without touching the disk.
trgt1_zip.writestr(xml_name, byte_representation)
您可以使用 io.BytesIO
对象。
这允许使用 ElementTree.write
,同时避免将树导出到磁盘:
import zipfile
from io import BytesIO
from xml.etree.ElementTree import ElementTree, Element
# Build the demo tree and serialize it into an in-memory byte buffer, so
# ElementTree.write (with its xml_declaration switch) can be used without
# creating a file on disk.
root = Element('hello', {'beer': 'good'})
tree = ElementTree(root)
buffer = BytesIO()
tree.write(buffer, encoding='UTF-8', xml_declaration=True)

# Hand the buffered bytes to the archive as a single member.
xml_bytes = buffer.getvalue()
with zipfile.ZipFile('/tmp/test.zip', 'w') as archive:
    archive.writestr('test.xml', xml_bytes)
如果您使用的是 Python 3.6 或更高版本,则有一个更短的解决方案:
您可以从 ZipFile
对象中获取可写文件对象,您可以将其传递给 ElementTree.write
:
import zipfile
from xml.etree.ElementTree import ElementTree, Element
# Build the demo tree, then let ElementTree.write stream straight into a
# writable member handle obtained from the archive.
root = Element('hello', {'beer': 'good'})
tree = ElementTree(root)
with zipfile.ZipFile('/tmp/test.zip', 'w') as archive, \
        archive.open('test.xml', 'w') as member:
    tree.write(member, encoding='UTF-8', xml_declaration=True)
这还有一个好处,就是您不会在内存中存储树的多个副本,这对于大型树来说可能是一个相关问题。
我要修改大量XML。它们存储在 ZIP 文件中。 source-XMLs 是 utf-8 编码的(至少根据 Linux 上的 file
工具的猜测)并且有一个正确的 XML 声明:
<?xml version='1.0' encoding='UTF-8'?>
.
目标 ZIP 和其中包含的 XML 也应该有正确的 XML 声明。然而,(至少对我而言)最明显的方法(使用 ElementTree.tostring
)失败了。
这是一个独立的示例,应该开箱即用。 简短演练:
- 导入(import 语句)
- 准备(创建 src.zip,这些 ZIP 是我实际应用中给出的)
- 程序的实际工作(修改 XMLs),从
# read XMLs from zip
开始
请重点关注下半部分,尤其是# APPROACH 1
、APPROACH 2
、APPROACH 3
:
import os
import tempfile
import zipfile
from xml.etree.ElementTree import Element, ElementTree, parse, tostring

# Every scratch file of this example lives in the system temp directory.
src_1 = os.path.join(tempfile.gettempdir(), "one.xml")
src_2 = os.path.join(tempfile.gettempdir(), "two.xml")
src_zip = os.path.join(tempfile.gettempdir(), "src.zip")
trgt_appr1_zip = os.path.join(tempfile.gettempdir(), "trgt_appr1.zip")
trgt_appr2_zip = os.path.join(tempfile.gettempdir(), "trgt_appr2.zip")
trgt_appr3_zip = os.path.join(tempfile.gettempdir(), "trgt_appr3.zip")
# file on hard disk that APPROACH 3 round-trips every document through
tmp_xml_name = os.path.join(tempfile.gettempdir(), "curr_xml.tmp")

# prepare src.zip: two small UTF-8 documents carrying an XML declaration
tree1 = ElementTree(Element('hello', {'beer': 'good'}))
tree1.write(src_1, encoding="UTF-8", xml_declaration=True)
tree2 = ElementTree(Element('scnd', {'äkey': 'a value'}))
tree2.write(src_2, encoding="UTF-8", xml_declaration=True)
# 'w' instead of 'a': re-running the example must not pile up duplicate members
with zipfile.ZipFile(src_zip, 'w') as src:
    for arcname, path in (("one.xml", src_1), ("two.xml", src_2)):
        with open(path, 'r', encoding="utf-8") as xml_file:
            string_representation = xml_file.read()
        # write to zip
        src.writestr(zinfo_or_arcname=arcname, data=string_representation.encode("utf-8"))
os.remove(src_1)
os.remove(src_2)

# read XMLs from zip
updated_trees = []
with zipfile.ZipFile(src_zip, 'r') as zfile:
    for xml_name in zfile.namelist():
        # close each member handle as soon as it has been parsed
        with zfile.open(xml_name, 'r') as curr_file:
            tree = parse(curr_file)
        # modify tree
        updated_tree = tree
        updated_tree.getroot().append(Element('new', {'newkey': 'new value'}))
        updated_trees.append((xml_name, updated_tree))

# write to target files; the archives are opened once instead of once per document
with zipfile.ZipFile(trgt_appr1_zip, 'w') as trgt1_zip, \
        zipfile.ZipFile(trgt_appr2_zip, 'w') as trgt2_zip, \
        zipfile.ZipFile(trgt_appr3_zip, 'w') as trgt3_zip:
    for xml_name, updated_tree in updated_trees:
        #
        # APPROACH 1 [DESIRED, BUT DOES NOT WORK]: write tree to zip-file
        # encoding in XML declaration missing
        #
        # create byte representation of elementtree
        byte_representation = tostring(element=updated_tree.getroot(), encoding='UTF-8', method='xml')
        # write XML directly to zip
        trgt1_zip.writestr(zinfo_or_arcname=xml_name, data=byte_representation)
        #
        # APPROACH 2 [WORKS IN THEORY, BUT DOES NOT WORK]: write tree to zip-file
        # encoding in XML declaration is faulty (is 'utf8', should be 'utf-8' or 'UTF-8')
        #
        # create byte representation of elementtree
        byte_representation = tostring(element=updated_tree.getroot(), encoding='utf8', method='xml')
        # write XML directly to zip
        trgt2_zip.writestr(zinfo_or_arcname=xml_name, data=byte_representation)
        #
        # APPROACH 3 [WORKS, BUT LACKS PERFORMANCE]: write to file, then read from file, then write to zip
        #
        # write to file
        updated_tree.write(tmp_xml_name, encoding="UTF-8", method="xml", xml_declaration=True)
        try:
            # read from file
            with open(tmp_xml_name, 'r', encoding="utf-8") as tmp:
                string_representation = tmp.read()
        finally:
            # the scratch file is removed even if reading it back fails
            os.remove(tmp_xml_name)
        # write to zip
        trgt3_zip.writestr(zinfo_or_arcname=xml_name, data=string_representation.encode("utf-8"))
APPROACH 3
有效,但它比其他两个更占用资源。
APPROACH 2
是我可以使用实际的 XML 声明编写 ElementTree 对象的唯一方法——结果证明该声明无效(utf8
而不是 UTF-8
/utf-8
).
APPROACH 1
是最理想的——但在稍后的管道读取过程中失败,因为缺少 XML 声明。
问题:如何避免先将整个 XML 写入磁盘、再读回、写入 zip、最后在 zip 完成后删除临时文件这一系列步骤?我遗漏了什么?
方法一中唯一真正缺少的是 XML 声明头。对于 ElementTree.write(...)
,您可以使用 xml_declaration 参数;不幸的是,在您使用的 Python 版本中,ElementTree.tostring
尚不支持该参数。
从 Python 3.8 开始,ElementTree.tostring 方法确实有一个 xml_declaration 参数,请参阅: https://docs.python.org/3.8/library/xml.etree.elementtree.html
即使您在使用 Python 3.6 时无法使用该实现,您也可以轻松地将 3.8 实现复制到您自己的 Python 文件中:
import io
def tostring(element, encoding=None, method=None, *,
             xml_declaration=None, default_namespace=None,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If encoding is "unicode", a string
    is returned.  Otherwise a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output which can
    be one of "xml" (default), "html", "text" or "c14n", *default_namespace*
    sets the default XML namespace (for "xmlns").  *xml_declaration* and
    *short_empty_elements* are forwarded unchanged to ElementTree.write().

    Returns an (optionally) encoded string containing the XML data.
    """
    # Imported locally so this backport works when pasted into a module that
    # has not imported ElementTree itself (the stdlib original relies on
    # ElementTree being defined in the same module).
    from xml.etree.ElementTree import ElementTree

    # Text buffer for "unicode" output, byte buffer for every real encoding.
    stream = io.StringIO() if encoding == 'unicode' else io.BytesIO()
    ElementTree(element).write(stream, encoding,
                               xml_declaration=xml_declaration,
                               default_namespace=default_namespace,
                               method=method,
                               short_empty_elements=short_empty_elements)
    return stream.getvalue()
(见https://github.com/python/cpython/blob/v3.8.0/Lib/xml/etree/ElementTree.py#L1116)
在这种情况下,您可以简单地使用方法一:
# Serialize the modified root to bytes; xml_declaration=True requests the
# XML declaration header that plain APPROACH 1 was missing.
byte_representation = tostring(
    updated_tree.getroot(),
    encoding='UTF-8',
    method='xml',
    xml_declaration=True,
)
# Store the bytes in the archive under xml_name, without touching the disk.
trgt1_zip.writestr(xml_name, byte_representation)
您可以使用 io.BytesIO
对象。
这允许使用 ElementTree.write
,同时避免将树导出到磁盘:
import zipfile
from io import BytesIO
from xml.etree.ElementTree import ElementTree, Element
# Build the demo tree and serialize it into an in-memory byte buffer, so
# ElementTree.write (with its xml_declaration switch) can be used without
# creating a file on disk.
root = Element('hello', {'beer': 'good'})
tree = ElementTree(root)
buffer = BytesIO()
tree.write(buffer, encoding='UTF-8', xml_declaration=True)

# Hand the buffered bytes to the archive as a single member.
xml_bytes = buffer.getvalue()
with zipfile.ZipFile('/tmp/test.zip', 'w') as archive:
    archive.writestr('test.xml', xml_bytes)
如果您使用的是 Python 3.6 或更高版本,则有一个更短的解决方案:
您可以从 ZipFile
对象中获取可写文件对象,您可以将其传递给 ElementTree.write
:
import zipfile
from xml.etree.ElementTree import ElementTree, Element
# Build the demo tree, then let ElementTree.write stream straight into a
# writable member handle obtained from the archive.
root = Element('hello', {'beer': 'good'})
tree = ElementTree(root)
with zipfile.ZipFile('/tmp/test.zip', 'w') as archive, \
        archive.open('test.xml', 'w') as member:
    tree.write(member, encoding='UTF-8', xml_declaration=True)
这还有一个好处,就是您不会在内存中存储树的多个副本,这对于大型树来说可能是一个相关问题。