如何使用 python 删除 xml 文件中的子元素
How to remove a subelement in an XML file using Python
我想删除一个特定元素及其所有子元素。
我想通过标签的 ID 或标签的名称来找到要删除的元素。
例如,给定这个 etree 对象
<?xml version="1.0" ?>
<root>
<tag_folders>
<folder id="1">Stars</folder>
<folder id="2">Planet</folder>
<folder id="3">Satellite</folder>
</tag_folders>
<tags>
<tag>
<name>Earth</name>
</tag>
<tag id="2">
<name>Sun</name>
</tag>
<tag id="29">
<name>Moon</name>
</tag>
</tags>
</root>
例如我想删除 ID 为“29”的 Moon
我想要的输出:
<?xml version="1.0" ?>
<root>
<tag_folders>
<folder id="1">Stars</folder>
<folder id="2">Planet</folder>
<folder id="3">Satellite</folder>
</tag_folders>
<tags>
<tag>
<name>Earth</name>
</tag>
<tag id="2">
<name>Sun</name>
</tag>
</tags>
</root>
这是我的代码:
# Question's attempt: visit every <tag> element under root and try to delete
# the one whose id attribute equals tag_id_r. The asker reports that none of
# the commented-out calls below worked; per the answers, an element has to be
# removed through its parent element.
def remove_tag(root, tag_id_r):
# Running position of the current <tag>; only the commented-out root[1][i]
# lookups below make use of it.
i = 0
for tag in root.iter('tag'):
tag_id = tag.get('id')
if (tag_id == tag_id_r):
# NOTE(review): every statement under this `if` is commented out, so as shown
# the branch has no body — presumably the debug print was active when run.
#root.clear(tag)
#root.remove(tag)
#root[1][i].remove(tag)
# print(i, tag_id, tag_id_r, root[1][i])
i += 1
# Question's driver: parse lib.xml, ask the user for an id, call remove_tag,
# then rewrite lib.xml with a pretty-printed copy of the tree.
# NOTE(review): `etree` and `minidom` are used here but their imports are not
# shown in the question.
def main():
# The file is opened in append mode and emptied further down with
# seek(0) + truncate() before the new content is printed into it.
with open("lib.xml", 'a') as f:
tree = etree.parse('lib.xml')
root = tree.getroot()
remove_tag(root, input("What is the id of the tag you want to remove?"))
f.seek(0)
f.truncate()
# Round-trip through minidom to get indented output.
dom = minidom.parseString(etree.tostring(tree, encoding="utf-8"))
# Keep only the lines that are not blank after stripping whitespace.
print('\n'.join([line for line in dom.toprettyxml(indent=' '*2).split('\n') if line.strip()]), file=f)
main()
我尝试了代码注释中的所有方法,但都不起作用。
尝试这样的事情:
# Remove the <tag id="29"> element whose <name> is "Moon" using lxml, then
# print the resulting document.
from lxml import etree

elems = """<?xml version="1.0" ?>
<root>
<tag_folders>
<folder id="1">Stars</folder>
<folder id="2">Planet</folder>
<folder id="3">Satellite</folder>
</tag_folders>
<tags>
<tag>
<name>Earth</name>
</tag>
<tag id="2">
<name>Sun</name>
</tag>
<tag id="29">
<name>Moon</name>
</tag>
</tags>
</root>
"""  # note that the xml has been fixed

doc = etree.XML(elems)
# Match on the text of <name> explicitly. A bare string predicate such as
# name["Moon"] is always true in XPath, so it would not check the name at all
# and the selection would depend on the id alone.
to_del = doc.xpath('//name[text()="Moon"]/parent::tag[@id="29"]')
for td in to_del:
    # lxml elements know their parent, so removal goes through getparent().
    td.getparent().remove(td)
print(etree.tostring(doc, pretty_print=True, xml_declaration=True).decode())
输出:
<?xml version='1.0' encoding='ASCII'?>
<root>
<tag_folders>
<folder id="1">Stars</folder>
<folder id="2">Planet</folder>
<folder id="3">Satellite</folder>
</tag_folders>
<tags>
<tag>
<name>Earth</name>
</tag>
<tag id="2">
<name>Sun</name>
</tag>
</tags>
</root>
要删除 ElementTree 中的元素(问题带有该标签,但代码中未显示导入语句),必须先获取其父元素(在本例中为 tags)。(lxml 提供了 .getparent() 方法,如 Jack Fleeting 的回答所示。)
此外,如果您真的想覆盖它,则不必打开文件并截断它;只需使用 ElementTree 对象的 .write() 方法即可。
示例...
XML 输入(lib.xml;已添加“</tags>”使其格式良好,即 well-formed)
<root>
<tag_folders>
<folder id="1">Stars</folder>
<folder id="2">Planet</folder>
<folder id="3">Satellite</folder>
</tag_folders>
<tags>
<tag>
<name>Earth</name>
</tag>
<tag id="2">
<name>Sun</name>
</tag>
<tag id="29">
<name>Moon</name>
</tag>
</tags>
</root>
Python
import xml.etree.ElementTree as etree
def remove_tag(root, tag_id_r):
    """Remove the ``<tag>`` child of ``<tags>`` whose ``id`` equals *tag_id_r*.

    Args:
        root: Root element of the parsed document; the ``<tags>`` container
            is looked up as a direct child of it.
        tag_id_r: Value of the ``id`` attribute of the tag to delete.

    Returns:
        None. When no matching tag exists (or there is no ``<tags>`` element)
        a message is printed and the tree is left unchanged.
    """
    tags_elem = root.find("tags")
    candidates = tags_elem.findall("tag") if tags_elem is not None else []
    # Compare the attribute in Python instead of interpolating the id into a
    # path predicate, so an id containing a quote cannot break the expression.
    target_tag = next(
        (tag for tag in candidates if tag.get("id") == tag_id_r), None
    )
    # An Element without children is falsy, so test against None explicitly;
    # otherwise a childless matching <tag> would be reported as missing.
    if target_tag is not None:
        tags_elem.remove(target_tag)
    else:
        print(f"A tag with the id \"{tag_id_r}\" cannot be found.")
def main():
    """Load lib.xml, drop the tag chosen by the user, and save the file."""
    document = etree.parse("lib.xml")
    wanted_id = input("What is the id of the tag you want to remove? ")
    remove_tag(document.getroot(), wanted_id)
    # The source file is overwritten in place; consider whether writing to a
    # different path would be safer.
    document.write("lib.xml", encoding="utf-8")


main()
XML 输出(已更新lib.xml)
<root>
<tag_folders>
<folder id="1">Stars</folder>
<folder id="2">Planet</folder>
<folder id="3">Satellite</folder>
</tag_folders>
<tags>
<tag>
<name>Earth</name>
</tag>
<tag id="2">
<name>Sun</name>
</tag>
</tags>
</root>
另一种方法。
# Alternative approach using the third-party simplified_scrapy package.
from simplified_scrapy import SimplifiedDoc
# NOTE(review): this sample omits the closing </tags> before </root>, so it is
# not well-formed XML; the printed result shown after the snippet has the same
# omission.
html = """
<?xml version="1.0" ?>
<root>
<tag_folders>
<folder id="1">Stars</folder>
<folder id="2">Planet</folder>
<folder id="3">Satellite</folder>
</tag_folders>
<tags>
<tag>
<name>Earth</name>
</tag>
<tag id="2">
<name>Sun</name>
</tag>
<tag id="29">
<name>Moon</name>
</tag>
</root>
"""
doc = SimplifiedDoc(html)
# Presumably a CSS-style selector for the <tag> element with id 29 — confirm
# against the simplified_scrapy documentation.
tag29 = doc.select('tag#29')
# Or
# tag29 = doc.getElementByText('Moon',tag='tag')
# Remove the selected element, then print the document's markup.
tag29.remove()
print (doc.html)
结果:
<?xml version="1.0" ?>
<root>
<tag_folders>
<folder id="1">Stars</folder>
<folder id="2">Planet</folder>
<folder id="3">Satellite</folder>
</tag_folders>
<tags>
<tag>
<name>Earth</name>
</tag>
<tag id="2">
<name>Sun</name>
</tag>
</root>
我想删除一个特定元素及其所有子元素。 我想通过标签的 ID 或标签的名称来找到要删除的元素。
例如,给定这个 etree 对象
<?xml version="1.0" ?>
<root>
<tag_folders>
<folder id="1">Stars</folder>
<folder id="2">Planet</folder>
<folder id="3">Satellite</folder>
</tag_folders>
<tags>
<tag>
<name>Earth</name>
</tag>
<tag id="2">
<name>Sun</name>
</tag>
<tag id="29">
<name>Moon</name>
</tag>
</tags>
</root>
例如我想删除 ID 为“29”的 Moon
我想要的输出:
<?xml version="1.0" ?>
<root>
<tag_folders>
<folder id="1">Stars</folder>
<folder id="2">Planet</folder>
<folder id="3">Satellite</folder>
</tag_folders>
<tags>
<tag>
<name>Earth</name>
</tag>
<tag id="2">
<name>Sun</name>
</tag>
</tags>
</root>
这是我的代码:
# Question's attempt: visit every <tag> element under root and try to delete
# the one whose id attribute equals tag_id_r. The asker reports that none of
# the commented-out calls below worked; per the answers, an element has to be
# removed through its parent element.
def remove_tag(root, tag_id_r):
# Running position of the current <tag>; only the commented-out root[1][i]
# lookups below make use of it.
i = 0
for tag in root.iter('tag'):
tag_id = tag.get('id')
if (tag_id == tag_id_r):
# NOTE(review): every statement under this `if` is commented out, so as shown
# the branch has no body — presumably the debug print was active when run.
#root.clear(tag)
#root.remove(tag)
#root[1][i].remove(tag)
# print(i, tag_id, tag_id_r, root[1][i])
i += 1
# Question's driver: parse lib.xml, ask the user for an id, call remove_tag,
# then rewrite lib.xml with a pretty-printed copy of the tree.
# NOTE(review): `etree` and `minidom` are used here but their imports are not
# shown in the question.
def main():
# The file is opened in append mode and emptied further down with
# seek(0) + truncate() before the new content is printed into it.
with open("lib.xml", 'a') as f:
tree = etree.parse('lib.xml')
root = tree.getroot()
remove_tag(root, input("What is the id of the tag you want to remove?"))
f.seek(0)
f.truncate()
# Round-trip through minidom to get indented output.
dom = minidom.parseString(etree.tostring(tree, encoding="utf-8"))
# Keep only the lines that are not blank after stripping whitespace.
print('\n'.join([line for line in dom.toprettyxml(indent=' '*2).split('\n') if line.strip()]), file=f)
main()
我尝试了代码注释中的所有方法,但都不起作用。
尝试这样的事情:
# Remove the <tag id="29"> element whose <name> is "Moon" using lxml, then
# print the resulting document.
from lxml import etree

elems = """<?xml version="1.0" ?>
<root>
<tag_folders>
<folder id="1">Stars</folder>
<folder id="2">Planet</folder>
<folder id="3">Satellite</folder>
</tag_folders>
<tags>
<tag>
<name>Earth</name>
</tag>
<tag id="2">
<name>Sun</name>
</tag>
<tag id="29">
<name>Moon</name>
</tag>
</tags>
</root>
"""  # note that the xml has been fixed

doc = etree.XML(elems)
# Match on the text of <name> explicitly. A bare string predicate such as
# name["Moon"] is always true in XPath, so it would not check the name at all
# and the selection would depend on the id alone.
to_del = doc.xpath('//name[text()="Moon"]/parent::tag[@id="29"]')
for td in to_del:
    # lxml elements know their parent, so removal goes through getparent().
    td.getparent().remove(td)
print(etree.tostring(doc, pretty_print=True, xml_declaration=True).decode())
输出:
<?xml version='1.0' encoding='ASCII'?>
<root>
<tag_folders>
<folder id="1">Stars</folder>
<folder id="2">Planet</folder>
<folder id="3">Satellite</folder>
</tag_folders>
<tags>
<tag>
<name>Earth</name>
</tag>
<tag id="2">
<name>Sun</name>
</tag>
</tags>
</root>
要删除 ElementTree 中的元素(问题带有该标签,但代码中未显示导入语句),必须先获取其父元素(在本例中为 tags)。(lxml 提供了 .getparent() 方法,如 Jack Fleeting 的回答所示。)
此外,如果您真的想覆盖它,则不必打开文件并截断它;只需使用 ElementTree 对象的 .write() 方法即可。
示例...
XML 输入(lib.xml;已添加“</tags>”使其格式良好,即 well-formed)
<root>
<tag_folders>
<folder id="1">Stars</folder>
<folder id="2">Planet</folder>
<folder id="3">Satellite</folder>
</tag_folders>
<tags>
<tag>
<name>Earth</name>
</tag>
<tag id="2">
<name>Sun</name>
</tag>
<tag id="29">
<name>Moon</name>
</tag>
</tags>
</root>
Python
import xml.etree.ElementTree as etree
def remove_tag(root, tag_id_r):
    """Remove the ``<tag>`` child of ``<tags>`` whose ``id`` equals *tag_id_r*.

    Args:
        root: Root element of the parsed document; the ``<tags>`` container
            is looked up as a direct child of it.
        tag_id_r: Value of the ``id`` attribute of the tag to delete.

    Returns:
        None. When no matching tag exists (or there is no ``<tags>`` element)
        a message is printed and the tree is left unchanged.
    """
    tags_elem = root.find("tags")
    candidates = tags_elem.findall("tag") if tags_elem is not None else []
    # Compare the attribute in Python instead of interpolating the id into a
    # path predicate, so an id containing a quote cannot break the expression.
    target_tag = next(
        (tag for tag in candidates if tag.get("id") == tag_id_r), None
    )
    # An Element without children is falsy, so test against None explicitly;
    # otherwise a childless matching <tag> would be reported as missing.
    if target_tag is not None:
        tags_elem.remove(target_tag)
    else:
        print(f"A tag with the id \"{tag_id_r}\" cannot be found.")
def main():
    """Load lib.xml, drop the tag chosen by the user, and save the file."""
    document = etree.parse("lib.xml")
    wanted_id = input("What is the id of the tag you want to remove? ")
    remove_tag(document.getroot(), wanted_id)
    # The source file is overwritten in place; consider whether writing to a
    # different path would be safer.
    document.write("lib.xml", encoding="utf-8")


main()
XML 输出(已更新lib.xml)
<root>
<tag_folders>
<folder id="1">Stars</folder>
<folder id="2">Planet</folder>
<folder id="3">Satellite</folder>
</tag_folders>
<tags>
<tag>
<name>Earth</name>
</tag>
<tag id="2">
<name>Sun</name>
</tag>
</tags>
</root>
另一种方法。
# Alternative approach using the third-party simplified_scrapy package.
from simplified_scrapy import SimplifiedDoc
# NOTE(review): this sample omits the closing </tags> before </root>, so it is
# not well-formed XML; the printed result shown after the snippet has the same
# omission.
html = """
<?xml version="1.0" ?>
<root>
<tag_folders>
<folder id="1">Stars</folder>
<folder id="2">Planet</folder>
<folder id="3">Satellite</folder>
</tag_folders>
<tags>
<tag>
<name>Earth</name>
</tag>
<tag id="2">
<name>Sun</name>
</tag>
<tag id="29">
<name>Moon</name>
</tag>
</root>
"""
doc = SimplifiedDoc(html)
# Presumably a CSS-style selector for the <tag> element with id 29 — confirm
# against the simplified_scrapy documentation.
tag29 = doc.select('tag#29')
# Or
# tag29 = doc.getElementByText('Moon',tag='tag')
# Remove the selected element, then print the document's markup.
tag29.remove()
print (doc.html)
结果:
<?xml version="1.0" ?>
<root>
<tag_folders>
<folder id="1">Stars</folder>
<folder id="2">Planet</folder>
<folder id="3">Satellite</folder>
</tag_folders>
<tags>
<tag>
<name>Earth</name>
</tag>
<tag id="2">
<name>Sun</name>
</tag>
</root>