如何使用 python 删除 xml 文件中的子元素

How to remove a subelement in an XML file using Python

我想删除一个特定元素及其所有子元素。我希望通过标签的 ID 或标签的名称来找到要删除的元素。

例如,给定这个 etree 对象

<?xml version="1.0" ?>
<root>
  <tag_folders>
    <folder id="1">Stars</folder>
    <folder id="2">Planet</folder>
    <folder id="3">Satellite</folder>
  </tag_folders>
  <tags>
    <tag>
      <name>Earth</name>
    </tag>
    <tag id="2">
      <name>Sun</name>
    </tag>
    <tag id="29">
      <name>Moon</name>
    </tag>
</tags>
</root>

例如我想删除 ID 为“29”的 Moon

我想要的输出:

<?xml version="1.0" ?>
<root>
  <tag_folders>
    <folder id="1">Stars</folder>
    <folder id="2">Planet</folder>
    <folder id="3">Satellite</folder>
  </tag_folders>
  <tags>
    <tag>
      <name>Earth</name>
    </tag>
    <tag id="2">
      <name>Sun</name>
    </tag>
 </tags>

</root>

这是我的代码:

def remove_tag(root, tag_id_r):
    # Walk every <tag> element under root and look for the one whose "id"
    # attribute equals tag_id_r. The commented-out calls below are the removal
    # attempts mentioned in the question; with all of them disabled the "if"
    # has no body, so the snippet is not runnable as posted.
    i = 0  # position of the current <tag> in iteration order
    for tag in root.iter('tag'):
        tag_id = tag.get('id')  # None when the element has no "id" attribute
        if (tag_id == tag_id_r):
            #root.clear(tag)
            #root.remove(tag)
            #root[1][i].remove(tag)
        # print(i, tag_id, tag_id_r, root[1][i])
        i += 1

def main():
    # Read lib.xml, ask for a tag id, call remove_tag() with it and write the
    # pretty-printed document back to lib.xml.
    # NOTE(review): the imports behind `etree` and `minidom` are not shown in
    # this snippet; confirm which libraries they refer to.
    # Append mode opens the file for writing without emptying it, so it can
    # still be parsed inside the "with" block.
    with open("lib.xml", 'a') as f:
        tree = etree.parse('lib.xml')
        root = tree.getroot()

        remove_tag(root, input("What is the id of the tag you want to remove?"))

        # Empty the file so the new content replaces the old one.
        f.seek(0)
        f.truncate()

        # Re-parse the serialized tree with minidom to pretty-print it, drop the
        # whitespace-only lines of toprettyxml(), and write the rest to the file.
        dom = minidom.parseString(etree.tostring(tree, encoding="utf-8"))
        print('\n'.join([line for line in dom.toprettyxml(indent=' '*2).split('\n') if line.strip()]), file=f)
main()

我尝试了代码注释中的所有方法,但都不起作用。

尝试这样的事情:

from lxml import etree

# Sample document: the XML from the question, with the <tags> element closed.
elems = """<?xml version="1.0" ?>
<root>
  <tag_folders>
    <folder id="1">Stars</folder>
    <folder id="2">Planet</folder>
    <folder id="3">Satellite</folder>
  </tag_folders>
  <tags>
    <tag>
      <name>Earth</name>
    </tag>
    <tag id="2">
      <name>Sun</name>
    </tag>
    <tag id="29">
      <name>Moon</name>
    </tag>
   </tags>
</root>
"""

doc = etree.XML(elems)

# Select the <tag> with id="29" whose <name> child has the text "Moon".
# A bare string predicate such as name["Moon"] is always true in XPath, so the
# name must be compared explicitly for it to take part in the match.
to_del = doc.xpath('//tag[@id="29"][name="Moon"]')
for td in to_del:
    # An element is removed through its parent; lxml exposes it via getparent().
    td.getparent().remove(td)

print(etree.tostring(doc, pretty_print=True, xml_declaration=True).decode())

输出:

<?xml version='1.0' encoding='ASCII'?>
<root>
  <tag_folders>
    <folder id="1">Stars</folder>
    <folder id="2">Planet</folder>
    <folder id="3">Satellite</folder>
  </tag_folders>
  <tags>
    <tag>
      <name>Earth</name>
    </tag>
    <tag id="2">
      <name>Sun</name>
    </tag>
    </tags>
</root>

要删除 ElementTree 中的元素(问题带有 ElementTree 标签,但代码中未显示导入语句),您必须首先获取其父元素(在本例中为 tags)。(lxml 则提供了 Jack Fleeting 的回答中所示的 .getparent() 方法。)

此外,如果您确实想覆盖该文件,则不必先打开文件再将其截断;只需使用 ElementTree 对象的 .write() 方法即可。

示例...

XML 输入(lib.xml;已添加“</tags>”,使其成为格式良好的 XML)

<root>
  <tag_folders>
    <folder id="1">Stars</folder>
    <folder id="2">Planet</folder>
    <folder id="3">Satellite</folder>
  </tag_folders>
  <tags>
    <tag>
      <name>Earth</name>
    </tag>
    <tag id="2">
      <name>Sun</name>
    </tag>
    <tag id="29">
      <name>Moon</name>
    </tag>
  </tags>
</root>

Python

import xml.etree.ElementTree as etree


def remove_tag(root, tag_id_r):
    """Remove the ``<tag>`` child of ``<tags>`` whose ``id`` equals *tag_id_r*.

    Only ``<tag>`` elements that are direct children of the first ``<tags>``
    element under *root* are considered, and only the first match is removed.
    If there is no match (or no ``<tags>`` element), a message is printed and
    the tree is left untouched.

    Args:
        root: Root element of the parsed document.
        tag_id_r: Value of the ``id`` attribute to look for, as a string.
    """
    tags_elem = root.find("tags")
    candidates = tags_elem.findall("tag") if tags_elem is not None else []
    # Compare the attribute in Python instead of interpolating the id into an
    # XPath predicate, so an id containing quotes cannot break the expression.
    target_tag = next(
        (tag for tag in candidates if tag.get("id") == tag_id_r), None
    )
    # An Element without children is falsy, so test against None explicitly.
    if target_tag is not None:
        tags_elem.remove(target_tag)
    else:
        print(f"A tag with the id \"{tag_id_r}\" cannot be found.")


def main():
    """Load lib.xml, remove the tag chosen by the user and save the file."""
    document = etree.parse("lib.xml")
    wanted_id = input("What is the id of the tag you want to remove? ")
    remove_tag(document.getroot(), wanted_id)

    # The result is written over the input file; pick another path if the
    # original needs to be kept.
    document.write("lib.xml", encoding="utf-8")


main()

XML 输出(已更新lib.xml)

<root>
  <tag_folders>
    <folder id="1">Stars</folder>
    <folder id="2">Planet</folder>
    <folder id="3">Satellite</folder>
  </tag_folders>
  <tags>
    <tag>
      <name>Earth</name>
    </tag>
    <tag id="2">
      <name>Sun</name>
    </tag>
    </tags>
</root>

另一种方法。

from simplified_scrapy import SimplifiedDoc
# The document to edit, exactly as posted.
xml_source = """
<?xml version="1.0" ?>
<root>
  <tag_folders>
    <folder id="1">Stars</folder>
    <folder id="2">Planet</folder>
    <folder id="3">Satellite</folder>
  </tag_folders>
  <tags>
    <tag>
      <name>Earth</name>
    </tag>
    <tag id="2">
      <name>Sun</name>
    </tag>
    <tag id="29">
      <name>Moon</name>
    </tag>
</root>
"""
document = SimplifiedDoc(xml_source)
# Look up the element to delete with the selector 'tag#29'
# (presumably "<tag> with id 29" - confirm against the library docs).
moon_tag = document.select('tag#29')
# Alternatively, it could be looked up through its text:
# moon_tag = document.getElementByText('Moon',tag='tag')
moon_tag.remove()
print(document.html)

结果:

<?xml version="1.0" ?>
<root>
  <tag_folders>
    <folder id="1">Stars</folder>
    <folder id="2">Planet</folder>
    <folder id="3">Satellite</folder>
  </tag_folders>
  <tags>
    <tag>
      <name>Earth</name>
    </tag>
    <tag id="2">
      <name>Sun</name>
    </tag>
</root>