使用 Python ElementTree 递归解析 XML
Recursive XML parsing python using ElementTree
我正在尝试使用 Python ElementTree 解析以下 XML,并生成如下所示的输出。我正在尝试为顶级元素编写模块来打印它们。然而,这有点棘手,因为 category 元素可能包含也可能不包含 property 元素,而且 category 元素内部还可能嵌套 category 元素。
我参考过此前关于该主题的问题,但它们都不涉及同名的嵌套元素。
我的代码:
http://pastebin.com/Fsv2Xzqf
work.xml:
<suite id="1" name="MainApplication">
<displayNameKey>my Application</displayNameKey>
<displayName>my Application</displayName>
<application id="2" name="Sub Application1">
<displayNameKey>sub Application1</displayNameKey>
<displayName>sub Application1</displayName>
<category id="2423" name="about">
<displayNameKey>subApp.about</displayNameKey>
<displayName>subApp.about</displayName>
<category id="2423" name="comms">
<displayNameKey>subApp.comms</displayNameKey>
<displayName>subApp.comms</displayName>
<property id="5909" name="copyright" type="string_property" width="40">
<value>2014</value>
</property>
<property id="5910" name="os" type="string_property" width="40">
<value>Linux 2.6.32-431.29.2.el6.x86_64</value>
</property>
</category>
<property id="5908" name="releaseNumber" type="string_property" width="40">
<value>9.1.0.3.0.54</value>
</property>
</category>
</application>
</suite>
输出应如下所示:
Suite: MainApplication
Application: Sub Application1
Category: about
property: releaseNumber | 9.1.0.3.0.54
category: comms
property: copyright | 2014
property: os | Linux 2.6.32-431.29.2.el6.x86_64
任何正确方向的指示都会有所帮助。
import xml.etree.ElementTree as ET
# Load work.xml into an ElementTree.
tree = ET.ElementTree(file='work.xml')
# Current indentation width in spaces; printRecur adjusts it while recursing.
indent = 0
# Tags that printRecur skips entirely.
ignoreElems = ['displayNameKey', 'displayName']
def printRecur(root):
    """Recursively print ``root`` and its descendants as an indented outline.

    Each element is printed as ``Tag: label``: the tag is title-cased and the
    label is the element's ``name`` attribute, falling back to the element's
    text when that attribute is missing. Elements whose tag is listed in the
    module-level ``ignoreElems`` are skipped together with their children.

    The nesting depth is tracked in the module-level ``indent`` counter, which
    grows by four spaces per level and is restored before returning.
    """
    # ``global`` has to come before any use of the name inside the function;
    # declaring it after the first read is a SyntaxError on Python 3.
    global indent
    if root.tag in ignoreElems:
        return
    label = root.attrib.get('name', root.text)
    print(' ' * indent + '%s: %s' % (root.tag.title(), label))
    indent += 4
    try:
        # Element.getchildren() was removed in Python 3.9; iterating the
        # element yields the same direct children.
        for elem in root:
            printRecur(elem)
    finally:
        # Restore the counter even if printing a descendant fails.
        indent -= 4
# Start the recursive printout at the document's root element.
root = tree.getroot()
printRecur(root)
输出:
Suite: MainApplication
Application: Sub Application1
Category: about
Category: comms
Property: copyright
Value: 2014
Property: os
Value: Linux 2.6.32-431.29.2.el6.x86_64
Property: releaseNumber
Value: 9.1.0.3.0.54
这是我在 5 分钟内能做到的最接近的结果。您只需递归地调用一个处理函数即可,可以在此基础上继续改进 :)
您还可以为每个标签定义处理函数,并将它们全部放入字典中以便于查找。然后你可以检查你是否有一个适合该标签的处理函数,然后调用它,否则继续盲目打印。例如:
# Maps an element tag to the *name* of the module-level function that prints
# it; printRecur looks the name up through globals() when it meets the tag.
# Register more handlers by adding '<tag_name>': '<handler_function>' entries.
HANDLERS = {
    'property': 'handle_property',
}
def handle_property(root):
    """Print a ``property`` element and its values on a single line.

    The line has the form ``Tag: name | v1, v2`` and is prefixed with the
    number of spaces held in the module-level ``indent`` counter. Only the
    text of direct ``<value>`` children is included.

    Raises:
        KeyError: if ``root`` has no ``name`` attribute.
    """
    data = ' ' * indent + '%s: %s ' % (root.tag.title(), root.attrib['name'])
    # Element.getchildren() was removed in Python 3.9; findall('value')
    # selects the direct <value> children. An empty <value/> has text None,
    # which str.join() rejects, so it is rendered as an empty string.
    values = [elem.text or '' for elem in root.findall('value')]
    print(data + '| %s' % ', '.join(values))
# printRecur would get modified accordingly.
def printRecur(root):
    """Recursively print the tree, delegating registered tags to handlers.

    Elements whose tag is listed in the module-level ``ignoreElems`` are
    skipped. For a tag present in ``HANDLERS`` the named module-level function
    is looked up through ``globals()`` and called with the element; such an
    element's children are left to the handler and are not visited again
    here. Every other element is printed as ``Tag: label`` (the ``name``
    attribute, falling back to the element's text) and its children are
    processed recursively.

    The module-level ``indent`` counter is raised by four before an element is
    printed or handled and lowered again afterwards.
    """
    global indent
    if root.tag in ignoreElems:
        return
    indent += 4
    try:
        if root.tag in HANDLERS:
            handler = globals()[HANDLERS[root.tag]]
            handler(root)
        else:
            label = root.attrib.get('name', root.text)
            print(' ' * indent + '%s: %s' % (root.tag.title(), label))
            # Element.getchildren() was removed in Python 3.9; iterating the
            # element yields the same direct children.
            for elem in root:
                printRecur(elem)
    finally:
        # Restore the counter even if a handler raises.
        indent -= 4
上面的输出:
Suite: MainApplication
Application: Sub Application1
Category: about
Category: comms
Property: copyright | 2014
Property: os | Linux 2.6.32-431.29.2.el6.x86_64
Property: releaseNumber | 9.1.0.3.0.54
我发现这种做法非常有用,可以避免在代码中堆砌大量 if/else。
如果您想要一种通用的 xml 导入器,请为每个 xml 元素创建一条记录
import pandas as pd
import xml.etree.ElementTree as ET
# Parse file.xml and take its root element as the starting point for rij().
tree = ET.parse('file.xml')
root = tree.getroot()
def rij(elem, level, tags, rtag, mtag, keys, rootkey, data):
    """Flatten ``elem`` and its descendants into one row per element with text.

    Args:
        elem: element to process.
        level: depth of ``elem``; the top-level call passes 0.
        tags: list of local tag names of the ancestors of ``elem``. It is
            extended with the tag of ``elem`` while the element is processed
            and restored before returning.
        rtag: local tag of the enclosing level-1 element ('' above level 1).
        mtag: local tag of the parent element ('' for the top-level call).
        keys: attribute value(s) carried down from the caller; replaced by
            the element's own attribute value(s) when ``level != 1`` and the
            element has attributes.
        rootkey: attribute value(s) of the enclosing level-1 element; set when
            ``level == 1`` and the element has attributes.
        data: list the rows are appended to.

    Returns:
        ``data``, extended with one row for ``elem`` and for every descendant
        whose text is not None. A row is
        ``[rootkey, tags, rtag, otag, mtag, lkey, keys, text]`` where ``otag``
        is the parent's tag, ``mtag`` the element's own tag, ``lkey`` the
        attribute name when the element has exactly one attribute (``[]``
        otherwise) and ``text`` the element text, with whitespace-only text
        stored as ``''``.
    """
    otag = mtag
    # Strip a '{namespace}' prefix; rfind() returns -1 when there is none, so
    # the slice then starts at 0 and keeps the whole tag.
    mtag = elem.tag[elem.tag.rfind('}') + 1:]
    tags.append(mtag)

    attr_names = list(elem.keys())
    # Always bound, so level-1 elements with text no longer hit an unbound
    # local when the row is built.
    lkey = []
    if level == 1:
        rtag = mtag
        if len(attr_names) > 1:
            rootkey = [elem.attrib.get(name) for name in attr_names]
        elif attr_names:
            rootkey = elem.attrib.get(attr_names[0])
    elif len(attr_names) > 1:
        keys = [elem.attrib.get(name) for name in attr_names]
    elif attr_names:
        lkey = attr_names[0]
        keys = elem.attrib.get(lkey)

    if elem.text is not None:
        # Indentation between tags shows up as whitespace-only text whose
        # exact length depends on the nesting depth, so test for blankness
        # rather than for one particular literal.
        text = elem.text if elem.text.strip() else ''
        # Store a snapshot of the path: ``tags`` keeps being mutated and is
        # back to its initial content once the walk has finished.
        data.append([rootkey, list(tags), rtag, otag, mtag, lkey, keys, text])

    # Element.getchildren() was removed in Python 3.9; iterate directly.
    for chil in elem:
        data = rij(chil, level + 1, tags, rtag, mtag, keys, rootkey, data)

    # Drop this element's own entry (the last one); list.remove() would drop
    # the first equal tag and corrupt the path when an ancestor shares it.
    tags.pop()
    return data
# Flatten the whole document: level 0, no ancestor tags, empty root/parent
# tags, empty key holders and an empty result list.
data = rij(root,0,[],'','', [],[],[])
如果你只想要一个最精简的 XML 树递归遍历代码片段:
from xml.etree import ElementTree
# Parse english_saheeh.xml and grab its root element.
tree = ElementTree.parse('english_saheeh.xml')
root = tree.getroot()
def walk_tree_recursive(root, visit=None):
    """Walk ``root`` and all of its descendants depth-first.

    Args:
        root: element to start from.
        visit: optional callable invoked with each element before its
            children are walked. When omitted, the tree is traversed without
            doing any per-element work.
    """
    # Per-element work (e.g. inspecting ``root.tag``) happens here.
    if visit is not None:
        visit(root)
    for child in root:
        walk_tree_recursive(child, visit)
# Traverse the whole document starting at its root element.
walk_tree_recursive(root)
我正在尝试使用 Python ElementTree 解析以下 XML,并生成如下所示的输出。我正在尝试为顶级元素编写模块来打印它们。然而,这有点棘手,因为 category 元素可能包含也可能不包含 property 元素,而且 category 元素内部还可能嵌套 category 元素。
我参考过此前关于该主题的问题,但它们都不涉及同名的嵌套元素。
我的代码: http://pastebin.com/Fsv2Xzqf
work.xml:
<suite id="1" name="MainApplication">
<displayNameKey>my Application</displayNameKey>
<displayName>my Application</displayName>
<application id="2" name="Sub Application1">
<displayNameKey>sub Application1</displayNameKey>
<displayName>sub Application1</displayName>
<category id="2423" name="about">
<displayNameKey>subApp.about</displayNameKey>
<displayName>subApp.about</displayName>
<category id="2423" name="comms">
<displayNameKey>subApp.comms</displayNameKey>
<displayName>subApp.comms</displayName>
<property id="5909" name="copyright" type="string_property" width="40">
<value>2014</value>
</property>
<property id="5910" name="os" type="string_property" width="40">
<value>Linux 2.6.32-431.29.2.el6.x86_64</value>
</property>
</category>
<property id="5908" name="releaseNumber" type="string_property" width="40">
<value>9.1.0.3.0.54</value>
</property>
</category>
</application>
</suite>
输出应如下所示:
Suite: MainApplication
Application: Sub Application1
Category: about
property: releaseNumber | 9.1.0.3.0.54
category: comms
property: copyright | 2014
property: os | Linux 2.6.32-431.29.2.el6.x86_64
任何正确方向的指示都会有所帮助。
import xml.etree.ElementTree as ET
# Load work.xml into an ElementTree.
tree = ET.ElementTree(file='work.xml')
# Current indentation width in spaces; printRecur adjusts it while recursing.
indent = 0
# Tags that printRecur skips entirely.
ignoreElems = ['displayNameKey', 'displayName']
def printRecur(root):
    """Recursively print ``root`` and its descendants as an indented outline.

    Each element is printed as ``Tag: label``: the tag is title-cased and the
    label is the element's ``name`` attribute, falling back to the element's
    text when that attribute is missing. Elements whose tag is listed in the
    module-level ``ignoreElems`` are skipped together with their children.

    The nesting depth is tracked in the module-level ``indent`` counter, which
    grows by four spaces per level and is restored before returning.
    """
    # ``global`` has to come before any use of the name inside the function;
    # declaring it after the first read is a SyntaxError on Python 3.
    global indent
    if root.tag in ignoreElems:
        return
    label = root.attrib.get('name', root.text)
    print(' ' * indent + '%s: %s' % (root.tag.title(), label))
    indent += 4
    try:
        # Element.getchildren() was removed in Python 3.9; iterating the
        # element yields the same direct children.
        for elem in root:
            printRecur(elem)
    finally:
        # Restore the counter even if printing a descendant fails.
        indent -= 4
# Start the recursive printout at the document's root element.
root = tree.getroot()
printRecur(root)
输出:
Suite: MainApplication
Application: Sub Application1
Category: about
Category: comms
Property: copyright
Value: 2014
Property: os
Value: Linux 2.6.32-431.29.2.el6.x86_64
Property: releaseNumber
Value: 9.1.0.3.0.54
这是我在 5 分钟内能做到的最接近的结果。您只需递归地调用一个处理函数即可,可以在此基础上继续改进 :)
您还可以为每个标签定义处理函数,并将它们全部放入字典中以便于查找。然后你可以检查你是否有一个适合该标签的处理函数,然后调用它,否则继续盲目打印。例如:
# Maps an element tag to the *name* of the module-level function that prints
# it; printRecur looks the name up through globals() when it meets the tag.
# Register more handlers by adding '<tag_name>': '<handler_function>' entries.
HANDLERS = {
    'property': 'handle_property',
}
def handle_property(root):
    """Print a ``property`` element and its values on a single line.

    The line has the form ``Tag: name | v1, v2`` and is prefixed with the
    number of spaces held in the module-level ``indent`` counter. Only the
    text of direct ``<value>`` children is included.

    Raises:
        KeyError: if ``root`` has no ``name`` attribute.
    """
    data = ' ' * indent + '%s: %s ' % (root.tag.title(), root.attrib['name'])
    # Element.getchildren() was removed in Python 3.9; findall('value')
    # selects the direct <value> children. An empty <value/> has text None,
    # which str.join() rejects, so it is rendered as an empty string.
    values = [elem.text or '' for elem in root.findall('value')]
    print(data + '| %s' % ', '.join(values))
# printRecur would get modified accordingly.
def printRecur(root):
    """Recursively print the tree, delegating registered tags to handlers.

    Elements whose tag is listed in the module-level ``ignoreElems`` are
    skipped. For a tag present in ``HANDLERS`` the named module-level function
    is looked up through ``globals()`` and called with the element; such an
    element's children are left to the handler and are not visited again
    here. Every other element is printed as ``Tag: label`` (the ``name``
    attribute, falling back to the element's text) and its children are
    processed recursively.

    The module-level ``indent`` counter is raised by four before an element is
    printed or handled and lowered again afterwards.
    """
    global indent
    if root.tag in ignoreElems:
        return
    indent += 4
    try:
        if root.tag in HANDLERS:
            handler = globals()[HANDLERS[root.tag]]
            handler(root)
        else:
            label = root.attrib.get('name', root.text)
            print(' ' * indent + '%s: %s' % (root.tag.title(), label))
            # Element.getchildren() was removed in Python 3.9; iterating the
            # element yields the same direct children.
            for elem in root:
                printRecur(elem)
    finally:
        # Restore the counter even if a handler raises.
        indent -= 4
上面的输出:
Suite: MainApplication
Application: Sub Application1
Category: about
Category: comms
Property: copyright | 2014
Property: os | Linux 2.6.32-431.29.2.el6.x86_64
Property: releaseNumber | 9.1.0.3.0.54
我发现这种做法非常有用,可以避免在代码中堆砌大量 if/else。
如果您想要一种通用的 xml 导入器,请为每个 xml 元素创建一条记录
import pandas as pd
import xml.etree.ElementTree as ET
# Parse file.xml and take its root element as the starting point for rij().
tree = ET.parse('file.xml')
root = tree.getroot()
def rij(elem, level, tags, rtag, mtag, keys, rootkey, data):
    """Flatten ``elem`` and its descendants into one row per element with text.

    Args:
        elem: element to process.
        level: depth of ``elem``; the top-level call passes 0.
        tags: list of local tag names of the ancestors of ``elem``. It is
            extended with the tag of ``elem`` while the element is processed
            and restored before returning.
        rtag: local tag of the enclosing level-1 element ('' above level 1).
        mtag: local tag of the parent element ('' for the top-level call).
        keys: attribute value(s) carried down from the caller; replaced by
            the element's own attribute value(s) when ``level != 1`` and the
            element has attributes.
        rootkey: attribute value(s) of the enclosing level-1 element; set when
            ``level == 1`` and the element has attributes.
        data: list the rows are appended to.

    Returns:
        ``data``, extended with one row for ``elem`` and for every descendant
        whose text is not None. A row is
        ``[rootkey, tags, rtag, otag, mtag, lkey, keys, text]`` where ``otag``
        is the parent's tag, ``mtag`` the element's own tag, ``lkey`` the
        attribute name when the element has exactly one attribute (``[]``
        otherwise) and ``text`` the element text, with whitespace-only text
        stored as ``''``.
    """
    otag = mtag
    # Strip a '{namespace}' prefix; rfind() returns -1 when there is none, so
    # the slice then starts at 0 and keeps the whole tag.
    mtag = elem.tag[elem.tag.rfind('}') + 1:]
    tags.append(mtag)

    attr_names = list(elem.keys())
    # Always bound, so level-1 elements with text no longer hit an unbound
    # local when the row is built.
    lkey = []
    if level == 1:
        rtag = mtag
        if len(attr_names) > 1:
            rootkey = [elem.attrib.get(name) for name in attr_names]
        elif attr_names:
            rootkey = elem.attrib.get(attr_names[0])
    elif len(attr_names) > 1:
        keys = [elem.attrib.get(name) for name in attr_names]
    elif attr_names:
        lkey = attr_names[0]
        keys = elem.attrib.get(lkey)

    if elem.text is not None:
        # Indentation between tags shows up as whitespace-only text whose
        # exact length depends on the nesting depth, so test for blankness
        # rather than for one particular literal.
        text = elem.text if elem.text.strip() else ''
        # Store a snapshot of the path: ``tags`` keeps being mutated and is
        # back to its initial content once the walk has finished.
        data.append([rootkey, list(tags), rtag, otag, mtag, lkey, keys, text])

    # Element.getchildren() was removed in Python 3.9; iterate directly.
    for chil in elem:
        data = rij(chil, level + 1, tags, rtag, mtag, keys, rootkey, data)

    # Drop this element's own entry (the last one); list.remove() would drop
    # the first equal tag and corrupt the path when an ancestor shares it.
    tags.pop()
    return data
# Flatten the whole document: level 0, no ancestor tags, empty root/parent
# tags, empty key holders and an empty result list.
data = rij(root,0,[],'','', [],[],[])
如果你只想要一个最精简的 XML 树递归遍历代码片段:
from xml.etree import ElementTree
# Parse english_saheeh.xml and grab its root element.
tree = ElementTree.parse('english_saheeh.xml')
root = tree.getroot()
def walk_tree_recursive(root, visit=None):
    """Walk ``root`` and all of its descendants depth-first.

    Args:
        root: element to start from.
        visit: optional callable invoked with each element before its
            children are walked. When omitted, the tree is traversed without
            doing any per-element work.
    """
    # Per-element work (e.g. inspecting ``root.tag``) happens here.
    if visit is not None:
        visit(root)
    for child in root:
        walk_tree_recursive(child, visit)
# Traverse the whole document starting at its root element.
walk_tree_recursive(root)