在 Python 中使用递归函数将嵌套的 xml 标签转换为字典列表时处理重复的 xml 标签
Handling duplicate xml tags while converting nested xml tags to list of dictionaries using recursive function in Python
我有以下 XML 字符串格式的数据,我使用 Python 的 lxml
包将其解析为 XML。
更新:我已经更新了代码和输出
现在,我必须遍历这个XML数据:
<A xmlns="dfjdlfkdjflsd">
<B>
<B1>B_1</B1>
<B2>B_2</B2>
<B3>
<B31>B3_1</B31>
<B32>B3_2</B32>
<B33>
<B331>
<B3311></B3311>
</B331>
<B332>
<B3321></B3321>
</B332>
</B33>
<B34>
<B341>
<B3411></B3411>
</B341>
<B342>
<B3421></B3421>
</B342>
</B34>
<B35>
<B351>B35_1</B351>
<B352>
<B3521>B352_1</B3521>
<B3522>B352_2</B3522>
<B3523>B352_3</B3523>
<B3524>
<B35241>
<B352411></B352411>
<B352412></B352412>
<B352413></B352413>
</B35241>
</B3524>
</B352>
<B352>
<B3521>B352_4</B3521>
<B3522>B352_5</B3522>
<B3523>B352_6</B3523>
<B3524>
<B35241>
<B352411></B352411>
<B352412></B352412>
<B352413></B352413>
</B35241>
</B3524>
</B352>
<B352>
<B3521>B352_7</B3521>
<B3522>B352_8</B3522>
<B3523>B352_9</B3523>
<B3524>
<B35241>
<B352411></B352411>
<B352412></B352412>
<B352413></B352413>
</B35241>
</B3524>
</B352>
</B35>
<B36>
<B361>B36_1</B361>
<B362>B36_2</B362>
</B36>
</B3>
</B>
<C>
<C1>B_1</C1>
<C2>B_2</C2>
<C3>
<C31>C3_1</C31>
<C32>C3_2</C32>
<C33>
<C331>
<C3311></C3311>
</C331>
<C332>
<C3321></C3321>
</C332>
</C33>
</C3>
</C>
</A>
并生成特定格式的输出,如下所示:
[{'B1': 'B_1',
'B2': 'B_2',
'B3_B31': 'B3_1',
'B3_B32': 'B3_2',
'B3_B33_B331_B3311': '-',
'B3_B33_B332_B3321': '-',
'B3_B34_B341_B3411': '-',
'B3_B34_B342_B3421': '-',
'B3_B35_B352': [
{
'B3_B35_B352_B3521': 'B352_1',
'B3_B35_B352_B3522': 'B352_2',
'B3_B35_B352_B3523': 'B352_3',
'B3_B35_B352_B3524_B35241_B352411': '-',
'B3_B35_B352_B3524_B35241_B352412': '-',
'B3_B35_B352_B3524_B35241_B352413': '-'
},
{
'B3_B35_B352_B3521': 'B352_4',
'B3_B35_B352_B3522': 'B352_5',
'B3_B35_B352_B3523': 'B352_6',
'B3_B35_B352_B3524_B35241_B352411': '-',
'B3_B35_B352_B3524_B35241_B352412': '-',
'B3_B35_B352_B3524_B35241_B352413': '-'
},
{
'B3_B35_B352_B3521': 'B352_7',
'B3_B35_B352_B3522': 'B352_8',
'B3_B35_B352_B3523': 'B352_9',
'B3_B35_B352_B3524_B35241_B352411': '-',
'B3_B35_B352_B3524_B35241_B352412': '-',
'B3_B35_B352_B3524_B35241_B352413': '-'
}
],
'B3_B36_B361': 'B36_1',
'B3_B36_B362': 'B36_2'},
{'C1': 'B_1',
'C2': 'B_2',
'C3_C31': 'C3_1',
'C3_C32': 'C3_2',
'C3_C33_C331_C3311': '-',
'C3_C33_C332_C3321': '-'}]
我的这个问题是之前一个问题的后续;在那个问题中,我已经能够遍历嵌套的 XML 标签并生成输出。
但有一件事是我在处理存在重复 XML 标签的场景时遇到了问题。
使用我当前的代码,我得到了这个输出。
[{'B1': 'B_1',
'B2': 'B_2',
'B3_B31': 'B3_1',
'B3_B32': 'B3_2',
'B3_B33_B331_B3311': '-',
'B3_B33_B332_B3321': '-',
'B3_B34_B341_B3411': '-',
'B3_B34_B342_B3421': '-',
'B3_B35_B351': 'B35_1',
'B3_B35_B352_B3521_B35241': '1',
'B3_B35_B352_B3521_B35242': '2',
'B3_B35_B352_B3521_B35243': '3',
'B3_B35_B353_B3531': 'B353_1',
'B3_B36_B361': 'B36_1',
'B3_B36_B362': 'B36_2',
'duplicate': [{'B3_B35_B352_B3521_B35241': '4',
'B3_B35_B352_B3521_B35242': '5',
'B3_B35_B352_B3521_B35243': '6'},
{'B3_B35_B352_B3521_B35241': '7',
'B3_B35_B352_B3521_B35242': '8',
'B3_B35_B352_B3521_B35243': '9',
'B3_B35_B353_B3532_B35321': 'B3532_3',
'B3_B35_B353_B3532_B35322': 'B3532_4'},
{'B3_B35_B353_B3532_B35321': 'B3532_5',
'B3_B35_B353_B3532_B35322': 'B3532_6'},
{'B3_B35_B353_B3532_B35321': 'B3532_1',
'B3_B35_B353_B3532_B35322': 'B3532_2'}]},
{'C1': 'B_1',
'C2': 'B_2',
'C3_C31': 'C3_1',
'C3_C32': 'C3_2',
'C3_C33_C331_C3311': '-',
'C3_C33_C332_C3321': '-'}]
现在,如果比较预期输出和实际输出,您会发现在重复的 XML 标记处键名不同。同样在重复列表中,XML 标签相互混淆。
我必须单独使用 _handle_duplicates
方法来处理重复的 XML 标签。
这是我现在使用的代码
class ParseXML:
    """
    Flatten each top-level XML section into a dictionary of leaf values.

    Keys are the tag names on the path from a section's child element down
    to a leaf, joined with ``_``; a leaf without text is stored as ``'-'``.
    Leaves whose key repeats inside a section are collected into a list of
    dictionaries stored under the single ``'duplicate'`` key.

    :param xml_input: XML document as a string
    """

    def __init__(self, xml_input):
        self.main_output = []
        # NOTE(review): ``recover=True`` is an lxml parser option, so ``et`` is
        # presumably ``lxml.etree`` - confirm against the file's imports.
        parser = et.XMLParser(recover=True)
        # Namespace declarations are stripped from the text before parsing.
        # Raw string: ``\s`` and ``\w`` are regex escapes, not string escapes.
        self.tree = et.fromstring(re.sub(r'\s*xmlns(:\w+)?="[^"]*"', '', xml_input), parser=parser)

    def parse_xml(self):
        """
        Build one flattened dictionary per child of the root element.

        The result list is rebuilt on every call, so calling this method
        twice does not append the same sections a second time.

        :return: list of flattened dictionaries, one per top-level section
        """
        self.main_output = []
        for interface in self.tree:
            temp_output = {}
            for children in interface:
                temp_list = []
                temp_dict = {}
                for key, value in self._flatten(children):
                    if key in temp_output:
                        # Seeing a repeated key again means a new repetition
                        # started: close the one collected so far.
                        if key in temp_dict:
                            temp_list.append(temp_dict)
                            temp_dict = {}
                        temp_dict[key] = value
                    else:
                        temp_output[key] = value
                if temp_dict:
                    self._handle_duplicates(temp_output, temp_dict, temp_list)
            # Appending temp_output directly also covers a section that has no
            # children at all, which yields an empty dictionary.
            self.main_output.append(temp_output)
        return self.main_output

    def _flatten(self, node, tags=None):
        """
        Yield ``(key, text)`` for every leaf element below ``node``.

        :param node: element to walk
        :param tags: tag names of the ancestors already visited
        :return: generator of ``(key, value)`` tuples
        """
        path = (tags or []) + [node.tag]
        children = list(node)
        if not children:
            yield '_'.join(path), ('-' if node.text is None else node.text)
            return
        for child in children:
            yield from self._flatten(child, path)

    def _handle_duplicates(self, temp_output, temp_dict, temp_list):
        """
        Move repeated keys out of ``temp_output`` into its ``'duplicate'`` list.

        :param temp_output: flattened section; modified in place
        :param temp_dict: the last repetition that was collected
        :param temp_list: earlier repetitions; modified in place
        :return: ``temp_output``
        """
        temp_list.append(temp_dict)
        # The first occurrence still sits in the flat output; move it into the
        # list as well (it ends up last, as in the output shown above).
        first_seen = {dup: temp_output.pop(dup) for dup in temp_dict}
        temp_list.append(first_seen)
        temp_output['duplicate'] = temp_list
        return temp_output
if __name__ == '__main__':
    # Parse the XML string held in ``data`` and pretty-print the result.
    parse = ParseXML(data)
    output = parse.parse_xml()
    pprint(output)
当前代码能够处理重复的 XML 标签,但不是我想要的格式。此外,最好在 _flatten
方法中处理这些重复的 XML 标记,而不是使用不同的 code/method 来处理。
有没有人可以帮忙看一下,并就如何处理重复的 XML 标签给我一些指导?
您可以将 collections.defaultdict
与递归一起使用:
import xml.etree.ElementTree as ET, re, json
from collections import defaultdict
# Strip the default-namespace declaration before parsing. The pattern is a raw
# string and accepts any quoted value, so URI-style namespaces are removed too.
t = ET.fromstring(re.sub(r'\sxmlns="[^"]*"', '', s_xml))
def get_groups(d, p=None):
    """
    Yield ``[key, value]`` pairs describing the leaves below element ``d``.

    A key is the ``_``-joined tag path with a leading all-uppercase tag
    (the section element itself) removed. A leaf yields its text, or ``'-'``
    when it has none. Sibling elements that share a tag are yielded once, as
    ``[path_of_that_tag, [dict_for_each_sibling, ...]]`` in document order,
    so repeated elements with different values are all kept.

    :param d: element to walk
    :param p: tag names of the ancestors already visited
    :return: generator of two-item lists suitable for ``dict()``
    """
    path = (p or []) + [d.tag]
    children = list(d)
    if not children:
        yield [re.sub(r'^[A-Z]+_', '', '_'.join(path)), '-' if d.text is None else d.text]
        return
    # Group siblings by tag; a tag that occurs more than once is a repetition.
    siblings = defaultdict(list)
    for child in children:
        siblings[child.tag].append(child)
    for tag, members in siblings.items():
        if len(members) == 1:
            yield from get_groups(members[0], path)
        else:
            key = re.sub(r'^[A-Z]+_', '', '_'.join(path + [tag]))
            yield [key, [dict(get_groups(member, path)) for member in members]]
# One flattened dictionary per child element of the parsed root.
r = [dict(get_groups(section)) for section in t]
输出:
[{'B1': 'B_1', 'B2': 'B_2', 'B3_B31': 'B3_1', 'B3_B32': 'B3_2', 'B3_B33_B331_B3311': '-', 'B3_B33_B332_B3321': '-', 'B3_B34_B341_B3411': '-', 'B3_B34_B342_B3421': '-', 'B3_B35_B351': 'B35_1', 'B3_B35': [{'B3_B35_B352_B3521': 'B352_1', 'B3_B35_B352_B3522': 'B352_2', 'B3_B35_B352_B3523': 'B352_3', 'B3_B35_B352_B3524_B35241_B352411': '-', 'B3_B35_B352_B3524_B35241_B352412': '-', 'B3_B35_B352_B3524_B35241_B352413': '-'}, {'B3_B35_B352_B3521': 'B352_1', 'B3_B35_B352_B3522': 'B352_2', 'B3_B35_B352_B3523': 'B352_3', 'B3_B35_B352_B3524_B35241_B352411': '-', 'B3_B35_B352_B3524_B35241_B352412': '-', 'B3_B35_B352_B3524_B35241_B352413': '-'}, {'B3_B35_B352_B3521': 'B352_1', 'B3_B35_B352_B3522': 'B352_2', 'B3_B35_B352_B3523': 'B352_3', 'B3_B35_B352_B3524_B35241_B352411': '-', 'B3_B35_B352_B3524_B35241_B352412': '-', 'B3_B35_B352_B3524_B35241_B352413': '-'}], 'B3_B36_B361': 'B36_1', 'B3_B36_B362': 'B36_2'}, {'C1': 'B_1', 'C2': 'B_2', 'C3_C31': 'C3_1', 'C3_C32': 'C3_2', 'C3_C33_C331_C3311': '-', 'C3_C33_C332_C3321': '-'}]
不知何故,我设法实现了我所需要的。但是,我并不为这段代码感到自豪。如果有人为我提供比这更好、更 pythonic 的代码,我会非常高兴。
class ParseXML:
    """
    Flatten an XML document into one dictionary per top-level section.

    Every leaf element becomes a ``key: text`` entry whose key is the
    ``_``-joined tag path below the section element; a leaf without text is
    stored as ``'-'``. Leaves whose key repeats are gathered into a list of
    dictionaries stored under the tag path the repeated keys have in common.

    :param xml_input: XML document as a string
    """

    def __init__(self, xml_input):
        self.main_output = []
        # Default-namespace declarations are stripped from the text before
        # parsing. ``[^"]*`` stops at the closing quote of the declaration.
        self.tree = et.fromstring(re.sub(r'\s*xmlns="[^"]*"', '', xml_input))

    def parse_xml(self):
        """
        Build one flattened dictionary per child of the root element.

        The result list is rebuilt on every call, so calling this method
        twice does not append the same sections a second time.

        :return: list of flattened dictionaries, one per top-level section
        """
        self.main_output = []
        for interface in self.tree:
            temp_output = {}
            for children in interface:
                temp_dup = {}
                dup_keys = []
                dup_child = []
                for key, value in self._flatten(children):
                    if key in temp_output:
                        # A key already collected for the current repetition
                        # marks the start of the next one.
                        if key in dup_keys:
                            dup_child.append(temp_dup)
                            temp_dup = {}
                            dup_keys = []
                        dup_keys.append(key)
                        temp_dup[key] = value
                    else:
                        # A new key ends a run of repetitions: merge the run
                        # before storing the new value.
                        if temp_dup:
                            temp_output = self._merge_duplicates(dup_keys, temp_output, temp_dup, dup_child)
                            dup_child = []
                            dup_keys = []
                            temp_dup = {}
                        temp_output[key] = value
                # The run of repetitions may be the last thing in this child.
                if temp_dup:
                    temp_output = self._merge_duplicates(dup_keys, temp_output, temp_dup, dup_child)
            self.main_output.append(temp_output)
        return self.main_output

    @staticmethod
    def common_substr(data):
        """
        Return the tag path shared by all keys in ``data``.

        Keys are compared one ``_``-separated component at a time and the
        comparison stops at the first component that differs, so the result
        is always a prefix made of whole components.

        :param data: flattened keys of a repeated XML element
        :return: common leading tag path, or ``''`` when ``data`` is empty
        """
        if not data:
            return ''
        shared = []
        for parts in zip(*(key.split('_') for key in data)):
            if len(set(parts)) != 1:
                break
            shared.append(parts[0])
        return '_'.join(shared)

    def _flatten(self, node, tags=None):
        """
        Yield ``(key, text)`` for every leaf element below ``node``.

        :param node: element to walk
        :param tags: tag names of the ancestors already visited
        :return: generator of ``(key, value)`` tuples
        """
        path = (tags or []) + [node.tag]
        children = list(node)
        if not children:
            # A leaf without text is reported as '-'
            yield '_'.join(path), ('-' if node.text is None else node.text)
            return
        for child in children:
            yield from self._flatten(child, path)

    def _merge_duplicates(self, dup_keys, temp_output, temp_dup, dup_child):
        """
        Replace repeated flat keys with a list of dictionaries.

        The list is stored under the common tag path of ``dup_keys`` and keeps
        document order: first occurrence, middle occurrences, last occurrence.

        :param dup_keys: keys of the last repetition
        :param temp_output: flattened section; modified in place
        :param temp_dup: key/value pairs of the last repetition
        :param dup_child: dictionaries of the repetitions between first and last
        :return: ``temp_output``
        """
        common_key = self.common_substr(dup_keys)
        # Pop the first occurrence before writing the merged list: the common
        # key can be equal to a repeated key (a single repeated leaf).
        first_seen = {dup_key: temp_output.pop(dup_key) for dup_key in dup_keys}
        temp_output[common_key] = [first_seen, *dup_child, temp_dup]
        return temp_output
if __name__ == '__main__':
    # Parse the XML string held in ``DATA`` and pretty-print the result.
    parse = ParseXML(DATA)
    result = parse.parse_xml()
    pprint(result)
P.S. _flatten
方法由 @Ajax1234 提供。
我有以下 XML 字符串格式的数据,我使用 Python 的 lxml
包将其解析为 XML。
更新:我已经更新了代码和输出
现在,我必须遍历这个XML数据:
<A xmlns="dfjdlfkdjflsd">
<B>
<B1>B_1</B1>
<B2>B_2</B2>
<B3>
<B31>B3_1</B31>
<B32>B3_2</B32>
<B33>
<B331>
<B3311></B3311>
</B331>
<B332>
<B3321></B3321>
</B332>
</B33>
<B34>
<B341>
<B3411></B3411>
</B341>
<B342>
<B3421></B3421>
</B342>
</B34>
<B35>
<B351>B35_1</B351>
<B352>
<B3521>B352_1</B3521>
<B3522>B352_2</B3522>
<B3523>B352_3</B3523>
<B3524>
<B35241>
<B352411></B352411>
<B352412></B352412>
<B352413></B352413>
</B35241>
</B3524>
</B352>
<B352>
<B3521>B352_4</B3521>
<B3522>B352_5</B3522>
<B3523>B352_6</B3523>
<B3524>
<B35241>
<B352411></B352411>
<B352412></B352412>
<B352413></B352413>
</B35241>
</B3524>
</B352>
<B352>
<B3521>B352_7</B3521>
<B3522>B352_8</B3522>
<B3523>B352_9</B3523>
<B3524>
<B35241>
<B352411></B352411>
<B352412></B352412>
<B352413></B352413>
</B35241>
</B3524>
</B352>
</B35>
<B36>
<B361>B36_1</B361>
<B362>B36_2</B362>
</B36>
</B3>
</B>
<C>
<C1>B_1</C1>
<C2>B_2</C2>
<C3>
<C31>C3_1</C31>
<C32>C3_2</C32>
<C33>
<C331>
<C3311></C3311>
</C331>
<C332>
<C3321></C3321>
</C332>
</C33>
</C3>
</C>
</A>
并生成特定格式的输出,如下所示:
[{'B1': 'B_1',
'B2': 'B_2',
'B3_B31': 'B3_1',
'B3_B32': 'B3_2',
'B3_B33_B331_B3311': '-',
'B3_B33_B332_B3321': '-',
'B3_B34_B341_B3411': '-',
'B3_B34_B342_B3421': '-',
'B3_B35_B352': [
{
'B3_B35_B352_B3521': 'B352_1',
'B3_B35_B352_B3522': 'B352_2',
'B3_B35_B352_B3523': 'B352_3',
'B3_B35_B352_B3524_B35241_B352411': '-',
'B3_B35_B352_B3524_B35241_B352412': '-',
'B3_B35_B352_B3524_B35241_B352413': '-'
},
{
'B3_B35_B352_B3521': 'B352_4',
'B3_B35_B352_B3522': 'B352_5',
'B3_B35_B352_B3523': 'B352_6',
'B3_B35_B352_B3524_B35241_B352411': '-',
'B3_B35_B352_B3524_B35241_B352412': '-',
'B3_B35_B352_B3524_B35241_B352413': '-'
},
{
'B3_B35_B352_B3521': 'B352_7',
'B3_B35_B352_B3522': 'B352_8',
'B3_B35_B352_B3523': 'B352_9',
'B3_B35_B352_B3524_B35241_B352411': '-',
'B3_B35_B352_B3524_B35241_B352412': '-',
'B3_B35_B352_B3524_B35241_B352413': '-'
}
],
'B3_B36_B361': 'B36_1',
'B3_B36_B362': 'B36_2'},
{'C1': 'B_1',
'C2': 'B_2',
'C3_C31': 'C3_1',
'C3_C32': 'C3_2',
'C3_C33_C331_C3311': '-',
'C3_C33_C332_C3321': '-'}]
我的这个问题是之前一个问题的后续问题。
但有一件事是我在处理存在重复 XML 标签的场景时遇到了问题。
使用我当前的代码,我得到了这个输出。
[{'B1': 'B_1',
'B2': 'B_2',
'B3_B31': 'B3_1',
'B3_B32': 'B3_2',
'B3_B33_B331_B3311': '-',
'B3_B33_B332_B3321': '-',
'B3_B34_B341_B3411': '-',
'B3_B34_B342_B3421': '-',
'B3_B35_B351': 'B35_1',
'B3_B35_B352_B3521_B35241': '1',
'B3_B35_B352_B3521_B35242': '2',
'B3_B35_B352_B3521_B35243': '3',
'B3_B35_B353_B3531': 'B353_1',
'B3_B36_B361': 'B36_1',
'B3_B36_B362': 'B36_2',
'duplicate': [{'B3_B35_B352_B3521_B35241': '4',
'B3_B35_B352_B3521_B35242': '5',
'B3_B35_B352_B3521_B35243': '6'},
{'B3_B35_B352_B3521_B35241': '7',
'B3_B35_B352_B3521_B35242': '8',
'B3_B35_B352_B3521_B35243': '9',
'B3_B35_B353_B3532_B35321': 'B3532_3',
'B3_B35_B353_B3532_B35322': 'B3532_4'},
{'B3_B35_B353_B3532_B35321': 'B3532_5',
'B3_B35_B353_B3532_B35322': 'B3532_6'},
{'B3_B35_B353_B3532_B35321': 'B3532_1',
'B3_B35_B353_B3532_B35322': 'B3532_2'}]},
{'C1': 'B_1',
'C2': 'B_2',
'C3_C31': 'C3_1',
'C3_C32': 'C3_2',
'C3_C33_C331_C3311': '-',
'C3_C33_C332_C3321': '-'}]
现在,如果比较预期输出和实际输出,您会发现在重复的 XML 标记处键名不同。同样在重复列表中,XML 标签相互混淆。
我必须单独使用 _handle_duplicates
方法来处理重复的 XML 标签。
这是我现在使用的代码
class ParseXML:
    """
    Flatten each top-level XML section into a dictionary of leaf values.

    Keys are the tag names on the path from a section's child element down
    to a leaf, joined with ``_``; a leaf without text is stored as ``'-'``.
    Leaves whose key repeats inside a section are collected into a list of
    dictionaries stored under the single ``'duplicate'`` key.

    :param xml_input: XML document as a string
    """

    def __init__(self, xml_input):
        self.main_output = []
        # NOTE(review): ``recover=True`` is an lxml parser option, so ``et`` is
        # presumably ``lxml.etree`` - confirm against the file's imports.
        parser = et.XMLParser(recover=True)
        # Namespace declarations are stripped from the text before parsing.
        # Raw string: ``\s`` and ``\w`` are regex escapes, not string escapes.
        self.tree = et.fromstring(re.sub(r'\s*xmlns(:\w+)?="[^"]*"', '', xml_input), parser=parser)

    def parse_xml(self):
        """
        Build one flattened dictionary per child of the root element.

        The result list is rebuilt on every call, so calling this method
        twice does not append the same sections a second time.

        :return: list of flattened dictionaries, one per top-level section
        """
        self.main_output = []
        for interface in self.tree:
            temp_output = {}
            for children in interface:
                temp_list = []
                temp_dict = {}
                for key, value in self._flatten(children):
                    if key in temp_output:
                        # Seeing a repeated key again means a new repetition
                        # started: close the one collected so far.
                        if key in temp_dict:
                            temp_list.append(temp_dict)
                            temp_dict = {}
                        temp_dict[key] = value
                    else:
                        temp_output[key] = value
                if temp_dict:
                    self._handle_duplicates(temp_output, temp_dict, temp_list)
            # Appending temp_output directly also covers a section that has no
            # children at all, which yields an empty dictionary.
            self.main_output.append(temp_output)
        return self.main_output

    def _flatten(self, node, tags=None):
        """
        Yield ``(key, text)`` for every leaf element below ``node``.

        :param node: element to walk
        :param tags: tag names of the ancestors already visited
        :return: generator of ``(key, value)`` tuples
        """
        path = (tags or []) + [node.tag]
        children = list(node)
        if not children:
            yield '_'.join(path), ('-' if node.text is None else node.text)
            return
        for child in children:
            yield from self._flatten(child, path)

    def _handle_duplicates(self, temp_output, temp_dict, temp_list):
        """
        Move repeated keys out of ``temp_output`` into its ``'duplicate'`` list.

        :param temp_output: flattened section; modified in place
        :param temp_dict: the last repetition that was collected
        :param temp_list: earlier repetitions; modified in place
        :return: ``temp_output``
        """
        temp_list.append(temp_dict)
        # The first occurrence still sits in the flat output; move it into the
        # list as well (it ends up last, as in the output shown above).
        first_seen = {dup: temp_output.pop(dup) for dup in temp_dict}
        temp_list.append(first_seen)
        temp_output['duplicate'] = temp_list
        return temp_output
if __name__ == '__main__':
    # Parse the XML string held in ``data`` and pretty-print the result.
    parse = ParseXML(data)
    output = parse.parse_xml()
    pprint(output)
当前代码能够处理重复的 XML 标签,但不是我想要的格式。此外,最好在 _flatten
方法中处理这些重复的 XML 标记,而不是使用不同的 code/method 来处理。
有没有人可以帮忙看一下,并就如何处理重复的 XML 标签给我一些指导?
您可以将 collections.defaultdict
与递归一起使用:
import xml.etree.ElementTree as ET, re, json
from collections import defaultdict
# Strip the default-namespace declaration before parsing. The pattern is a raw
# string and accepts any quoted value, so URI-style namespaces are removed too.
t = ET.fromstring(re.sub(r'\sxmlns="[^"]*"', '', s_xml))
def get_groups(d, p=None):
    """
    Yield ``[key, value]`` pairs describing the leaves below element ``d``.

    A key is the ``_``-joined tag path with a leading all-uppercase tag
    (the section element itself) removed. A leaf yields its text, or ``'-'``
    when it has none. Sibling elements that share a tag are yielded once, as
    ``[path_of_that_tag, [dict_for_each_sibling, ...]]`` in document order,
    so repeated elements with different values are all kept.

    :param d: element to walk
    :param p: tag names of the ancestors already visited
    :return: generator of two-item lists suitable for ``dict()``
    """
    path = (p or []) + [d.tag]
    children = list(d)
    if not children:
        yield [re.sub(r'^[A-Z]+_', '', '_'.join(path)), '-' if d.text is None else d.text]
        return
    # Group siblings by tag; a tag that occurs more than once is a repetition.
    siblings = defaultdict(list)
    for child in children:
        siblings[child.tag].append(child)
    for tag, members in siblings.items():
        if len(members) == 1:
            yield from get_groups(members[0], path)
        else:
            key = re.sub(r'^[A-Z]+_', '', '_'.join(path + [tag]))
            yield [key, [dict(get_groups(member, path)) for member in members]]
# One flattened dictionary per child element of the parsed root.
r = [dict(get_groups(section)) for section in t]
输出:
[{'B1': 'B_1', 'B2': 'B_2', 'B3_B31': 'B3_1', 'B3_B32': 'B3_2', 'B3_B33_B331_B3311': '-', 'B3_B33_B332_B3321': '-', 'B3_B34_B341_B3411': '-', 'B3_B34_B342_B3421': '-', 'B3_B35_B351': 'B35_1', 'B3_B35': [{'B3_B35_B352_B3521': 'B352_1', 'B3_B35_B352_B3522': 'B352_2', 'B3_B35_B352_B3523': 'B352_3', 'B3_B35_B352_B3524_B35241_B352411': '-', 'B3_B35_B352_B3524_B35241_B352412': '-', 'B3_B35_B352_B3524_B35241_B352413': '-'}, {'B3_B35_B352_B3521': 'B352_1', 'B3_B35_B352_B3522': 'B352_2', 'B3_B35_B352_B3523': 'B352_3', 'B3_B35_B352_B3524_B35241_B352411': '-', 'B3_B35_B352_B3524_B35241_B352412': '-', 'B3_B35_B352_B3524_B35241_B352413': '-'}, {'B3_B35_B352_B3521': 'B352_1', 'B3_B35_B352_B3522': 'B352_2', 'B3_B35_B352_B3523': 'B352_3', 'B3_B35_B352_B3524_B35241_B352411': '-', 'B3_B35_B352_B3524_B35241_B352412': '-', 'B3_B35_B352_B3524_B35241_B352413': '-'}], 'B3_B36_B361': 'B36_1', 'B3_B36_B362': 'B36_2'}, {'C1': 'B_1', 'C2': 'B_2', 'C3_C31': 'C3_1', 'C3_C32': 'C3_2', 'C3_C33_C331_C3311': '-', 'C3_C33_C332_C3321': '-'}]
不知何故,我设法实现了我所需要的。但是,我并不为这段代码感到自豪。如果有人为我提供比这更好、更 pythonic 的代码,我会非常高兴。
class ParseXML:
    """
    Flatten an XML document into one dictionary per top-level section.

    Every leaf element becomes a ``key: text`` entry whose key is the
    ``_``-joined tag path below the section element; a leaf without text is
    stored as ``'-'``. Leaves whose key repeats are gathered into a list of
    dictionaries stored under the tag path the repeated keys have in common.

    :param xml_input: XML document as a string
    """

    def __init__(self, xml_input):
        self.main_output = []
        # Default-namespace declarations are stripped from the text before
        # parsing. ``[^"]*`` stops at the closing quote of the declaration.
        self.tree = et.fromstring(re.sub(r'\s*xmlns="[^"]*"', '', xml_input))

    def parse_xml(self):
        """
        Build one flattened dictionary per child of the root element.

        The result list is rebuilt on every call, so calling this method
        twice does not append the same sections a second time.

        :return: list of flattened dictionaries, one per top-level section
        """
        self.main_output = []
        for interface in self.tree:
            temp_output = {}
            for children in interface:
                temp_dup = {}
                dup_keys = []
                dup_child = []
                for key, value in self._flatten(children):
                    if key in temp_output:
                        # A key already collected for the current repetition
                        # marks the start of the next one.
                        if key in dup_keys:
                            dup_child.append(temp_dup)
                            temp_dup = {}
                            dup_keys = []
                        dup_keys.append(key)
                        temp_dup[key] = value
                    else:
                        # A new key ends a run of repetitions: merge the run
                        # before storing the new value.
                        if temp_dup:
                            temp_output = self._merge_duplicates(dup_keys, temp_output, temp_dup, dup_child)
                            dup_child = []
                            dup_keys = []
                            temp_dup = {}
                        temp_output[key] = value
                # The run of repetitions may be the last thing in this child.
                if temp_dup:
                    temp_output = self._merge_duplicates(dup_keys, temp_output, temp_dup, dup_child)
            self.main_output.append(temp_output)
        return self.main_output

    @staticmethod
    def common_substr(data):
        """
        Return the tag path shared by all keys in ``data``.

        Keys are compared one ``_``-separated component at a time and the
        comparison stops at the first component that differs, so the result
        is always a prefix made of whole components.

        :param data: flattened keys of a repeated XML element
        :return: common leading tag path, or ``''`` when ``data`` is empty
        """
        if not data:
            return ''
        shared = []
        for parts in zip(*(key.split('_') for key in data)):
            if len(set(parts)) != 1:
                break
            shared.append(parts[0])
        return '_'.join(shared)

    def _flatten(self, node, tags=None):
        """
        Yield ``(key, text)`` for every leaf element below ``node``.

        :param node: element to walk
        :param tags: tag names of the ancestors already visited
        :return: generator of ``(key, value)`` tuples
        """
        path = (tags or []) + [node.tag]
        children = list(node)
        if not children:
            # A leaf without text is reported as '-'
            yield '_'.join(path), ('-' if node.text is None else node.text)
            return
        for child in children:
            yield from self._flatten(child, path)

    def _merge_duplicates(self, dup_keys, temp_output, temp_dup, dup_child):
        """
        Replace repeated flat keys with a list of dictionaries.

        The list is stored under the common tag path of ``dup_keys`` and keeps
        document order: first occurrence, middle occurrences, last occurrence.

        :param dup_keys: keys of the last repetition
        :param temp_output: flattened section; modified in place
        :param temp_dup: key/value pairs of the last repetition
        :param dup_child: dictionaries of the repetitions between first and last
        :return: ``temp_output``
        """
        common_key = self.common_substr(dup_keys)
        # Pop the first occurrence before writing the merged list: the common
        # key can be equal to a repeated key (a single repeated leaf).
        first_seen = {dup_key: temp_output.pop(dup_key) for dup_key in dup_keys}
        temp_output[common_key] = [first_seen, *dup_child, temp_dup]
        return temp_output
if __name__ == '__main__':
    # Parse the XML string held in ``DATA`` and pretty-print the result.
    parse = ParseXML(DATA)
    result = parse.parse_xml()
    pprint(result)
P.S. _flatten
方法由 @Ajax1234 提供。