使用路径和值从 xml 文件创建数据框
Create a dataframe from an XML file with the paths and the values
这是来自 xml 文件的数据,
<SOAP-ENV:Envelope xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/">
<SOAP-ENV:Header />
<SOAP-ENV:Body>
<ADD_LandIndex_001>
<CNTROLAREA>
<BSR>
<status>ADD</status>
<NOUN>LandIndex</NOUN>
<REVISION>001</REVISION>
</BSR>
</CNTROLAREA>
<DATAAREA>
<LandIndex>
<reportId>AMI100031</reportId>
<requestKey>R3278458</requestKey>
<SubmittedBy>EN4871</SubmittedBy>
<submittedOn>2015/01/06 4:20:11 PM</submittedOn>
<LandIndex>
<agreementdetail>
<agreementid>001 4860</agreementid>
<agreementtype>NATURAL GAS</agreementtype>
<currentstatus>
<status>ACTIVE</status>
<statuseffectivedate>1965/02/18</statuseffectivedate>
<termdate>1965/02/18</termdate>
</currentstatus>
<designatedrepresentative></designatedrepresentative>
</agreementdetail>
</LandIndex>
</LandIndex>
</DATAAREA>
</ADD_LandIndex_001>
</SOAP-ENV:Body>
</SOAP-ENV:Envelope>
我想在数据框中保存:1) 路径,2) 该路径对应元素的文本,并且只保存包含值的元素。所以我想要这样的结果:
Path Value
0 Body/ADD_LandIndex_001/CNTROLAREA/BSR/status ADD
1 Body/ADD_LandIndex_001/CNTROLAREA/BSR/NOUN LandIndex
2 Body/ADD_LandIndex_001/CNTROLAREA/BSR/REVISION 001
我的这段小代码不起作用!它返回的是一个空数据框,但是通过函数循环中的 print(d)
我可以看到它正确地获取了每个元素。我真的不明白问题出在哪里。有谁能找出它为什么是空的、为什么不起作用吗?
from lxml import etree as et
from collections import defaultdict
import pandas as pd
import os
# Input file and the SOAP envelope namespace in "{uri}" form; traverse()
# strips this prefix from every element path.
filename = 'file_try.xml'
namespace = '{http://schemas.xmlsoap.org/soap/envelope/}'
# NOTE(review): indentation was lost in this paste - the parse line below is
# presumably the body of the with-block.
with open(filename, 'rb') as file:
root = et.parse(file).getroot()
# Wrap the root in an ElementTree; traverse() calls tree.getelementpath() on it.
tree = et.ElementTree(root)
# Column names and the initially empty DataFrame handed to traverse() as its
# accumulator argument.
col_name = ['Path', 'Value']
dataF = pd.DataFrame([],columns = col_name)
# Recursively walk el: descend into elements that have children, and for a
# childless element with text try to add a {'Path', 'Value'} row to d.
# NOTE(review): indentation was lost in this paste; the nesting is presumably
# def > if/else > for-loop / text check.
def traverse(el,d):
if len(list(el)) > 0:
for child in el:
# The recursive call's return value is dropped here, so rows added further
# down the tree never reach this level.
traverse(child,d)
else:
if el.text is not None:
# The result of d.append(...) is bound only to the local name d; the frame
# object the caller passed in is left as it was. (DataFrame.append was also
# removed in pandas 2.0 - pandas.concat is the replacement.)
d = d.append({'Path': tree.getelementpath(el).replace(namespace,''), 'Value' : el.text }, ignore_index = True)
# Debug output: the local frame right after the append.
print(d)
return d
# Top-level call: df is whatever traverse() returns for the root element.
df = traverse(root,dataF)
print(df)
# Write the resulting frame to an Excel workbook.
df.to_excel("data_2.xlsx")
试试这个。
from simplified_scrapy import SimplifiedDoc, utils
# Output table: the header row comes first, data rows are appended by
# getPathValue().
rows = [['Path', 'Value']]
# Load the XML text and select the SOAP body element as the starting point.
xml = utils.getFileContent('file_try.xml')
doc = SimplifiedDoc(xml)
body = doc.select('SOAP-ENV:Body')
def getPathValue(node, path):
    """Record the text of a childless node, or descend into its children.

    ``path`` is the slash-joined path of the parent; this node's tag is
    appended to it before the node is recorded in ``rows`` or its children
    are handed to ``traverseNodes``.
    """
    full_path = '/'.join((path, node['tag']))  # extend the parent path
    kids = node.children
    if not kids:
        rows.append([full_path, node.text])
        return
    traverseNodes(kids, full_path)
def traverseNodes(nodes, path):
    """Apply getPathValue() to each node in *nodes* under the same parent path."""
    for child in nodes:
        getPathValue(child, path)
# Start the walk at the children of the selected Body node, using "Body" as
# the path prefix.
traverseNodes(body.children, "Body")
# print(rows)  # uncomment to inspect the collected rows
# Write the header row plus the collected rows to data_2.csv.
utils.save2csv('data_2.csv', rows)
结果:
[['Body/ADD_LandIndex_001/CNTROLAREA/BSR/status', 'ADD'], ['Body/ADD_LandIndex_001/CNTROLAREA/BSR/NOUN', 'LandIndex'], ['Body/ADD_LandIndex_001/CNTROLAREA/BSR/REVISION', '001'], ['Body/ADD_LandIndex_001/DATAAREA/LandIndex/reportId', 'AMI100031'], ['Body/ADD_LandIndex_001/DATAAREA/LandIndex/requestKey', 'R3278458'],
...
看了@yazz 的回答才发现我的错误。
代码如下:
from lxml import etree as et
import pandas as pd
import os
# Input file and the SOAP envelope namespace in "{uri}" form; traverse()
# strips this prefix from every element path.
filename = 'file_try.xml'
namespace = '{http://schemas.xmlsoap.org/soap/envelope/}'

# Parse inside the with-block so the file handle is closed right after parsing.
with open(filename, 'rb') as file:
    root = et.parse(file).getroot()

# traverse() calls tree.getelementpath() to get each element's path.
tree = et.ElementTree(root)

# DataFrame column names and the list that traverse() fills with rows.
col_name = ['Path', 'Value']
data = []
def traverse(el, d):
    """Collect ``[path, text]`` rows for every leaf element under *el*.

    Elements that have children are descended into; a childless element
    whose text is not ``None`` contributes one row. The path comes from
    ``tree.getelementpath`` with the SOAP namespace prefix removed. If the
    element carries a ``Ccy`` attribute, the string form of that XPath
    result is appended to the path; elements without it get the bare path.

    Args:
        el: lxml element to walk.
        d: list that rows are appended to in place.

    Returns:
        The same list object ``d``, so the top-level call can be passed
        straight to ``pd.DataFrame``.
    """
    if len(el):
        for child in el:
            traverse(child, d)
    elif el.text is not None:
        path = tree.getelementpath(el)
        ccy = el.xpath('@Ccy')
        # An absent attribute yields an empty list; appending str([]) would
        # tack a literal "[]" onto every path.
        if ccy:
            path += str(ccy)
        d.append([path.replace(namespace, ''), el.text])
    return d
# traverse() fills and returns the very list it is given, so collect the rows
# first and then build the frame from that list.
collected = traverse(root, data)
df = pd.DataFrame(collected, columns=col_name)
# Write the frame to an Excel workbook.
df.to_excel("data_2.xlsx")
这是来自 xml 文件的数据,
<SOAP-ENV:Envelope xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/">
<SOAP-ENV:Header />
<SOAP-ENV:Body>
<ADD_LandIndex_001>
<CNTROLAREA>
<BSR>
<status>ADD</status>
<NOUN>LandIndex</NOUN>
<REVISION>001</REVISION>
</BSR>
</CNTROLAREA>
<DATAAREA>
<LandIndex>
<reportId>AMI100031</reportId>
<requestKey>R3278458</requestKey>
<SubmittedBy>EN4871</SubmittedBy>
<submittedOn>2015/01/06 4:20:11 PM</submittedOn>
<LandIndex>
<agreementdetail>
<agreementid>001 4860</agreementid>
<agreementtype>NATURAL GAS</agreementtype>
<currentstatus>
<status>ACTIVE</status>
<statuseffectivedate>1965/02/18</statuseffectivedate>
<termdate>1965/02/18</termdate>
</currentstatus>
<designatedrepresentative></designatedrepresentative>
</agreementdetail>
</LandIndex>
</LandIndex>
</DATAAREA>
</ADD_LandIndex_001>
</SOAP-ENV:Body>
</SOAP-ENV:Envelope>
我想在数据框中保存:1) 路径,2) 该路径对应元素的文本,并且只保存包含值的元素。所以我想要这样的结果:
Path Value
0 Body/ADD_LandIndex_001/CNTROLAREA/BSR/status ADD
1 Body/ADD_LandIndex_001/CNTROLAREA/BSR/NOUN LandIndex
2 Body/ADD_LandIndex_001/CNTROLAREA/BSR/REVISION 001
我的这段小代码不起作用!它返回的是一个空数据框,但是通过函数循环中的 print(d)
我可以看到它正确地获取了每个元素。我真的不明白问题出在哪里。有谁能找出它为什么是空的、为什么不起作用吗?
from lxml import etree as et
from collections import defaultdict
import pandas as pd
import os
# Input file and the SOAP envelope namespace in "{uri}" form; traverse()
# strips this prefix from every element path.
filename = 'file_try.xml'
namespace = '{http://schemas.xmlsoap.org/soap/envelope/}'
# NOTE(review): indentation was lost in this paste - the parse line below is
# presumably the body of the with-block.
with open(filename, 'rb') as file:
root = et.parse(file).getroot()
# Wrap the root in an ElementTree; traverse() calls tree.getelementpath() on it.
tree = et.ElementTree(root)
# Column names and the initially empty DataFrame handed to traverse() as its
# accumulator argument.
col_name = ['Path', 'Value']
dataF = pd.DataFrame([],columns = col_name)
# Recursively walk el: descend into elements that have children, and for a
# childless element with text try to add a {'Path', 'Value'} row to d.
# NOTE(review): indentation was lost in this paste; the nesting is presumably
# def > if/else > for-loop / text check.
def traverse(el,d):
if len(list(el)) > 0:
for child in el:
# The recursive call's return value is dropped here, so rows added further
# down the tree never reach this level.
traverse(child,d)
else:
if el.text is not None:
# The result of d.append(...) is bound only to the local name d; the frame
# object the caller passed in is left as it was. (DataFrame.append was also
# removed in pandas 2.0 - pandas.concat is the replacement.)
d = d.append({'Path': tree.getelementpath(el).replace(namespace,''), 'Value' : el.text }, ignore_index = True)
# Debug output: the local frame right after the append.
print(d)
return d
# Top-level call: df is whatever traverse() returns for the root element.
df = traverse(root,dataF)
print(df)
# Write the resulting frame to an Excel workbook.
df.to_excel("data_2.xlsx")
试试这个。
from simplified_scrapy import SimplifiedDoc, utils
# Output table: the header row comes first, data rows are appended by
# getPathValue().
rows = [['Path', 'Value']]
# Load the XML text and select the SOAP body element as the starting point.
xml = utils.getFileContent('file_try.xml')
doc = SimplifiedDoc(xml)
body = doc.select('SOAP-ENV:Body')
def getPathValue(node, path):
    """Record the text of a childless node, or descend into its children.

    ``path`` is the slash-joined path of the parent; this node's tag is
    appended to it before the node is recorded in ``rows`` or its children
    are handed to ``traverseNodes``.
    """
    full_path = '/'.join((path, node['tag']))  # extend the parent path
    kids = node.children
    if not kids:
        rows.append([full_path, node.text])
        return
    traverseNodes(kids, full_path)
def traverseNodes(nodes, path):
    """Apply getPathValue() to each node in *nodes* under the same parent path."""
    for child in nodes:
        getPathValue(child, path)
# Start the walk at the children of the selected Body node, using "Body" as
# the path prefix.
traverseNodes(body.children, "Body")
# print(rows)  # uncomment to inspect the collected rows
# Write the header row plus the collected rows to data_2.csv.
utils.save2csv('data_2.csv', rows)
结果:
[['Body/ADD_LandIndex_001/CNTROLAREA/BSR/status', 'ADD'], ['Body/ADD_LandIndex_001/CNTROLAREA/BSR/NOUN', 'LandIndex'], ['Body/ADD_LandIndex_001/CNTROLAREA/BSR/REVISION', '001'], ['Body/ADD_LandIndex_001/DATAAREA/LandIndex/reportId', 'AMI100031'], ['Body/ADD_LandIndex_001/DATAAREA/LandIndex/requestKey', 'R3278458'],
...
看了@yazz 的回答才发现我的错误。
代码如下:
from lxml import etree as et
import pandas as pd
import os
# Input file and the SOAP envelope namespace in "{uri}" form; traverse()
# strips this prefix from every element path.
filename = 'file_try.xml'
namespace = '{http://schemas.xmlsoap.org/soap/envelope/}'

# Parse inside the with-block so the file handle is closed right after parsing.
with open(filename, 'rb') as file:
    root = et.parse(file).getroot()

# traverse() calls tree.getelementpath() to get each element's path.
tree = et.ElementTree(root)

# DataFrame column names and the list that traverse() fills with rows.
col_name = ['Path', 'Value']
data = []
def traverse(el, d):
    """Collect ``[path, text]`` rows for every leaf element under *el*.

    Elements that have children are descended into; a childless element
    whose text is not ``None`` contributes one row. The path comes from
    ``tree.getelementpath`` with the SOAP namespace prefix removed. If the
    element carries a ``Ccy`` attribute, the string form of that XPath
    result is appended to the path; elements without it get the bare path.

    Args:
        el: lxml element to walk.
        d: list that rows are appended to in place.

    Returns:
        The same list object ``d``, so the top-level call can be passed
        straight to ``pd.DataFrame``.
    """
    if len(el):
        for child in el:
            traverse(child, d)
    elif el.text is not None:
        path = tree.getelementpath(el)
        ccy = el.xpath('@Ccy')
        # An absent attribute yields an empty list; appending str([]) would
        # tack a literal "[]" onto every path.
        if ccy:
            path += str(ccy)
        d.append([path.replace(namespace, ''), el.text])
    return d
# traverse() fills and returns the very list it is given, so collect the rows
# first and then build the frame from that list.
collected = traverse(root, data)
df = pd.DataFrame(collected, columns=col_name)
# Write the frame to an Excel workbook.
df.to_excel("data_2.xlsx")