如何将 XML 文件转换为 pandas 数据帧?
How to convert an XML file to pandas dataframe?
我无法把 XML 转换成 Python 数据帧
你能帮我把 XML 解析成 Python 数据帧吗?
我似乎没办法让它正常工作
这是我目前的进展:
import xmltodict
import pandas as pd
import requests
from bs4 import BeautifulSoup
def get_xml():
    """Request the instantaneous flow data from the National Grid SOAP service.

    Posts a SOAP 1.2 ``GetInstantaneousFlowData`` envelope to the web service
    and returns the body of the reply.

    Returns:
        bytes: The raw, undecoded response payload (``response.content``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
        requests.exceptions.HTTPError: If the server replies with a 4xx/5xx status.
    """
    url="http://energywatch.natgrid.co.uk/EDP-PublicUI/PublicPI/InstantaneousFlowWebService.asmx"
    headers = {'content-type': 'application/soap+xml; charset=utf-8'}
    body ="""<soap12:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap12="http://www.w3.org/2003/05/soap-envelope">
<soap12:Body>
<GetInstantaneousFlowData xmlns="http://www.NationalGrid.com/EDP/UI/" />
</soap12:Body>
</soap12:Envelope>"""
    # Without an explicit timeout, requests waits forever on a silent server.
    response = requests.post(url, data=body, headers=headers, timeout=30)
    # Fail loudly on an error status instead of handing an error page to the parser.
    response.raise_for_status()
    return response.content
# Download the SOAP reply and parse it; tags are searched by lower-case name.
response = get_xml()
soup = BeautifulSoup(response, 'lxml')

# Text of every <edpobjectname> tag, wrapped in a one-column DataFrame.
table_columns = pd.DataFrame(
    [tag.text for tag in soup.find_all(['EDPObjectName'.lower()])]
)

# Text of every <applicableat> tag, with repeated values dropped.
table_rows = [tag.text for tag in soup.find_all(['applicableat'])]
df1 = pd.DataFrame(table_rows).drop_duplicates()
# Disabled attempt at converting the timestamps: df1 = pd.to_datetime(df1)

# Text of every <flowrate> tag, as one long single-column DataFrame.
table = [tag.text for tag in soup.find_all(['flowrate'])]
df = pd.DataFrame(table)

# NOTE(review): presumably the step that does not give the wanted table --
# the flat flowrate column is re-labelled with DataFrames passed as
# columns/index instead of being reshaped; confirm.
df_final = pd.DataFrame(df, columns=table_columns, index=df1)
这是我要找的结果:
ALDBROUGH AVONMOUTH BACTON BBL …
2019-08-08T13:00:00 0 1.23 5.1 …
2019-08-08T13:02:00 0 1.23 5.1 …
2019-08-08T13:04:00 0 3.23 5.1 …
2019-08-08T13:06:00 0 3.23 5.1 …
2019-08-08T13:08:00 0 3.23 5.23 …
2019-08-08T13:10:00 0 4.23 5.204 …
这个问题与其他 XML 解析问题非常相似:您有一个分层的数据结构,需要将其展平。我的方案把时间戳、位置和流量各作为一列,并让每条记录成为一行。我还遵循了简约原则:在解析 XML 时就直接把数据整理成最容易转换为数据帧的扁平格式。变量 'data' 是一个字典,每列对应一个键;每个键的值是一个列表,列表中的位置表示该值属于哪一行:
import pandas as pd
import requests
from bs4 import BeautifulSoup
def get_xml():
    """Request the instantaneous flow data from the National Grid SOAP service.

    Posts a SOAP 1.2 ``GetInstantaneousFlowData`` envelope to the web service
    and returns the body of the reply.

    Returns:
        bytes: The raw, undecoded response payload (``response.content``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
        requests.exceptions.HTTPError: If the server replies with a 4xx/5xx status.
    """
    url="http://energywatch.natgrid.co.uk/EDP-PublicUI/PublicPI/InstantaneousFlowWebService.asmx"
    headers = {'content-type': 'application/soap+xml; charset=utf-8'}
    body ="""<soap12:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap12="http://www.w3.org/2003/05/soap-envelope"><soap12:Body><GetInstantaneousFlowData xmlns="http://www.NationalGrid.com/EDP/UI/" /></soap12:Body></soap12:Envelope>"""
    # Without an explicit timeout, requests waits forever on a silent server.
    response = requests.post(url, data=body, headers=headers, timeout=30)
    # Fail loudly on an error status instead of handing an error page to the parser.
    response.raise_for_status()
    return response.content
# Download the SOAP reply and parse it; tags are searched by lower-case name.
response = get_xml()
soup = BeautifulSoup(response, 'lxml')

# One list per output column; position i in each list belongs to row i.
timestamps, places, flowrates = [], [], []

# Each <edpobjectbe> groups one named place with its readings.
for site in soup.find_all('edpobjectbe'):
    site_name = site.find('edpobjectname').text
    # Each <edpenergydatabe> holds one timestamp together with its flow rate,
    # so reading both from the same element keeps them paired.
    for reading in site.find_all('edpenergydatabe'):
        places.append(site_name)
        timestamps.append(reading.find('applicableat').text)
        flowrates.append(reading.find('flowrate').text)

# Long-format table: one row per (timestamp, place) reading.
data = {'timestamp': timestamps, 'place': places, 'flowrate': flowrates}
df = pd.DataFrame(data)
df
请注意,我是在父元素 'edpenergydatabe' 上执行 find_all() 的,因此可以确定每个时间戳与同一条记录中的流量值相对应。如果您想要不同的行列排列方式,现在可以使用 pandas 函数来实现,例如 transpose();若要得到以时间戳为行、以位置为列的表格,可以使用 df.pivot(index='timestamp', columns='place', values='flowrate')。希望这能帮助您走上正轨!
尝试使用:
from bs4 import BeautifulSoup
import pandas as pd
# Accumulates the wide table: a 'time' column plus one column per site.
name_list = []
prev_df = pd.DataFrame(columns=['time'])
response = BeautifulSoup(get_xml(), 'lxml')

# Each <edpobjectbe> groups one named site with its readings.
for x in response.find_all('edpobjectbe'):
    name = str(x.find('edpobjectname').text).strip()
    name_list.append(name)
    data = x.find_all('edpenergydatabe')
    print(name)
    # One [time, flowrate] pair per reading, both taken from the same element.
    list_small = [
        [str(y.find('applicableat').text).strip(), str(y.find('flowrate').text).strip()]
        for y in data
    ]
    df = pd.DataFrame(list_small, columns=['time', name])
    # Outer join so no timestamp is lost: a right join would drop every time
    # missing from the current site, and a site without readings would empty
    # the whole accumulated table.
    prev_df = prev_df.merge(df, how='outer', on='time')
print(prev_df)
检查这是否适合你!!!
我无法把 XML 转换成 Python 数据帧
你能帮我把 XML 解析成 Python 数据帧吗?我似乎没办法让它正常工作。这是我目前的进展:
import xmltodict
import pandas as pd
import requests
from bs4 import BeautifulSoup
def get_xml():
    """Request the instantaneous flow data from the National Grid SOAP service.

    Posts a SOAP 1.2 ``GetInstantaneousFlowData`` envelope to the web service
    and returns the body of the reply.

    Returns:
        bytes: The raw, undecoded response payload (``response.content``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
        requests.exceptions.HTTPError: If the server replies with a 4xx/5xx status.
    """
    url="http://energywatch.natgrid.co.uk/EDP-PublicUI/PublicPI/InstantaneousFlowWebService.asmx"
    headers = {'content-type': 'application/soap+xml; charset=utf-8'}
    body ="""<soap12:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap12="http://www.w3.org/2003/05/soap-envelope">
<soap12:Body>
<GetInstantaneousFlowData xmlns="http://www.NationalGrid.com/EDP/UI/" />
</soap12:Body>
</soap12:Envelope>"""
    # Without an explicit timeout, requests waits forever on a silent server.
    response = requests.post(url, data=body, headers=headers, timeout=30)
    # Fail loudly on an error status instead of handing an error page to the parser.
    response.raise_for_status()
    return response.content
# Download the SOAP reply and parse it; tags are searched by lower-case name.
response = get_xml()
soup = BeautifulSoup(response, 'lxml')

# Text of every <edpobjectname> tag, wrapped in a one-column DataFrame.
table_columns = pd.DataFrame(
    [tag.text for tag in soup.find_all(['EDPObjectName'.lower()])]
)

# Text of every <applicableat> tag, with repeated values dropped.
table_rows = [tag.text for tag in soup.find_all(['applicableat'])]
df1 = pd.DataFrame(table_rows).drop_duplicates()
# Disabled attempt at converting the timestamps: df1 = pd.to_datetime(df1)

# Text of every <flowrate> tag, as one long single-column DataFrame.
table = [tag.text for tag in soup.find_all(['flowrate'])]
df = pd.DataFrame(table)

# NOTE(review): presumably the step that does not give the wanted table --
# the flat flowrate column is re-labelled with DataFrames passed as
# columns/index instead of being reshaped; confirm.
df_final = pd.DataFrame(df, columns=table_columns, index=df1)
这是我要找的结果:
ALDBROUGH AVONMOUTH BACTON BBL …
2019-08-08T13:00:00 0 1.23 5.1 …
2019-08-08T13:02:00 0 1.23 5.1 …
2019-08-08T13:04:00 0 3.23 5.1 …
2019-08-08T13:06:00 0 3.23 5.1 …
2019-08-08T13:08:00 0 3.23 5.23 …
2019-08-08T13:10:00 0 4.23 5.204 …
这个问题与其他 XML 解析问题非常相似:您有一个分层的数据结构,需要将其展平。我的方案把时间戳、位置和流量各作为一列,并让每条记录成为一行。我还遵循了简约原则:在解析 XML 时就直接把数据整理成最容易转换为数据帧的扁平格式。变量 'data' 是一个字典,每列对应一个键;每个键的值是一个列表,列表中的位置表示该值属于哪一行:
import pandas as pd
import requests
from bs4 import BeautifulSoup
def get_xml():
    """Request the instantaneous flow data from the National Grid SOAP service.

    Posts a SOAP 1.2 ``GetInstantaneousFlowData`` envelope to the web service
    and returns the body of the reply.

    Returns:
        bytes: The raw, undecoded response payload (``response.content``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
        requests.exceptions.HTTPError: If the server replies with a 4xx/5xx status.
    """
    url="http://energywatch.natgrid.co.uk/EDP-PublicUI/PublicPI/InstantaneousFlowWebService.asmx"
    headers = {'content-type': 'application/soap+xml; charset=utf-8'}
    body ="""<soap12:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap12="http://www.w3.org/2003/05/soap-envelope"><soap12:Body><GetInstantaneousFlowData xmlns="http://www.NationalGrid.com/EDP/UI/" /></soap12:Body></soap12:Envelope>"""
    # Without an explicit timeout, requests waits forever on a silent server.
    response = requests.post(url, data=body, headers=headers, timeout=30)
    # Fail loudly on an error status instead of handing an error page to the parser.
    response.raise_for_status()
    return response.content
# Download the SOAP reply and parse it; tags are searched by lower-case name.
response = get_xml()
soup = BeautifulSoup(response, 'lxml')

# One list per output column; position i in each list belongs to row i.
timestamps, places, flowrates = [], [], []

# Each <edpobjectbe> groups one named place with its readings.
for site in soup.find_all('edpobjectbe'):
    site_name = site.find('edpobjectname').text
    # Each <edpenergydatabe> holds one timestamp together with its flow rate,
    # so reading both from the same element keeps them paired.
    for reading in site.find_all('edpenergydatabe'):
        places.append(site_name)
        timestamps.append(reading.find('applicableat').text)
        flowrates.append(reading.find('flowrate').text)

# Long-format table: one row per (timestamp, place) reading.
data = {'timestamp': timestamps, 'place': places, 'flowrate': flowrates}
df = pd.DataFrame(data)
df
请注意,我是在父元素 'edpenergydatabe' 上执行 find_all() 的,因此可以确定每个时间戳与同一条记录中的流量值相对应。如果您想要不同的行列排列方式,现在可以使用 pandas 函数来实现,例如 transpose();若要得到以时间戳为行、以位置为列的表格,可以使用 df.pivot(index='timestamp', columns='place', values='flowrate')。希望这能帮助您走上正轨!
尝试使用:
from bs4 import BeautifulSoup
import pandas as pd
# Accumulates the wide table: a 'time' column plus one column per site.
name_list = []
prev_df = pd.DataFrame(columns=['time'])
response = BeautifulSoup(get_xml(), 'lxml')

# Each <edpobjectbe> groups one named site with its readings.
for x in response.find_all('edpobjectbe'):
    name = str(x.find('edpobjectname').text).strip()
    name_list.append(name)
    data = x.find_all('edpenergydatabe')
    print(name)
    # One [time, flowrate] pair per reading, both taken from the same element.
    list_small = [
        [str(y.find('applicableat').text).strip(), str(y.find('flowrate').text).strip()]
        for y in data
    ]
    df = pd.DataFrame(list_small, columns=['time', name])
    # Outer join so no timestamp is lost: a right join would drop every time
    # missing from the current site, and a site without readings would empty
    # the whole accumulated table.
    prev_df = prev_df.merge(df, how='outer', on='time')
print(prev_df)
检查这是否适合你!!!