XML 导入 DataFrame Python
XML import to DataFrame Python
我有 XML 文件,如下所示:
我正在尝试将它转换为 Python 中的 DF,使用这个:
import pandas as pd
import xml.etree.ElementTree as et
def parse_XML(xml_file, df_cols):
"""Parse the input XML file and store the result in a pandas
DataFrame with the given columns.
The first element of df_cols is supposed to be the identifier
variable, which is an attribute of each node element in the
XML data; other features will be parsed from the text content
of each sub-element.
"""
xtree = et.parse(xml_file)
xroot = xtree.getroot()
rows = []
for node in xroot:
res = []
res.append(node.attrib.get(df_cols[0]))
for el in df_cols[1:]:
if node is not None and node.find(el) is not None:
res.append(node.find(el).text)
else:
res.append(None)
rows.append({df_cols[i]: res[i]
for i, _ in enumerate(df_cols)})
out_df = pd.DataFrame(rows, columns=df_cols)
return out_df
我正在使用这样的函数:
parse_XML(R'C:\Users\aleks\Desktop\zadania python+sql\import_xml.xml', ['Year', 'Row', 'New', 'Used', 'Total Sales New', 'Total Sales Used'])
但结果而不是完全导入的 XML 文件是这样的:
如何让它工作,以便将完整的 XML 文件导入 DataFrame?谢谢。
由于您的 XML 相对平坦,请考虑将 merging dictionaries 的元素和属性数据传递到 pandas.DataFrame
构造函数中。具体来说,字典是通过 list/dict 理解标签和 row
元素和 attrib
字典的文本解析得出的。
xtree = et.parse(xml_file)
# LIST OF DICTIONARIES
data = [{ **{r.tag:r.text for r in row.findall('*')}, **row.attrib
} for row in xtree.findall('.//row/row')]
out_df = pd.DataFrame(data)
从OpenData Michigan MVA Vehicle Sales Counts(类似于OP的图像数据)进行演示:
import pandas as pd
import xml.etree.ElementTree as et
from urllib import request
url = "https://opendata.maryland.gov/api/views/un65-7ipd/rows.xml?accessType=DOWNLOAD"
rq = request.urlopen(url)
xtree = et.fromstring(rq.read())
# LIST OF DICTIONARIES
data = [{ **{r.tag:r.text for r in row.findall('*')}, **row.attrib
} for row in xtree.findall('.//row/row')]
out_df = pd.DataFrame(data)
输出
out_df
# year month new used total_sales_new total_sales_used _id _uuid _position _address
#0 2002 JAN 31106 49927 755015820 386481929 row-a49t-ihm3.rpap 00000000-0000-0000-05D6-978893D301BF 0 https://opendata.maryland.gov/resource/un65-7i...
#1 2002 FEB 27520 50982 664454223 361353242 row-njtn_crc8_qawh 00000000-0000-0000-3211-E4A718B8A756 0 https://opendata.maryland.gov/resource/un65-7i...
#2 2002 MAR 34225 58794 805666244 419385387 row-tn8d-5xax-gcqz 00000000-0000-0000-9FC0-AB50DEE551EE 0 https://opendata.maryland.gov/resource/un65-7i...
#3 2002 APR 36452 59817 846368297 433061150 row-bjnm-jnr7.7rbw 00000000-0000-0000-FE23-E2E416AC13DB 0 https://opendata.maryland.gov/resource/un65-7i...
#4 2002 MAY 37359 60577 855005784 442569410 row-5wkh~fkp3.mzuj 00000000-0000-0000-CBC6-6E4FF3707E07 0 https://opendata.maryland.gov/resource/un65-7i...
#.. ... ... ... ... ... ... ... ... ... ...
#221 2020 JUN 23704 63395 821297823 699671992 row-m9ah.mpqr.7eww 00000000-0000-0000-3599-5C2EEABDA225 0 https://opendata.maryland.gov/resource/un65-7i...
#222 2020 JUL 22748 65652 784818757 608761285 row-79ht-rw2j~j4vy 00000000-0000-0000-4E88-6D5061A28AC5 0 https://opendata.maryland.gov/resource/un65-7i...
#223 2020 AUG 12640 48594 456451505 329492612 row-yh7w-48wu_vpnx 00000000-0000-0000-6ED5-CA4A564695EA 0 https://opendata.maryland.gov/resource/un65-7i...
#224 2020 SEP 11336 42148 395164402 274497385 row-e2ur-7zck~4rdd 00000000-0000-0000-C640-6EE6DAA0F07C 0 https://opendata.maryland.gov/resource/un65-7i...
#225 2020 OCT 13271 43827 477111877 328752668 row-yn9u~un9g_3heh 00000000-0000-0000-4EEE-250E14B81104 0 https://opendata.maryland.gov/resource/un65-7i...
我有 XML 文件,如下所示:
我正在尝试将它转换为 Python 中的 DF,使用这个:
import pandas as pd
import xml.etree.ElementTree as et
def parse_XML(xml_file, df_cols):
"""Parse the input XML file and store the result in a pandas
DataFrame with the given columns.
The first element of df_cols is supposed to be the identifier
variable, which is an attribute of each node element in the
XML data; other features will be parsed from the text content
of each sub-element.
"""
xtree = et.parse(xml_file)
xroot = xtree.getroot()
rows = []
for node in xroot:
res = []
res.append(node.attrib.get(df_cols[0]))
for el in df_cols[1:]:
if node is not None and node.find(el) is not None:
res.append(node.find(el).text)
else:
res.append(None)
rows.append({df_cols[i]: res[i]
for i, _ in enumerate(df_cols)})
out_df = pd.DataFrame(rows, columns=df_cols)
return out_df
我正在使用这样的函数:
parse_XML(R'C:\Users\aleks\Desktop\zadania python+sql\import_xml.xml', ['Year', 'Row', 'New', 'Used', 'Total Sales New', 'Total Sales Used'])
但结果而不是完全导入的 XML 文件是这样的:
如何让它工作,以便将完整的 XML 文件导入 DataFrame?谢谢。
由于您的 XML 相对平坦,请考虑将 merging dictionaries 的元素和属性数据传递到 pandas.DataFrame
构造函数中。具体来说,字典是通过 list/dict 理解标签和 row
元素和 attrib
字典的文本解析得出的。
xtree = et.parse(xml_file)
# LIST OF DICTIONARIES
data = [{ **{r.tag:r.text for r in row.findall('*')}, **row.attrib
} for row in xtree.findall('.//row/row')]
out_df = pd.DataFrame(data)
从OpenData Michigan MVA Vehicle Sales Counts(类似于OP的图像数据)进行演示:
import pandas as pd
import xml.etree.ElementTree as et
from urllib import request
url = "https://opendata.maryland.gov/api/views/un65-7ipd/rows.xml?accessType=DOWNLOAD"
rq = request.urlopen(url)
xtree = et.fromstring(rq.read())
# LIST OF DICTIONARIES
data = [{ **{r.tag:r.text for r in row.findall('*')}, **row.attrib
} for row in xtree.findall('.//row/row')]
out_df = pd.DataFrame(data)
输出
out_df
# year month new used total_sales_new total_sales_used _id _uuid _position _address
#0 2002 JAN 31106 49927 755015820 386481929 row-a49t-ihm3.rpap 00000000-0000-0000-05D6-978893D301BF 0 https://opendata.maryland.gov/resource/un65-7i...
#1 2002 FEB 27520 50982 664454223 361353242 row-njtn_crc8_qawh 00000000-0000-0000-3211-E4A718B8A756 0 https://opendata.maryland.gov/resource/un65-7i...
#2 2002 MAR 34225 58794 805666244 419385387 row-tn8d-5xax-gcqz 00000000-0000-0000-9FC0-AB50DEE551EE 0 https://opendata.maryland.gov/resource/un65-7i...
#3 2002 APR 36452 59817 846368297 433061150 row-bjnm-jnr7.7rbw 00000000-0000-0000-FE23-E2E416AC13DB 0 https://opendata.maryland.gov/resource/un65-7i...
#4 2002 MAY 37359 60577 855005784 442569410 row-5wkh~fkp3.mzuj 00000000-0000-0000-CBC6-6E4FF3707E07 0 https://opendata.maryland.gov/resource/un65-7i...
#.. ... ... ... ... ... ... ... ... ... ...
#221 2020 JUN 23704 63395 821297823 699671992 row-m9ah.mpqr.7eww 00000000-0000-0000-3599-5C2EEABDA225 0 https://opendata.maryland.gov/resource/un65-7i...
#222 2020 JUL 22748 65652 784818757 608761285 row-79ht-rw2j~j4vy 00000000-0000-0000-4E88-6D5061A28AC5 0 https://opendata.maryland.gov/resource/un65-7i...
#223 2020 AUG 12640 48594 456451505 329492612 row-yh7w-48wu_vpnx 00000000-0000-0000-6ED5-CA4A564695EA 0 https://opendata.maryland.gov/resource/un65-7i...
#224 2020 SEP 11336 42148 395164402 274497385 row-e2ur-7zck~4rdd 00000000-0000-0000-C640-6EE6DAA0F07C 0 https://opendata.maryland.gov/resource/un65-7i...
#225 2020 OCT 13271 43827 477111877 328752668 row-yn9u~un9g_3heh 00000000-0000-0000-4EEE-250E14B81104 0 https://opendata.maryland.gov/resource/un65-7i...