在 Python 中解析多个 XML 文件并将数据追加到 pandas DataFrame
Parsing multiple xml files in python and appending the data to a Python DataFrame
我正在尝试从多个嵌套的 xml 文件创建数据框并将数据附加到单个数据框。我知道数据框的结构并定义了它。
# Column names the resulting DataFrame is expected to have.
details = ['FirstName', 'LastName', 'City', 'Country', 'InceptionYear']

# Read every object listed in bucket_list and parse its body into an
# ElementTree, collecting the trees in download order.
tree_list = []
for file in bucket_list:
    obj = s3.Object(s3_bucket_name, file)
    data = obj.get()['Body'].read()
    root = ET.fromstring(data)
    tree_list.append(ET.ElementTree(root))
def parse_XML(list_of_trees, df_cols):
    """Turn the direct children of every XML root into DataFrame rows.

    For each tree, every direct child of the root becomes one candidate
    row; each column in ``df_cols`` is filled with the text of the child's
    direct sub-element of that name, or ``None`` when it is absent.

    Args:
        list_of_trees: Iterable of ``xml.etree.ElementTree.ElementTree``
            objects to read.
        df_cols: Column names; each one is looked up as a tag name.

    Returns:
        A ``pandas.DataFrame`` with columns ``df_cols`` holding the rows
        collected from all trees. Rows in which every column is ``None``
        are left out.
    """
    # The accumulator lives outside the tree loop so that rows from every
    # file are kept, not only those of the last tree.
    rows = []
    for tree in list_of_trees:
        for node in tree.getroot():
            row = {}
            for col in df_cols:
                child = node.find(col)
                row[col] = child.text if child is not None else None
            # A node that supplies none of the requested columns would only
            # add a blank row, so it is skipped.
            if any(value is not None for value in row.values()):
                rows.append(row)
    return pd.DataFrame(rows, columns=df_cols)
# Run the parser on the trees loaded above, using `details` as the column list.
parse_XML(tree_list,details)
在输出的数据框中,我只得到了最后一个被读取的文件的信息,外加一些空白行,如下所示:
FirstName LastName City Country
Ted Mosbey Washington USA
None None None None
None None None None
读取所有文件、附加到数据框并删除不需要的行的代码应该做哪些更改?任何有效处理文件的建议都将受到赞赏。
XML 示例:
<PD>
<Clt>
<PType>xxxx</PType>
<PNumber>xxxxx</PNumber>
<UID>xxxx</UID>
<TEfd>xxxxx</TEfd>
<TExd>xxxxxx</TExd>
<DID>xxxxx</DID>
<CType>xxxxx</CType>
<FName>John</FName>
<MName></MName>
<LName>Smith</LName>
<MAL>Home</MAL>
<AddressLine1>xxxx</AddressLine1>
<AddressLine2>xxxx</AddressLine2>
<AddressLine3></AddressLine3>
<City>xxxx</City>
<State>xx</State>
<ZipCode>xxxx</ZipCode>
<Country>xxxx</Country>
<Pr>
<PrType>xxxxx</PrType>
<PrName>xxxxxx</PrName>
<PrID>xxxxxx</PrID>
</Pr>
</Clt>
<CData>
<InceptionYear>2021</InceptionYear>
</CData>
</PD>
现在有了你的数据样本,我测试了下面的代码,它可以正常运行,结果应该符合你的需求:
def parse_XML(list_of_trees, df_cols):
    """Build one DataFrame row per XML tree from the requested tags.

    For every direct child of a tree's root, all descendants whose tag is
    in ``df_cols`` are collected (``.//tag`` search). The texts found for a
    column across the whole tree are merged into that tree's single row.

    Args:
        list_of_trees: Iterable of ``xml.etree.ElementTree.ElementTree``
            objects, one per source document.
        df_cols: Column names; each one is looked up as a tag name.

    Returns:
        A ``pandas.DataFrame`` with columns ``df_cols`` and one row per
        tree. A cell holds the element text when the tag occurs once in the
        tree, a flat list of texts when it occurs several times, and is
        missing (NaN) when it does not occur at all.
    """
    rows = []
    for tree in list_of_trees:
        collected = {}
        for node in tree.getroot():
            for col in df_cols:
                texts = [match.text for match in node.findall(f".//{col}")]
                if texts:
                    # Always accumulate as a list: extending with a bare
                    # string would split it into single characters, and
                    # wrapping an existing list would nest it.
                    collected.setdefault(col, []).extend(texts)
        # Collapse single-occurrence columns back to a scalar value.
        rows.append(
            {
                col: texts[0] if len(texts) == 1 else texts
                for col, texts in collected.items()
            }
        )
    return pd.DataFrame(rows, columns=df_cols)
可以考虑使用 pandas v1.3.0 引入的 pandas.read_xml:在 XML 的两个不同层级上分别解析,再把两个结果连接(join)在一起。
def build_frame(dom):
    """Parse one XML document into a DataFrame of client details.

    The document is read twice with ``pandas.read_xml`` (etree parser):
    once for the ``Clt`` elements and once for the ``CData`` elements. The
    two results are joined on their row index.

    Args:
        dom: The XML document as ``bytes`` or as a ``str`` of literal XML.
            Any other value is handed to ``pandas.read_xml`` unchanged.

    Returns:
        A ``pandas.DataFrame`` with the columns ``FirstName``,
        ``LastName``, ``City`` and ``Country`` taken from ``Clt`` (missing
        ones are filled with NaN by ``reindex``) plus ``InceptionYear``
        taken from ``CData``.
    """
    import io

    def _source():
        # Recent pandas versions deprecate passing literal XML directly, so
        # raw content is wrapped in a buffer. A fresh buffer is created per
        # call because each read_xml call consumes it.
        if isinstance(dom, (bytes, bytearray)):
            return io.BytesIO(dom)
        if isinstance(dom, str) and dom.lstrip().startswith("<"):
            return io.StringIO(dom)
        return dom

    clients = pd.read_xml(_source(), xpath=".//Clt", parser="etree").reindex(
        ["FirstName", "LastName", "City", "Country"], axis="columns"
    )
    inception = pd.read_xml(_source(), xpath=".//CData", parser="etree")[
        ["InceptionYear"]
    ]
    return clients.join(inception)
# BUILD LIST OF PARSED XML DATA FRAMES
df_list = []
for file in bucket_list:
    obj = s3.Object(s3_bucket_name, file)
    data = obj.get()['Body'].read()
    df_list.append(build_frame(data))

# ALTERNATIVE: BUILD LIST VIA COMPREHENSION
# NOTE: this builds the same list as the loop above; keep only one of the
# two, since running both reads every object from the bucket twice.
df_list = [
    build_frame(s3.Object(s3_bucket_name, file).get()['Body'].read())
    for file in bucket_list
]

# COMBINE DATA FRAMES INTO SINGLE MASTER DATA FRAME
master_df = pd.concat(df_list, ignore_index=True)
上面的代码使用的是非默认的 etree 解析器。不过,如果您安装了 lxml(完全支持 XPath 1.0 和 XSLT 1.0 的第三方包),就可以只调用一次 read_xml(使用组合的 XPath 表达式),然后再对数据框做一些清理:
# Columns kept in the final frame, in output order.
cols = ["FirstName", "LastName", "City", "Country", "InceptionYear"]


def build_frame(dom):
    """Parse one XML document into a DataFrame with a single read_xml call.

    A union XPath selects both the ``Clt`` and the ``CData`` elements, so
    each selected element becomes its own row. ``InceptionYear`` is then
    back-filled so that a ``Clt`` row picks up the value of the next row
    that has one, and rows without a ``LastName`` are dropped.

    Requires the third-party ``lxml`` package (``parser="lxml"``).

    Args:
        dom: The XML document as ``bytes`` or as a ``str`` of literal XML.
            Any other value is handed to ``pandas.read_xml`` unchanged.

    Returns:
        A ``pandas.DataFrame`` restricted to the columns listed in
        ``cols``, containing only the rows whose ``LastName`` is not NaN.
    """
    import io

    # Recent pandas versions deprecate passing literal XML directly, so raw
    # content is wrapped in a buffer first.
    if isinstance(dom, (bytes, bytearray)):
        source = io.BytesIO(dom)
    elif isinstance(dom, str) and dom.lstrip().startswith("<"):
        source = io.StringIO(dom)
    else:
        source = dom

    xml_df = (
        pd.read_xml(source, xpath=".//Clt|.//CData", parser="lxml")
        .assign(InceptionYear=lambda x: x["InceptionYear"].bfill())  # BACKWARD FILL COLUMN
        .reindex(cols, axis="columns")  # FILTER COLUMNS
        .query("LastName == LastName")  # FILTER ROWS (NaN != NaN, SO NaNs ARE REMOVED)
    )
    return xml_df
我正在尝试从多个嵌套的 xml 文件创建数据框并将数据附加到单个数据框。我知道数据框的结构并定义了它。
# Column names the resulting DataFrame is expected to have.
details = ['FirstName', 'LastName', 'City', 'Country', 'InceptionYear']

# Read every object listed in bucket_list and parse its body into an
# ElementTree, collecting the trees in download order.
tree_list = []
for file in bucket_list:
    obj = s3.Object(s3_bucket_name, file)
    data = obj.get()['Body'].read()
    root = ET.fromstring(data)
    tree_list.append(ET.ElementTree(root))
def parse_XML(list_of_trees, df_cols):
    """Turn the direct children of every XML root into DataFrame rows.

    For each tree, every direct child of the root becomes one candidate
    row; each column in ``df_cols`` is filled with the text of the child's
    direct sub-element of that name, or ``None`` when it is absent.

    Args:
        list_of_trees: Iterable of ``xml.etree.ElementTree.ElementTree``
            objects to read.
        df_cols: Column names; each one is looked up as a tag name.

    Returns:
        A ``pandas.DataFrame`` with columns ``df_cols`` holding the rows
        collected from all trees. Rows in which every column is ``None``
        are left out.
    """
    # The accumulator lives outside the tree loop so that rows from every
    # file are kept, not only those of the last tree.
    rows = []
    for tree in list_of_trees:
        for node in tree.getroot():
            row = {}
            for col in df_cols:
                child = node.find(col)
                row[col] = child.text if child is not None else None
            # A node that supplies none of the requested columns would only
            # add a blank row, so it is skipped.
            if any(value is not None for value in row.values()):
                rows.append(row)
    return pd.DataFrame(rows, columns=df_cols)
# Run the parser on the trees loaded above, using `details` as the column list.
parse_XML(tree_list,details)
在输出的数据框中,我只得到了最后一个被读取的文件的信息,外加一些空白行,如下所示:
FirstName LastName City Country
Ted Mosbey Washington USA
None None None None
None None None None
读取所有文件、附加到数据框并删除不需要的行的代码应该做哪些更改?任何有效处理文件的建议都将受到赞赏。
XML 示例:
<PD>
<Clt>
<PType>xxxx</PType>
<PNumber>xxxxx</PNumber>
<UID>xxxx</UID>
<TEfd>xxxxx</TEfd>
<TExd>xxxxxx</TExd>
<DID>xxxxx</DID>
<CType>xxxxx</CType>
<FName>John</FName>
<MName></MName>
<LName>Smith</LName>
<MAL>Home</MAL>
<AddressLine1>xxxx</AddressLine1>
<AddressLine2>xxxx</AddressLine2>
<AddressLine3></AddressLine3>
<City>xxxx</City>
<State>xx</State>
<ZipCode>xxxx</ZipCode>
<Country>xxxx</Country>
<Pr>
<PrType>xxxxx</PrType>
<PrName>xxxxxx</PrName>
<PrID>xxxxxx</PrID>
</Pr>
</Clt>
<CData>
<InceptionYear>2021</InceptionYear>
</CData>
</PD>
现在有了你的数据样本,我测试了下面的代码,它可以正常运行,结果应该符合你的需求:
def parse_XML(list_of_trees, df_cols):
    """Build one DataFrame row per XML tree from the requested tags.

    For every direct child of a tree's root, all descendants whose tag is
    in ``df_cols`` are collected (``.//tag`` search). The texts found for a
    column across the whole tree are merged into that tree's single row.

    Args:
        list_of_trees: Iterable of ``xml.etree.ElementTree.ElementTree``
            objects, one per source document.
        df_cols: Column names; each one is looked up as a tag name.

    Returns:
        A ``pandas.DataFrame`` with columns ``df_cols`` and one row per
        tree. A cell holds the element text when the tag occurs once in the
        tree, a flat list of texts when it occurs several times, and is
        missing (NaN) when it does not occur at all.
    """
    rows = []
    for tree in list_of_trees:
        collected = {}
        for node in tree.getroot():
            for col in df_cols:
                texts = [match.text for match in node.findall(f".//{col}")]
                if texts:
                    # Always accumulate as a list: extending with a bare
                    # string would split it into single characters, and
                    # wrapping an existing list would nest it.
                    collected.setdefault(col, []).extend(texts)
        # Collapse single-occurrence columns back to a scalar value.
        rows.append(
            {
                col: texts[0] if len(texts) == 1 else texts
                for col, texts in collected.items()
            }
        )
    return pd.DataFrame(rows, columns=df_cols)
可以考虑使用 pandas v1.3.0 引入的 pandas.read_xml:在 XML 的两个不同层级上分别解析,再把两个结果连接(join)在一起。
def build_frame(dom):
    """Parse one XML document into a DataFrame of client details.

    The document is read twice with ``pandas.read_xml`` (etree parser):
    once for the ``Clt`` elements and once for the ``CData`` elements. The
    two results are joined on their row index.

    Args:
        dom: The XML document as ``bytes`` or as a ``str`` of literal XML.
            Any other value is handed to ``pandas.read_xml`` unchanged.

    Returns:
        A ``pandas.DataFrame`` with the columns ``FirstName``,
        ``LastName``, ``City`` and ``Country`` taken from ``Clt`` (missing
        ones are filled with NaN by ``reindex``) plus ``InceptionYear``
        taken from ``CData``.
    """
    import io

    def _source():
        # Recent pandas versions deprecate passing literal XML directly, so
        # raw content is wrapped in a buffer. A fresh buffer is created per
        # call because each read_xml call consumes it.
        if isinstance(dom, (bytes, bytearray)):
            return io.BytesIO(dom)
        if isinstance(dom, str) and dom.lstrip().startswith("<"):
            return io.StringIO(dom)
        return dom

    clients = pd.read_xml(_source(), xpath=".//Clt", parser="etree").reindex(
        ["FirstName", "LastName", "City", "Country"], axis="columns"
    )
    inception = pd.read_xml(_source(), xpath=".//CData", parser="etree")[
        ["InceptionYear"]
    ]
    return clients.join(inception)
# BUILD LIST OF PARSED XML DATA FRAMES
df_list = []
for file in bucket_list:
    obj = s3.Object(s3_bucket_name, file)
    data = obj.get()['Body'].read()
    df_list.append(build_frame(data))

# ALTERNATIVE: BUILD LIST VIA COMPREHENSION
# NOTE: this builds the same list as the loop above; keep only one of the
# two, since running both reads every object from the bucket twice.
df_list = [
    build_frame(s3.Object(s3_bucket_name, file).get()['Body'].read())
    for file in bucket_list
]

# COMBINE DATA FRAMES INTO SINGLE MASTER DATA FRAME
master_df = pd.concat(df_list, ignore_index=True)
上面的代码使用的是非默认的 etree 解析器。不过,如果您安装了 lxml(完全支持 XPath 1.0 和 XSLT 1.0 的第三方包),就可以只调用一次 read_xml(使用组合的 XPath 表达式),然后再对数据框做一些清理:
# Columns kept in the final frame, in output order.
cols = ["FirstName", "LastName", "City", "Country", "InceptionYear"]


def build_frame(dom):
    """Parse one XML document into a DataFrame with a single read_xml call.

    A union XPath selects both the ``Clt`` and the ``CData`` elements, so
    each selected element becomes its own row. ``InceptionYear`` is then
    back-filled so that a ``Clt`` row picks up the value of the next row
    that has one, and rows without a ``LastName`` are dropped.

    Requires the third-party ``lxml`` package (``parser="lxml"``).

    Args:
        dom: The XML document as ``bytes`` or as a ``str`` of literal XML.
            Any other value is handed to ``pandas.read_xml`` unchanged.

    Returns:
        A ``pandas.DataFrame`` restricted to the columns listed in
        ``cols``, containing only the rows whose ``LastName`` is not NaN.
    """
    import io

    # Recent pandas versions deprecate passing literal XML directly, so raw
    # content is wrapped in a buffer first.
    if isinstance(dom, (bytes, bytearray)):
        source = io.BytesIO(dom)
    elif isinstance(dom, str) and dom.lstrip().startswith("<"):
        source = io.StringIO(dom)
    else:
        source = dom

    xml_df = (
        pd.read_xml(source, xpath=".//Clt|.//CData", parser="lxml")
        .assign(InceptionYear=lambda x: x["InceptionYear"].bfill())  # BACKWARD FILL COLUMN
        .reindex(cols, axis="columns")  # FILTER COLUMNS
        .query("LastName == LastName")  # FILTER ROWS (NaN != NaN, SO NaNs ARE REMOVED)
    )
    return xml_df