Parsing multiple xml files in python and appending the data to a Python DataFrame

I am trying to create a dataframe from multiple nested XML files and append the data to a single dataframe. I know the structure of the dataframe and have defined it.

import pandas as pd
import xml.etree.ElementTree as ET

tree_list = []
details = ['FirstName','LastName','City','Country','InceptionYear']


for file in bucket_list:
    obj = s3.Object(s3_bucket_name,file)
    data = (obj.get()['Body'].read())
    tree_list.append(ET.ElementTree(ET.fromstring(data)))

def parse_XML(list_of_trees, df_cols): 
    
    for tree in tree_list:
        xroot = tree.getroot()
        rows = []

    for node in xroot: 
        res = []
        for el in df_cols[0:]: 
            if node is not None and node.find(el) is not None:
                res.append(node.find(el).text)
            else: 
                res.append(None)
        rows.append({df_cols[i-1]: res[i-1] 
                     for i, _ in enumerate(df_cols)})
    
    out_df = pd.DataFrame(rows, columns=df_cols)
        
    return out_df

parse_XML(tree_list,details)
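
For context, s3, s3_bucket_name and bucket_list are not defined in the snippet above. A minimal sketch of how they might be set up with boto3 (the bucket name and key prefix below are placeholders, not values from the question):

import boto3

# Hypothetical setup -- adjust the bucket name and key prefix to your environment.
s3 = boto3.resource("s3")
s3_bucket_name = "my-xml-bucket"   # placeholder
bucket_list = [
    obj.key
    for obj in s3.Bucket(s3_bucket_name).objects.filter(Prefix="xml/")
]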

In my output dataframe I only get the information from the last file read, plus some blank rows, like this:

    FirstName  LastName  City        Country
    Ted        Mosbey    Washington  USA
    None       None      None        None
    None       None      None        None

What changes should be made to the code so that it reads all the files, appends them to the dataframe, and drops the unwanted rows? Any suggestions on handling the files efficiently would be appreciated.

Sample XML:

<PD>
  <Clt>
    <PType>xxxx</PType>
    <PNumber>xxxxx</PNumber>
    <UID>xxxx</UID>
    <TEfd>xxxxx</TEfd>
    <TExd>xxxxxx</TExd>
    <DID>xxxxx</DID>
    <CType>xxxxx</CType>
    <FName>John</FName>
    <MName></MName>
    <LName>Smith</LName>
    <MAL>Home</MAL>
    <AddressLine1>xxxx</AddressLine1>
    <AddressLine2>xxxx</AddressLine2>
    <AddressLine3></AddressLine3>
    <City>xxxx</City>
    <State>xx</State>
    <ZipCode>xxxx</ZipCode>
    <Country>xxxx</Country>
    <Pr>
      <PrType>xxxxx</PrType>
      <PrName>xxxxxx</PrName>
      <PrID>xxxxxx</PrID>
    </Pr>
  </Clt>
  <CData>
    <InceptionYear>2021</InceptionYear>
  </CData>
</PD>

Now that I have a sample of your data, I tested it and it works for me, doing what I believe you want:

def parse_XML(list_of_trees, df_cols):

    def get_el(el_list):
        # Return a list of texts when the tag occurs more than once,
        # otherwise just the single text value.
        if len(el_list) > 1:
            return [el_text.text for el_text in el_list]
        else:
            return el_list[0].text

    rows = []
    for tree in list_of_trees:
        xroot = tree.getroot()

        res = {}  # one row per file/tree
        for node in xroot:
            for el in df_cols:
                # ".//" searches all descendants, so nested tags are found too
                if node is not None and node.find(f".//{el}") is not None:
                    el_res = get_el(node.findall(f".//{el}"))
                    if el not in res:
                        res[el] = el_res
                    elif isinstance(res[el], list):
                        res[el].extend(el_res)
                    else:
                        res[el] = [res[el], el_res]
        rows.append(res)

    out_df = pd.DataFrame(rows, columns=df_cols)

    return out_df
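
A quick way to exercise this, assuming tree_list was built as in the question. Note that the sample XML uses FName/LName rather than FirstName/LastName, so the column list passed in has to match whatever tags your real files actually contain:

# Element names below follow the sample XML; adjust them to your actual tags.
details = ['FName', 'LName', 'City', 'Country', 'InceptionYear']
out_df = parse_XML(tree_list, details)
print(out_df.head())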

Consider pandas.read_xml, introduced in pandas v1.3.0, where you parse the XML at two different levels and join the results together:

def build_frame(dom): 
    xml_df = (
        pd.read_xml(dom, xpath=".//Clt", parser="etree")
          .reindex(["FirstName", "LastName", "City", "Country"], axis="columns")
          .join(
              pd.read_xml(dom, xpath=".//CData", parser="etree")[['InceptionYear']]
          )
    )
    return xml_df

# BUILD LIST OF PARSED XML DATA FRAMES
df_list = []
for file in bucket_list:
    obj = s3.Object(s3_bucket_name,file)
    data = (obj.get()['Body'].read())
    df_list.append(build_frame(data))

# ALTERNATIVE: BUILD LIST VIA COMPREHENSION
df_list = [
    build_frame(s3.Object(s3_bucket_name, file).get()['Body'].read())
    for file in bucket_list
]

# COMBINE DATA FRAMES INTO SINGLE MASTER DATA FRAME
master_df = pd.concat(df_list, ignore_index=True)
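
Using ignore_index=True in pd.concat renumbers the combined index from 0, so the repeated 0-based indexes of the individual per-file frames do not collide in the master frame.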

The above uses the non-default etree parser. However, if you have lxml installed (a third-party package with full XPath 1.0 and XSLT 1.0 support), you can make a single read_xml call (combining the XPath expressions) followed by some data frame cleanup:

cols = ["FirstName", "LastName", "City", "Country", "InceptionYear"]

def build_frame(dom):
    xml_df = (
        pd.read_xml(dom, xpath=".//Clt|.//CData", parser="lxml")
          .assign(InceptionYear = lambda x: x["InceptionYear"].bfill())  # BACKWARD FILL COLUMN
          .reindex(cols, axis="columns")                                 # FILTER COLUMNS
          .query("LastName == LastName")                                 # FILTER ROWS (REMOVE NaNs)
    )
    return xml_df
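
The per-file frames can then be combined exactly as before; a minimal sketch, assuming the same s3 resource and bucket_list as in the question:

# Same pattern as the etree version, reusing the lxml-based build_frame.
df_list = [
    build_frame(s3.Object(s3_bucket_name, file).get()['Body'].read())
    for file in bucket_list
]

master_df = pd.concat(df_list, ignore_index=True)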