使用 Elementree 从另一个 child 元素读取
Reading from another child element using Elementree
下面的代码遍历 xml 个文件并将它们解析为单个 csv 文件
from xml.etree import ElementTree as ET
from collections import defaultdict
import csv
from pathlib import Path
directory = 'path to a folder with xml files'
with open('output.csv', 'w', newline='') as f:
writer = csv.writer(f)
headers = ['id', 'service_code', 'rational', 'qualify', 'description_num', 'description_txt', 'set_data_xin', 'set_data_xax', 'set_data_value', 'set_data_x']
writer.writerow(headers)
xml_files_list = list(map(str, Path(directory).glob('**/*.xml')))
print(xml_files_list)
for xml_file in xml_files_list:
tree = ET.parse(xml_file)
root = tree.getroot()
start_nodes = root.findall('.//START')
for sn in start_nodes:
row = defaultdict(str)
repeated_values = dict()
for k,v in sn.attrib.items():
repeated_values[k] = v
for rn in sn.findall('.//Rational'):
repeated_values['rational'] = rn.text
for qu in sn.findall('.//Qualify'):
repeated_values['qualify'] = qu.text
for ds in sn.findall('.//Description'):
repeated_values['description_txt'] = ds.text
repeated_values['description_num'] = ds.attrib['num']
for st in sn.findall('.//SetData'):
for k,v in st.attrib.items():
row['set_data_'+ str(k)] = v
for key in repeated_values.keys():
row[key] = repeated_values[key]
row_data = [row[i] for i in headers]
writer.writerow(row_data)
row = defaultdict(str)
这是 xml 文件。
<?xml version="1.0" encoding="utf-8"?>
<ProjectData>
<Phones>
<Date />
<Prog />
<Box />
<Feature />
<IN>MAFWDS</IN>
<Set>234234</Set>
<Pr>23423</Pr>
<Number>afasfhrtv</Number>
<Simple>dfasd</Simple>
<Nr />
<Get>6070106091</Get>
<Reno>1233</Reno>
</Phones>
<FINAL>
<START id="B001" service_code="0x5196">
<Docs Docs_type="START">
<Rational>225196</Rational>
<Qualify>6251960000A0DE</Qualify>
</Docs>
<Description num="1213f2312">The parameter</Description>
<DataFile dg="12" dg_id="let">
<SetData value="32" />
</DataFile>
</START>
<START id="C003" service_code="0x517B">
<Docs Docs_type="START">
<Rational>23423</Rational>
<Qualify>342342</Qualify>
</Docs>
<Description num="3423423f3423">The third</Description>
<DataFile dg="55" dg_id="big">
<SetData x="E1" value="21259" />
<SetData x="E2" value="02" />
</DataFile>
</START>
<START id="Z048" service_code="0x5198">
<RawData rawdata_type="ASDS">
<Rational>225198</Rational>
<Qualify>343243324234234</Qualify>
</RawData>
<Description num="434234234">The forth</Description>
<DataFile unit="21" unit_id="FEDS">
<FileX unit="eg" discrete="false" axis_pts="19" name="Vsome" text_id="bx5" unit_id="GDFSD" />
<SetData xin="5" xax="233" value="323" />
<SetData xin="123" xax="77" value="555" />
<SetData xin="17" xax="65" value="23" />
</DataFile>
</START>
</FINAL>
</ProjectData>
这是输出的样子
目前正在努力修改代码,所以它转到 Phones(这是 Projectdata 的另一个 child)从 Set 和 Get 将它们与 _ 附加在一起,并将它们解析到具有 header 名称的第一列**识别* *
下面的图片显示了它的外观。
将您的 headers
行修改为
headers = ['identify', 'id', 'service_code', 'rational', 'qualify', 'description_num', 'description_txt', 'set_data_xin', 'set_data_xax', 'set_data_value', 'set_data_x']
p_get = tree.find('.//Phones/Get').text
p_set = tree.find('.//Phones/Set').text
并将此信息添加到 row_data
行 writer.writerow(row_data)
之前
像这样:
row_data.insert(0, p_get + '_' + p_set)
更新
row_data[0] = p_get + '_' + p_set
下面的代码遍历 xml 个文件并将它们解析为单个 csv 文件
from xml.etree import ElementTree as ET
from collections import defaultdict
import csv
from pathlib import Path
directory = 'path to a folder with xml files'
with open('output.csv', 'w', newline='') as f:
writer = csv.writer(f)
headers = ['id', 'service_code', 'rational', 'qualify', 'description_num', 'description_txt', 'set_data_xin', 'set_data_xax', 'set_data_value', 'set_data_x']
writer.writerow(headers)
xml_files_list = list(map(str, Path(directory).glob('**/*.xml')))
print(xml_files_list)
for xml_file in xml_files_list:
tree = ET.parse(xml_file)
root = tree.getroot()
start_nodes = root.findall('.//START')
for sn in start_nodes:
row = defaultdict(str)
repeated_values = dict()
for k,v in sn.attrib.items():
repeated_values[k] = v
for rn in sn.findall('.//Rational'):
repeated_values['rational'] = rn.text
for qu in sn.findall('.//Qualify'):
repeated_values['qualify'] = qu.text
for ds in sn.findall('.//Description'):
repeated_values['description_txt'] = ds.text
repeated_values['description_num'] = ds.attrib['num']
for st in sn.findall('.//SetData'):
for k,v in st.attrib.items():
row['set_data_'+ str(k)] = v
for key in repeated_values.keys():
row[key] = repeated_values[key]
row_data = [row[i] for i in headers]
writer.writerow(row_data)
row = defaultdict(str)
这是 xml 文件。
<?xml version="1.0" encoding="utf-8"?>
<ProjectData>
<Phones>
<Date />
<Prog />
<Box />
<Feature />
<IN>MAFWDS</IN>
<Set>234234</Set>
<Pr>23423</Pr>
<Number>afasfhrtv</Number>
<Simple>dfasd</Simple>
<Nr />
<Get>6070106091</Get>
<Reno>1233</Reno>
</Phones>
<FINAL>
<START id="B001" service_code="0x5196">
<Docs Docs_type="START">
<Rational>225196</Rational>
<Qualify>6251960000A0DE</Qualify>
</Docs>
<Description num="1213f2312">The parameter</Description>
<DataFile dg="12" dg_id="let">
<SetData value="32" />
</DataFile>
</START>
<START id="C003" service_code="0x517B">
<Docs Docs_type="START">
<Rational>23423</Rational>
<Qualify>342342</Qualify>
</Docs>
<Description num="3423423f3423">The third</Description>
<DataFile dg="55" dg_id="big">
<SetData x="E1" value="21259" />
<SetData x="E2" value="02" />
</DataFile>
</START>
<START id="Z048" service_code="0x5198">
<RawData rawdata_type="ASDS">
<Rational>225198</Rational>
<Qualify>343243324234234</Qualify>
</RawData>
<Description num="434234234">The forth</Description>
<DataFile unit="21" unit_id="FEDS">
<FileX unit="eg" discrete="false" axis_pts="19" name="Vsome" text_id="bx5" unit_id="GDFSD" />
<SetData xin="5" xax="233" value="323" />
<SetData xin="123" xax="77" value="555" />
<SetData xin="17" xax="65" value="23" />
</DataFile>
</START>
</FINAL>
</ProjectData>
这是输出的样子
目前正在努力修改代码,所以它转到 Phones(这是 Projectdata 的另一个 child)从 Set 和 Get 将它们与 _ 附加在一起,并将它们解析到具有 header 名称的第一列**识别* * 下面的图片显示了它的外观。
将您的 headers
行修改为
headers = ['identify', 'id', 'service_code', 'rational', 'qualify', 'description_num', 'description_txt', 'set_data_xin', 'set_data_xax', 'set_data_value', 'set_data_x']
p_get = tree.find('.//Phones/Get').text
p_set = tree.find('.//Phones/Set').text
并将此信息添加到 row_data
行 writer.writerow(row_data)
像这样:
row_data.insert(0, p_get + '_' + p_set)
更新
row_data[0] = p_get + '_' + p_set