如何将 xmlTree iterparse 应用于嵌套 XML 集
How to apply xmlTree iterparse to nested XML set
我正在尝试从这个 tutorial 中复制示例,但是将 iterparse 与 elem.clear() 结合使用。
XML 示例:
<?xml version="1.0" encoding="UTF-8"?>
<scenario>
<world>
<region name="USA">
<AgSupplySector name="Corn" nocreate="1">
<AgSupplySubsector name="Corn_NelsonR" nocreate="1">
<AgProductionTechnology name="Corn_NelsonR" nocreate="1">
<period year="1975">
<Non-CO2 name="SO2_1_AWB">
<input-emissions>3.98749e-05</input-emissions>
<output-driver/>
<gdp-control name="GDP_control">
<max-reduction>60</max-reduction>
<steepness>3.5</steepness>
</gdp-control>
</Non-CO2>
<Non-CO2 name="NOx_AWB">
<input-emissions>0.000285263</input-emissions>
<output-driver/>
<gdp-control name="GDP_control">
<max-reduction>60</max-reduction>
<steepness>3.5</steepness>
</gdp-control>
</Non-CO2>
</period>
</AgProductionTechnology>
</AgSupplySubsector>
</AgSupplySector>
</region>
</world>
</scenario>
预期输出如下:
我正在尝试使用以下代码解析它:
import os
import xml.etree.cElementTree as etree
import codecs
import csv
# Script from the question: streams an XML file with iterparse and writes
# values taken from it to a CSV file.
# NOTE(review): the indentation of this snippet was lost, so the nesting of
# the statements below (notably writerow and elem.clear()) cannot be
# confirmed from the text as shown.
# '\B' is not a recognised escape sequence, so the backslash is kept as-is;
# a raw string (r'D:\Book1') would state that intent explicitly.
PATH = 'D:\Book1'
FILENAME_BIO = 'Test.csv'
FILENAME_XML = 'all_aglu_emissions.xml'
ENCODING = "utf-8"
# Full paths of the CSV output file and the XML input file.
pathBIO = os.path.join(PATH, FILENAME_BIO)
pathXML = os.path.join(PATH, FILENAME_XML)
# Open the CSV output for writing with the configured encoding.
with codecs.open(pathBIO, "w", ENCODING) as bioFH:
bioWriter = csv.writer(bioFH, quoting=csv.QUOTE_MINIMAL)
# Header row; the data row written below uses the same column order.
bioWriter.writerow(['Year','Gas', 'Value','Technology','Crop','Country'])
# Stream the XML, receiving an event both when an element opens and closes.
for event, elem in etree.iterparse(pathXML, events=('start','end')):
# On 'start' events, remember one attribute of each enclosing element.
if event == 'start' and elem.tag == 'region':
str1 = elem.attrib['name']
elif event == 'start' and elem.tag == 'AgSupplySector':
str2 = elem.attrib['name']
elif event == 'start' and elem.tag == 'AgProductionTechnology':
str3 = elem.attrib['name']
elif event == 'start' and elem.tag == 'period':
str4 = elem.attrib['year']
elif event == 'start' and elem.tag == 'Non-CO2':
str5 = elem.attrib['name']
elif event == 'end' and elem.tag == 'input-emissions':
# elem is already an <input-emissions> element in this branch, and
# Element.iter() also yields the element it is called on.
for em in elem.iter('input-emissions'):
str6 = em.text
# Row order: Year, Gas, Value, Technology, Crop, Country.
bioWriter.writerow([str4, str5, str6, str3, str2, str1])
# Drop the element's children, text and attributes.
elem.clear()
我的问题是 str6 有更多带有空字段的额外行。可能,我在这里有嵌套问题。请帮忙。
错误示例(出现 0 个字段):
for em in elem.iter('input-emissions') 这个循环是多余的,去掉它即可。
import os
import xml.etree.ElementTree as etree
import csv
# Stream the XML with iterparse and write one CSV row per <input-emissions>
# element, labelled with the attributes of its enclosing elements.
PATH = '.'
FILENAME_BIO = 'Test.csv'
FILENAME_XML = 'all_aglu_emissions.xml'

pathBIO = os.path.join(PATH, FILENAME_BIO)
pathXML = os.path.join(PATH, FILENAME_XML)

# newline='' lets the csv module control the line endings itself.
with open(pathBIO, 'w', encoding='utf8', newline='') as bioFH:
    bioWriter = csv.writer(bioFH, quoting=csv.QUOTE_MINIMAL)
    bioWriter.writerow('Year Gas Value Technology Crop Country'.split())

    # NOTE: both 'start' and 'end' events are handled on purpose. iterparse
    # only guarantees that an element's attributes are populated when its
    # 'start' event is emitted; the text and children are guaranteed only
    # once the 'end' event has been emitted.
    for event, elem in etree.iterparse(pathXML, events=('start', 'end')):
        if event == 'start':
            # Remember the attributes of the enclosing elements as they open.
            if elem.tag == 'region':
                str1 = elem.attrib['name']
            elif elem.tag == 'AgSupplySector':
                str2 = elem.attrib['name']
            elif elem.tag == 'AgProductionTechnology':
                str3 = elem.attrib['name']
            elif elem.tag == 'period':
                str4 = elem.attrib['year']
            elif elem.tag == 'Non-CO2':
                str5 = elem.attrib['name']
        else:
            if elem.tag == 'input-emissions':
                # The element is complete here, so its text is reliable.
                str6 = elem.text
                bioWriter.writerow([str4, str5, str6, str3, str2, str1])
            # The element has been fully parsed and everything needed from it
            # was captured at 'start', so its content can be released.
            elem.clear()
我还对代码做了其他一些细微的改动,因为我假设您使用的是 Python 3。这些改动包括:使用 xml.etree.ElementTree 而不是已弃用的 xml.etree.cElementTree;不再使用 codecs 模块(Python 3 内置的 open() 原生支持指定编码);以及向 open() 调用传递 newline='' 参数,以便 csv 模块自行正确处理换行符。
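对于这个小示例,只监听 start 事件就足以达到预期效果,因此不必处理 end 事件。但请注意:ElementTree 文档说明,iterparse 在发出 start 事件时只保证元素的属性已经就绪,text 和子元素此时可能尚未解析完成;处理大文件时,应在 end 事件中读取 elem.text 并调用 elem.clear()。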
结果是
Year,Gas,Value,Technology,Crop,Country
1975,SO2_1_AWB,3.98749e-05,Corn_NelsonR,Corn,USA
1975,NOx_AWB,0.000285263,Corn_NelsonR,Corn,USA
我正在尝试从这个 tutorial 中复制示例,但是将 iterparse 与 elem.clear() 结合使用。
XML 示例:
<?xml version="1.0" encoding="UTF-8"?>
<scenario>
<world>
<region name="USA">
<AgSupplySector name="Corn" nocreate="1">
<AgSupplySubsector name="Corn_NelsonR" nocreate="1">
<AgProductionTechnology name="Corn_NelsonR" nocreate="1">
<period year="1975">
<Non-CO2 name="SO2_1_AWB">
<input-emissions>3.98749e-05</input-emissions>
<output-driver/>
<gdp-control name="GDP_control">
<max-reduction>60</max-reduction>
<steepness>3.5</steepness>
</gdp-control>
</Non-CO2>
<Non-CO2 name="NOx_AWB">
<input-emissions>0.000285263</input-emissions>
<output-driver/>
<gdp-control name="GDP_control">
<max-reduction>60</max-reduction>
<steepness>3.5</steepness>
</gdp-control>
</Non-CO2>
</period>
</AgProductionTechnology>
</AgSupplySubsector>
</AgSupplySector>
</region>
</world>
</scenario>
预期输出如下:
import os
import xml.etree.cElementTree as etree
import codecs
import csv
# Script from the question: streams an XML file with iterparse and writes
# values taken from it to a CSV file.
# NOTE(review): the indentation of this snippet was lost, so the nesting of
# the statements below (notably writerow and elem.clear()) cannot be
# confirmed from the text as shown.
# '\B' is not a recognised escape sequence, so the backslash is kept as-is;
# a raw string (r'D:\Book1') would state that intent explicitly.
PATH = 'D:\Book1'
FILENAME_BIO = 'Test.csv'
FILENAME_XML = 'all_aglu_emissions.xml'
ENCODING = "utf-8"
# Full paths of the CSV output file and the XML input file.
pathBIO = os.path.join(PATH, FILENAME_BIO)
pathXML = os.path.join(PATH, FILENAME_XML)
# Open the CSV output for writing with the configured encoding.
with codecs.open(pathBIO, "w", ENCODING) as bioFH:
bioWriter = csv.writer(bioFH, quoting=csv.QUOTE_MINIMAL)
# Header row; the data row written below uses the same column order.
bioWriter.writerow(['Year','Gas', 'Value','Technology','Crop','Country'])
# Stream the XML, receiving an event both when an element opens and closes.
for event, elem in etree.iterparse(pathXML, events=('start','end')):
# On 'start' events, remember one attribute of each enclosing element.
if event == 'start' and elem.tag == 'region':
str1 = elem.attrib['name']
elif event == 'start' and elem.tag == 'AgSupplySector':
str2 = elem.attrib['name']
elif event == 'start' and elem.tag == 'AgProductionTechnology':
str3 = elem.attrib['name']
elif event == 'start' and elem.tag == 'period':
str4 = elem.attrib['year']
elif event == 'start' and elem.tag == 'Non-CO2':
str5 = elem.attrib['name']
elif event == 'end' and elem.tag == 'input-emissions':
# elem is already an <input-emissions> element in this branch, and
# Element.iter() also yields the element it is called on.
for em in elem.iter('input-emissions'):
str6 = em.text
# Row order: Year, Gas, Value, Technology, Crop, Country.
bioWriter.writerow([str4, str5, str6, str3, str2, str1])
# Drop the element's children, text and attributes.
elem.clear()
我的问题是 str6 有更多带有空字段的额外行。可能,我在这里有嵌套问题。请帮忙。
错误示例(出现 0 个字段):
for em in elem.iter('input-emissions') 这个循环是多余的,去掉它即可。
import os
import xml.etree.ElementTree as etree
import csv
# Stream the XML with iterparse and write one CSV row per <input-emissions>
# element, labelled with the attributes of its enclosing elements.
PATH = '.'
FILENAME_BIO = 'Test.csv'
FILENAME_XML = 'all_aglu_emissions.xml'

pathBIO = os.path.join(PATH, FILENAME_BIO)
pathXML = os.path.join(PATH, FILENAME_XML)

# newline='' lets the csv module control the line endings itself.
with open(pathBIO, 'w', encoding='utf8', newline='') as bioFH:
    bioWriter = csv.writer(bioFH, quoting=csv.QUOTE_MINIMAL)
    bioWriter.writerow('Year Gas Value Technology Crop Country'.split())

    # NOTE: both 'start' and 'end' events are handled on purpose. iterparse
    # only guarantees that an element's attributes are populated when its
    # 'start' event is emitted; the text and children are guaranteed only
    # once the 'end' event has been emitted.
    for event, elem in etree.iterparse(pathXML, events=('start', 'end')):
        if event == 'start':
            # Remember the attributes of the enclosing elements as they open.
            if elem.tag == 'region':
                str1 = elem.attrib['name']
            elif elem.tag == 'AgSupplySector':
                str2 = elem.attrib['name']
            elif elem.tag == 'AgProductionTechnology':
                str3 = elem.attrib['name']
            elif elem.tag == 'period':
                str4 = elem.attrib['year']
            elif elem.tag == 'Non-CO2':
                str5 = elem.attrib['name']
        else:
            if elem.tag == 'input-emissions':
                # The element is complete here, so its text is reliable.
                str6 = elem.text
                bioWriter.writerow([str4, str5, str6, str3, str2, str1])
            # The element has been fully parsed and everything needed from it
            # was captured at 'start', so its content can be released.
            elem.clear()
我还对代码做了其他一些细微的改动,因为我假设您使用的是 Python 3。这些改动包括:使用 xml.etree.ElementTree 而不是已弃用的 xml.etree.cElementTree;不再使用 codecs 模块(Python 3 内置的 open() 原生支持指定编码);以及向 open() 调用传递 newline='' 参数,以便 csv 模块自行正确处理换行符。
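对于这个小示例,只监听 start 事件就足以达到预期效果,因此不必处理 end 事件。但请注意:ElementTree 文档说明,iterparse 在发出 start 事件时只保证元素的属性已经就绪,text 和子元素此时可能尚未解析完成;处理大文件时,应在 end 事件中读取 elem.text 并调用 elem.clear()。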
结果是
Year,Gas,Value,Technology,Crop,Country
1975,SO2_1_AWB,3.98749e-05,Corn_NelsonR,Corn,USA
1975,NOx_AWB,0.000285263,Corn_NelsonR,Corn,USA