使用 Python 的 iterparse 解析大型 XML
iterparse large XML using python
这个问题已经让我抓狂了一整天,如果有人能帮我解析一个大型 XML 文件,我将不胜感激...
文件超过 900,000 行,并以 gzip 格式下载。我已经用一小段数据摘录配合 minidom 做过测试,解析是可行的,但这种方式根本应付不了完整的文件,所以我开始研究 iterparse。然而我找到的示例没有一个能正常运行,有些甚至在导入时就报错....我唯一能成功导入的是 import xml.etree.cElementTree,但它似乎与我找到的大多数代码示例都不兼容。
我目前最接近可行的做法是结合使用 iterparse 和 cElementTree:
def buildit(file):
    """Stream-parse an XML file and print the tag of every element.

    The path is printed first. Each element's tag is then printed as the
    parser finishes it, and the tag of every ``Journey`` element is printed
    a second time before that element is cleared.

    Args:
        file: A filename or file object accepted by ``et.iterparse``.
    """
    print(file)
    # A single pass is enough: iterparse opens and reads the source itself,
    # so no separate open() is needed, and nesting a second iterparse() in
    # the loop would re-read the whole file once per element.
    for event, elem in et.iterparse(file):
        print(elem.tag)
        # When the document declares a default xmlns, tags are reported as
        # '{namespace-uri}Journey', so compare the local part only.
        if event == 'end' and elem.tag.split('}')[-1] == 'Journey':
            print(elem.tag)
            # Drop the finished element's children and attributes so they
            # do not pile up while the rest of the file is parsed.
            elem.clear()
<?xml version="1.0" encoding="utf-8"?>
<PportTimetable xmlns:xsd="" xmlns:xsi="" timetableID="20160421020832" xmlns="">
<Journey rid="201604211191598" uid="G61365" trainId="1T02" ssd="2016-04-21" toc="SR" trainCat="XX">
<OR tpl="PERTH" act="TBK " plat="3" ptd="05:18" wtd="05:18" />
<PP tpl="HILTONJ" wtp="05:22" />
<IP tpl="GLNEGLS" act="T " plat="1" pta="05:33" ptd="05:33" wta="05:32:30" wtd="05:33:30" />
<PP tpl="BLFD" wtp="05:37:30" />
<IP tpl="DUNANE" act="T " plat="1" pta="05:45" ptd="05:46" wta="05:45" wtd="05:46" />
<IP tpl="BGOALAN" act="T " plat="1" pta="05:49" ptd="05:49" wta="05:49" wtd="05:49:30" />
<IP tpl="STIRLNG" act="T K " plat="3" pta="05:53" ptd="05:54" wta="05:53" wtd="05:54" />
<IP tpl="LARBERT" act="T " plat="1" pta="06:03" ptd="06:03" wta="06:02:30" wtd="06:03" />
<PP tpl="LARBERJ" wtp="06:04:30" />
<PP tpl="CRMRSWJ" wtp="06:05" />
<PP tpl="GNHLLJN" wtp="06:09" />
<OPIP tpl="CMBRNLD" act="C N " plat="1" wta="06:22" wtd="06:24" />
<PP tpl="GRNQNNJ" wtp="06:30" />
<PP tpl="GSHRSJN" wtp="06:33" />
<PP tpl="COATBDC" wtp="06:36:30" />
<PP tpl="LGLNJN" wtp="06:38" />
<PP tpl="CARMYLE" plat="1" wtp="06:49" />
<PP tpl="RTHGNEJ" wtp="06:53:30" />
<PP tpl="SHFD" wtp="06:56" />
<PP tpl="LRKFLDJ" wtp="06:59" />
<PP tpl="EGLNSTJ" wtp="07:01:30" />
<PP tpl="GLGCBSJ" wtp="07:02:30" />
<DT tpl="GLGC" act="TF" pta="07:05" wta="07:05" />
</Journey>
<Journey rid="201604211192476" uid="G64015" trainId="2N41" ssd="2016-04-21" toc="SR">
<OR tpl="GLGQLL" act="TB" plat="8" ptd="06:20" wtd="06:20" />
<PP tpl="FNSTNEJ" wtp="06:23:30" />
<PP tpl="HYNDLEJ" wtp="06:28:30" />
<OPIP tpl="ANSL" act="A N " plat="2" wta="06:30" wtd="06:30:30" />
<PP tpl="MRYHILL" wtp="06:33" />
<PP tpl="CWLRSNJ" wtp="06:48" />
<PP tpl="CWLRSEJ" wtp="06:49" />
<IP tpl="BSHB" act="T " plat="1" pta="06:52" ptd="06:54" wta="06:52" wtd="06:54" />
<IP tpl="LENZIE" act="T " plat="1" pta="06:59" ptd="06:59" wta="06:58:30" wtd="06:59:30" />
<IP tpl="CROY" act="T " plat="1" pta="07:06" ptd="07:06" wta="07:05:30" wtd="07:06:30" />
<PP tpl="GNHLUJN" wtp="07:12:30" />
<PP tpl="GNHLLJN" wtp="07:15" />
<PP tpl="CRMRSWJ" wtp="07:17" />
<PP tpl="LARBERJ" wtp="07:19:30" />
<IP tpl="LARBERT" act="T " plat="2" pta="07:21" ptd="07:21" wta="07:20:30" wtd="07:21" />
<IP tpl="STIRLNG" act="T " plat="6" pta="07:30" ptd="07:41" wta="07:29:30" wtd="07:41" />
<IP tpl="BGOALAN" act="T " plat="2" pta="07:45" ptd="07:45" wta="07:45" wtd="07:45:30" />
<DT tpl="DUNANE" act="TF" plat="DPV" pta="07:52" wta="07:52" />
</Journey>
</PportTimetable>
这是一个可以运行的程序,演示如何使用 cElementTree 中的 .iterparse(),并将结果存储在数据库中。请注意,此程序假定输入 XML 与问题中给出的示例 XML 格式相同。
# Tested on Python 2.6.7, Ubuntu 14.04.4
import xml.etree.cElementTree as et
import sqlite3
# Tools to deal with namespaces
ixid_uri = ''
def extract_local_tag(qname):
    """Return the local part of an ElementTree tag name.

    ElementTree reports namespaced tags as ``'{namespace-uri}local'``.
    Everything up to and including the last ``'}'`` is dropped; a tag with
    no namespace is returned unchanged.

    Args:
        qname: Tag string, e.g. ``elem.tag``.

    Returns:
        The text after the last ``'}'``, or ``qname`` itself if there is none.
    """
    return qname.split('}')[-1]
# A db connection to illustrate the example
conn = sqlite3.connect(":memory:")
c = conn.cursor()
c.execute("create table foo (joury_uid text, tag text, tpl text)")

# The main part of the code: iterate over the XML,
# storing DB stuff at the end of every <Journey>.
# Binary mode lets the parser apply the encoding declared in the XML prolog.
with open('i.xml', 'rb') as xml_file:
    # iterparse yields only 'end' events by default, so each element
    # arrives here complete, with all of its children attached.
    for event, elem in et.iterparse(xml_file):
        # Compare on the local name: elem.tag is '{namespace-uri}Journey'
        # whenever the document declares a default xmlns, and a QName built
        # from an empty URI ('{}Journey') would never match it.
        if extract_local_tag(elem.tag) == 'Journey':
            # One row per child element:
            # (Journey uid, child local tag, child tpl attribute or None).
            c.executemany('insert into foo values(?, ?, ?)',
                          [(elem.attrib['uid'],
                            extract_local_tag(child.tag),
                            child.attrib.get('tpl', None))
                           for child in elem])
            # Note: only clears <Journey> elements and their children.
            # There is a memory leak of any elements not children of <Journey>
            elem.clear()

# Dump what was stored; print(row) gives the same output on Python 2 and 3.
for row in c.execute('select * from foo'):
    print(row)
(u'G61365', u'OR', u'PERTH')
(u'G61365', u'PP', u'HILTONJ')
(u'G61365', u'DT', u'GLGC')
(u'G64015', u'OR', u'GLGQLL')
(u'G64015', u'PP', u'FNSTNEJ')
这个问题已经让我抓狂了一整天,如果有人能帮我解析一个大型 XML 文件,我将不胜感激...
文件超过 900,000 行,并以 gzip 格式下载。我已经用一小段数据摘录配合 minidom 做过测试,解析是可行的,但这种方式根本应付不了完整的文件,所以我开始研究 iterparse。然而我找到的示例没有一个能正常运行,有些甚至在导入时就报错....我唯一能成功导入的是 import xml.etree.cElementTree,但它似乎与我找到的大多数代码示例都不兼容。
我目前最接近可行的做法是结合使用 iterparse 和 cElementTree:
def buildit(file):
    """Stream-parse an XML file and print the tag of every element.

    The path is printed first. Each element's tag is then printed as the
    parser finishes it, and the tag of every ``Journey`` element is printed
    a second time before that element is cleared.

    Args:
        file: A filename or file object accepted by ``et.iterparse``.
    """
    print(file)
    # A single pass is enough: iterparse opens and reads the source itself,
    # so no separate open() is needed, and nesting a second iterparse() in
    # the loop would re-read the whole file once per element.
    for event, elem in et.iterparse(file):
        print(elem.tag)
        # When the document declares a default xmlns, tags are reported as
        # '{namespace-uri}Journey', so compare the local part only.
        if event == 'end' and elem.tag.split('}')[-1] == 'Journey':
            print(elem.tag)
            # Drop the finished element's children and attributes so they
            # do not pile up while the rest of the file is parsed.
            elem.clear()
<?xml version="1.0" encoding="utf-8"?>
<PportTimetable xmlns:xsd="" xmlns:xsi="" timetableID="20160421020832" xmlns="">
<Journey rid="201604211191598" uid="G61365" trainId="1T02" ssd="2016-04-21" toc="SR" trainCat="XX">
<OR tpl="PERTH" act="TBK " plat="3" ptd="05:18" wtd="05:18" />
<PP tpl="HILTONJ" wtp="05:22" />
<IP tpl="GLNEGLS" act="T " plat="1" pta="05:33" ptd="05:33" wta="05:32:30" wtd="05:33:30" />
<PP tpl="BLFD" wtp="05:37:30" />
<IP tpl="DUNANE" act="T " plat="1" pta="05:45" ptd="05:46" wta="05:45" wtd="05:46" />
<IP tpl="BGOALAN" act="T " plat="1" pta="05:49" ptd="05:49" wta="05:49" wtd="05:49:30" />
<IP tpl="STIRLNG" act="T K " plat="3" pta="05:53" ptd="05:54" wta="05:53" wtd="05:54" />
<IP tpl="LARBERT" act="T " plat="1" pta="06:03" ptd="06:03" wta="06:02:30" wtd="06:03" />
<PP tpl="LARBERJ" wtp="06:04:30" />
<PP tpl="CRMRSWJ" wtp="06:05" />
<PP tpl="GNHLLJN" wtp="06:09" />
<OPIP tpl="CMBRNLD" act="C N " plat="1" wta="06:22" wtd="06:24" />
<PP tpl="GRNQNNJ" wtp="06:30" />
<PP tpl="GSHRSJN" wtp="06:33" />
<PP tpl="COATBDC" wtp="06:36:30" />
<PP tpl="LGLNJN" wtp="06:38" />
<PP tpl="CARMYLE" plat="1" wtp="06:49" />
<PP tpl="RTHGNEJ" wtp="06:53:30" />
<PP tpl="SHFD" wtp="06:56" />
<PP tpl="LRKFLDJ" wtp="06:59" />
<PP tpl="EGLNSTJ" wtp="07:01:30" />
<PP tpl="GLGCBSJ" wtp="07:02:30" />
<DT tpl="GLGC" act="TF" pta="07:05" wta="07:05" />
</Journey>
<Journey rid="201604211192476" uid="G64015" trainId="2N41" ssd="2016-04-21" toc="SR">
<OR tpl="GLGQLL" act="TB" plat="8" ptd="06:20" wtd="06:20" />
<PP tpl="FNSTNEJ" wtp="06:23:30" />
<PP tpl="HYNDLEJ" wtp="06:28:30" />
<OPIP tpl="ANSL" act="A N " plat="2" wta="06:30" wtd="06:30:30" />
<PP tpl="MRYHILL" wtp="06:33" />
<PP tpl="CWLRSNJ" wtp="06:48" />
<PP tpl="CWLRSEJ" wtp="06:49" />
<IP tpl="BSHB" act="T " plat="1" pta="06:52" ptd="06:54" wta="06:52" wtd="06:54" />
<IP tpl="LENZIE" act="T " plat="1" pta="06:59" ptd="06:59" wta="06:58:30" wtd="06:59:30" />
<IP tpl="CROY" act="T " plat="1" pta="07:06" ptd="07:06" wta="07:05:30" wtd="07:06:30" />
<PP tpl="GNHLUJN" wtp="07:12:30" />
<PP tpl="GNHLLJN" wtp="07:15" />
<PP tpl="CRMRSWJ" wtp="07:17" />
<PP tpl="LARBERJ" wtp="07:19:30" />
<IP tpl="LARBERT" act="T " plat="2" pta="07:21" ptd="07:21" wta="07:20:30" wtd="07:21" />
<IP tpl="STIRLNG" act="T " plat="6" pta="07:30" ptd="07:41" wta="07:29:30" wtd="07:41" />
<IP tpl="BGOALAN" act="T " plat="2" pta="07:45" ptd="07:45" wta="07:45" wtd="07:45:30" />
<DT tpl="DUNANE" act="TF" plat="DPV" pta="07:52" wta="07:52" />
</Journey>
</PportTimetable>
这是一个可以运行的程序,演示如何使用 cElementTree 中的 .iterparse(),并将结果存储在数据库中。请注意,此程序假定输入 XML 与问题中给出的示例 XML 格式相同。
# Tested on Python 2.6.7, Ubuntu 14.04.4
import xml.etree.cElementTree as et
import sqlite3
# Tools to deal with namespaces
ixid_uri = ''
def extract_local_tag(qname):
    """Return the local part of an ElementTree tag name.

    ElementTree reports namespaced tags as ``'{namespace-uri}local'``.
    Everything up to and including the last ``'}'`` is dropped; a tag with
    no namespace is returned unchanged.

    Args:
        qname: Tag string, e.g. ``elem.tag``.

    Returns:
        The text after the last ``'}'``, or ``qname`` itself if there is none.
    """
    return qname.split('}')[-1]
# A db connection to illustrate the example
conn = sqlite3.connect(":memory:")
c = conn.cursor()
c.execute("create table foo (joury_uid text, tag text, tpl text)")

# The main part of the code: iterate over the XML,
# storing DB stuff at the end of every <Journey>.
# Binary mode lets the parser apply the encoding declared in the XML prolog.
with open('i.xml', 'rb') as xml_file:
    # iterparse yields only 'end' events by default, so each element
    # arrives here complete, with all of its children attached.
    for event, elem in et.iterparse(xml_file):
        # Compare on the local name: elem.tag is '{namespace-uri}Journey'
        # whenever the document declares a default xmlns, and a QName built
        # from an empty URI ('{}Journey') would never match it.
        if extract_local_tag(elem.tag) == 'Journey':
            # One row per child element:
            # (Journey uid, child local tag, child tpl attribute or None).
            c.executemany('insert into foo values(?, ?, ?)',
                          [(elem.attrib['uid'],
                            extract_local_tag(child.tag),
                            child.attrib.get('tpl', None))
                           for child in elem])
            # Note: only clears <Journey> elements and their children.
            # There is a memory leak of any elements not children of <Journey>
            elem.clear()

# Dump what was stored; print(row) gives the same output on Python 2 and 3.
for row in c.execute('select * from foo'):
    print(row)
(u'G61365', u'OR', u'PERTH')
(u'G61365', u'PP', u'HILTONJ')
(u'G61365', u'DT', u'GLGC')
(u'G64015', u'OR', u'GLGQLL')
(u'G64015', u'PP', u'FNSTNEJ')