使用 Python 解析复杂的 XML 文件
Parsing a complicated XML file with Python
我正在尝试用 Python 解析一个结构非常难看的 XML 文件。大部分层级我都能顺利解析,但到了 npdoc 元素就失败了。我做错了什么?
XML:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<npexchange xmlns="http://www.example.com/npexchange/3.5" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="3.5">
<article id="123" refType="Article">
<articleparts>
<articlepart id="1234" refType="ArticlePart">
<data>
<npdoc xmlns="http://www.example.com/npdoc/2.1" version="2.1" xml:lang="sv_SE">
<body>
<p>Lorem ipsum some random text here.</p>
<p>
<b>Yes this is HTML markup, and I would like to keep that.</b>
</p>
</body>
<headline>
<p>I am a headline</p>
</headline>
<leadin>
<p>I am some other text</p>
</leadin>
</npdoc>
</data>
</articlepart>
</articleparts>
</article>
</npexchange>
这是我目前的 python 代码:
from xml.etree.ElementTree import ElementTree
def parse(self):
    """Read the id, headline, lead-in and body of every article part.

    The tree is loaded from ``filename`` and the function returns
    ``articles``; neither name is defined inside this block, so both are
    expected to come from the surrounding code.

    Every lookup below uses a bare tag name with no namespace.
    """
    document = ElementTree(file=filename)

    for article in document.iter("article"):
        parts_container = article.find("articleparts")
        for part in parts_container.iter("articlepart"):
            payload = part.find("data")
            npdoc = payload.find("npdoc")

            article_id = article.get("id")

            # Each field is looked up and its .text read in turn.
            headline_node = npdoc.find("headline")
            headline = headline_node.text
            leadin_node = npdoc.find("leadin")
            lead_in = leadin_node.text
            body_node = npdoc.find("body")
            body = body_node.text

    return articles
我得到了 id,但是我无法访问 npdoc 元素中的字段。 npdoc 变量设置为 None.
更新:
通过在 .find() 调用中使用命名空间,我已经能把元素取到变量里了。但我该如何获取它的内容呢?由于内容是 HTML 标记,.text 属性无法返回完整的内容。
这是我在 Python 3.4 中想到的办法。它当然并非万无一失,但或许能给你一些思路。
import xml.etree.ElementTree as ET

# Walk npexchange -> article -> articleparts -> articlepart -> data -> npdoc
# and print each npdoc child (body, headline, leadin) with its inner markup.
tree = ET.parse(r'C:\Users\Gord\Desktop\nasty.xml')
npexchange = tree.getroot()

for article in npexchange:
    for articleparts in article:
        for articlepart in articleparts:
            # Named part_id rather than id so the builtin id() is not shadowed.
            part_id = articlepart.attrib['id']
            print("ArticlePart - id: {0}".format(part_id))
            for data in articlepart:
                for npdoc in data:
                    for child in npdoc:
                        # Tags look like "{namespace-uri}body"; keep only the
                        # local name after the closing brace.
                        tag = child.tag[child.tag.find('}')+1:]
                        print(" {0}:".format(tag))  # e.g., "body:"
                        contents = ET.tostring(child).decode('utf-8')
                        # Strip the "ns0:" prefixes and the xmlns:ns0
                        # declaration from the serialised markup.
                        contents = contents.replace('<ns0:', '<')
                        contents = contents.replace('</ns0:', '</')
                        contents = contents.replace(' xmlns:ns0="http://www.example.com/npdoc/2.1">', '>')
                        # Drop the wrapping <tag> ... </tag> so only the inner
                        # markup remains.
                        contents = contents.replace('<' + tag + '>\n', '')
                        contents = contents.replace('</' + tag + '>', '')
                        contents = contents.strip()
                        print(" {0}".format(contents))
控制台输出为
ArticlePart - id: 1234
body:
<p>Lorem ipsum some random text here.</p>
<p>
<b>Yes this is HTML markup, and I would like to keep that.</b>
</p>
headline:
<p>I am a headline</p>
leadin:
<p>I am some other text</p>
更新
一个稍微改进的版本,它包含:
- 命名空间映射(由 Charles 建议),
- 以空前缀调用 register_namespace,以去除一部分命名空间前缀带来的“噪音”,以及
- 使用 .findall(),而不是不考虑标签、盲目地遍历子节点:
import xml.etree.ElementTree as ET

npdoc_uri = 'http://www.example.com/npdoc/2.1'
nsmap = {
    'npexchange': 'http://www.example.com/npexchange/3.5',
    'npdoc': npdoc_uri
}
# Register the npdoc namespace under the empty prefix so that ET.tostring()
# writes it as a default namespace (xmlns="...") rather than with a prefix.
ET.register_namespace("", npdoc_uri)

tree = ET.parse(r'/home/gord/Desktop/nasty.xml')
npexchange = tree.getroot()

for article in npexchange.findall('npexchange:article', nsmap):
    for articleparts in article.findall('npexchange:articleparts', nsmap):
        for articlepart in articleparts.findall('npexchange:articlepart', nsmap):
            # Named part_id rather than id so the builtin id() is not shadowed.
            part_id = articlepart.attrib['id']
            print("ArticlePart - id: {0}".format(part_id))
            for data in articlepart.findall('npexchange:data', nsmap):
                for npdoc in data.findall('npdoc:npdoc', nsmap):
                    # Iterate the element directly: Element.getchildren() was
                    # removed in Python 3.9.
                    for child in npdoc:
                        # Tags look like "{namespace-uri}body"; keep only the
                        # local name after the closing brace.
                        tag = child.tag[child.tag.find('}')+1:]
                        print(" {0}:".format(tag))  # e.g., "body:"
                        contents = ET.tostring(child).decode('utf-8')
                        # remove HTML block tags, e.g. <body ...> and </body>
                        contents = contents.replace('<' + tag + ' xmlns="' + npdoc_uri + '">\n', '')
                        contents = contents.replace('</' + tag + '>', '')
                        contents = contents.strip()
                        print(" {0}".format(contents))
# <data> inherits the document's default namespace (npexchange/3.5); only
# <npdoc> and its descendants live in the npdoc/2.1 namespace. Map both
# prefixes and pick the one that matches the element being looked up.
nsmap = {
    'npexchange': 'http://www.example.com/npexchange/3.5',
    'npdoc': 'http://www.example.com/npdoc/2.1',
}
data = articlepart.find("npexchange:data", namespaces=nsmap)
...就能找到您的 data 元素。不需要丑陋、不可靠的字符串修改。(关于“不可靠”:想一想字符串替换会对包含字面尖括号的 CDATA 部分产生什么影响。)
我正在尝试用 Python 解析一个结构非常难看的 XML 文件。大部分层级我都能顺利解析,但到了 npdoc 元素就失败了。我做错了什么?
XML:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<npexchange xmlns="http://www.example.com/npexchange/3.5" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="3.5">
<article id="123" refType="Article">
<articleparts>
<articlepart id="1234" refType="ArticlePart">
<data>
<npdoc xmlns="http://www.example.com/npdoc/2.1" version="2.1" xml:lang="sv_SE">
<body>
<p>Lorem ipsum some random text here.</p>
<p>
<b>Yes this is HTML markup, and I would like to keep that.</b>
</p>
</body>
<headline>
<p>I am a headline</p>
</headline>
<leadin>
<p>I am some other text</p>
</leadin>
</npdoc>
</data>
</articlepart>
</articleparts>
</article>
</npexchange>
这是我目前的 python 代码:
from xml.etree.ElementTree import ElementTree
def parse(self):
    """Read the id, headline, lead-in and body of every article part.

    The tree is loaded from ``filename`` and the function returns
    ``articles``; neither name is defined inside this block, so both are
    expected to come from the surrounding code.

    Every lookup below uses a bare tag name with no namespace.
    """
    document = ElementTree(file=filename)

    for article in document.iter("article"):
        parts_container = article.find("articleparts")
        for part in parts_container.iter("articlepart"):
            payload = part.find("data")
            npdoc = payload.find("npdoc")

            article_id = article.get("id")

            # Each field is looked up and its .text read in turn.
            headline_node = npdoc.find("headline")
            headline = headline_node.text
            leadin_node = npdoc.find("leadin")
            lead_in = leadin_node.text
            body_node = npdoc.find("body")
            body = body_node.text

    return articles
我得到了 id,但是我无法访问 npdoc 元素中的字段。 npdoc 变量设置为 None.
更新: 通过在 .find() 调用中使用命名空间,我已经能把元素取到变量里了。但我该如何获取它的内容呢?由于内容是 HTML 标记,.text 属性无法返回完整的内容。
这是我在 Python 3.4 中想到的办法。它当然并非万无一失,但或许能给你一些思路。
import xml.etree.ElementTree as ET

# Walk npexchange -> article -> articleparts -> articlepart -> data -> npdoc
# and print each npdoc child (body, headline, leadin) with its inner markup.
tree = ET.parse(r'C:\Users\Gord\Desktop\nasty.xml')
npexchange = tree.getroot()

for article in npexchange:
    for articleparts in article:
        for articlepart in articleparts:
            # Named part_id rather than id so the builtin id() is not shadowed.
            part_id = articlepart.attrib['id']
            print("ArticlePart - id: {0}".format(part_id))
            for data in articlepart:
                for npdoc in data:
                    for child in npdoc:
                        # Tags look like "{namespace-uri}body"; keep only the
                        # local name after the closing brace.
                        tag = child.tag[child.tag.find('}')+1:]
                        print(" {0}:".format(tag))  # e.g., "body:"
                        contents = ET.tostring(child).decode('utf-8')
                        # Strip the "ns0:" prefixes and the xmlns:ns0
                        # declaration from the serialised markup.
                        contents = contents.replace('<ns0:', '<')
                        contents = contents.replace('</ns0:', '</')
                        contents = contents.replace(' xmlns:ns0="http://www.example.com/npdoc/2.1">', '>')
                        # Drop the wrapping <tag> ... </tag> so only the inner
                        # markup remains.
                        contents = contents.replace('<' + tag + '>\n', '')
                        contents = contents.replace('</' + tag + '>', '')
                        contents = contents.strip()
                        print(" {0}".format(contents))
控制台输出为
ArticlePart - id: 1234
body:
<p>Lorem ipsum some random text here.</p>
<p>
<b>Yes this is HTML markup, and I would like to keep that.</b>
</p>
headline:
<p>I am a headline</p>
leadin:
<p>I am some other text</p>
更新
一个稍微改进的版本,它包含:
- 命名空间映射(由 Charles 建议),
- 以空前缀调用 register_namespace,以去除一部分命名空间前缀带来的“噪音”,以及
- 使用 .findall(),而不是不考虑标签、盲目地遍历子节点:
import xml.etree.ElementTree as ET

npdoc_uri = 'http://www.example.com/npdoc/2.1'
nsmap = {
    'npexchange': 'http://www.example.com/npexchange/3.5',
    'npdoc': npdoc_uri
}
# Register the npdoc namespace under the empty prefix so that ET.tostring()
# writes it as a default namespace (xmlns="...") rather than with a prefix.
ET.register_namespace("", npdoc_uri)

tree = ET.parse(r'/home/gord/Desktop/nasty.xml')
npexchange = tree.getroot()

for article in npexchange.findall('npexchange:article', nsmap):
    for articleparts in article.findall('npexchange:articleparts', nsmap):
        for articlepart in articleparts.findall('npexchange:articlepart', nsmap):
            # Named part_id rather than id so the builtin id() is not shadowed.
            part_id = articlepart.attrib['id']
            print("ArticlePart - id: {0}".format(part_id))
            for data in articlepart.findall('npexchange:data', nsmap):
                for npdoc in data.findall('npdoc:npdoc', nsmap):
                    # Iterate the element directly: Element.getchildren() was
                    # removed in Python 3.9.
                    for child in npdoc:
                        # Tags look like "{namespace-uri}body"; keep only the
                        # local name after the closing brace.
                        tag = child.tag[child.tag.find('}')+1:]
                        print(" {0}:".format(tag))  # e.g., "body:"
                        contents = ET.tostring(child).decode('utf-8')
                        # remove HTML block tags, e.g. <body ...> and </body>
                        contents = contents.replace('<' + tag + ' xmlns="' + npdoc_uri + '">\n', '')
                        contents = contents.replace('</' + tag + '>', '')
                        contents = contents.strip()
                        print(" {0}".format(contents))
# <data> inherits the document's default namespace (npexchange/3.5); only
# <npdoc> and its descendants live in the npdoc/2.1 namespace. Map both
# prefixes and pick the one that matches the element being looked up.
nsmap = {
    'npexchange': 'http://www.example.com/npexchange/3.5',
    'npdoc': 'http://www.example.com/npdoc/2.1',
}
data = articlepart.find("npexchange:data", namespaces=nsmap)
...就能找到您的 data 元素。不需要丑陋、不可靠的字符串修改。(关于“不可靠”:想一想字符串替换会对包含字面尖括号的 CDATA 部分产生什么影响。)