Python ElementTree:使用 findall 按子元素中的类别筛选 XML 列表
Python ElementTree filter categories within children xml list with findall
我想用 Python 处理欧盟透明度登记册(European Transparency Register)的数据,但到目前为止遇到了一些麻烦。下面是一个大幅简化的 XML 版本(完整文件通常约有 160 万行)。
我想检索在 interests 中勾选了 Digital economy and society 类别的所有公司。
但是,我很难将 ElementTree 的 findall() 用于这种非常特殊的 XML 结构。下面是我的代码摘录,以及我目前能够得到的结果。最后的过滤器没有返回任何内容。
<?xml version='1.0' encoding='UTF-8'?>
<ListOfIRPublicDetail xmlns:ns2="http://www.w3.org/1999/xlink" xmlns="http://intragate.ec.europa.eu/transparencyregister/intws/20200626">
<metaData>
<exportDate>2021-01-21T12:20:00.122+01:00</exportDate>
<numberOfIR>12205</numberOfIR>
</metaData>
<resultList>
<interestRepresentative>
<name>
<originalName>F. Hoffmann-La Roche Ltd</originalName>
</name>
<interests>
<interest index="1">
<name>Budget</name>
</interest>
<interest index="2">
<name>Business and Industry</name>
</interest>
<interest index="3">
<name>Climate Action</name>
</interest>
<interest index="4">
<name>Competition</name>
</interest>
<interest index="5">
<name>Consumers</name>
</interest>
<interest index="6">
<name>Digital economy and society</name>
</interest>
<interest index="7">
<name>Economy, finance and the euro</name>
</interest>
<interest index="8">
<name>Environment</name>
</interest>
<interest index="9">
<name>External Relations</name>
</interest>
<interest index="10">
<name>Institutional affairs</name>
</interest>
<interest index="11">
<name>International co-operation and development</name>
</interest>
<interest index="12">
<name>Justice and Fundamental Rights</name>
</interest>
<interest index="13">
<name>Public Health</name>
</interest>
<interest index="14">
<name>Research and innovation</name>
</interest>
<interest index="15">
<name>Single market</name>
</interest>
<interest index="16">
<name>Trade</name>
</interest>
</interests>
</interestRepresentative>
<interestRepresentative>
<name>
<originalName>Nickel Institute</originalName>
</name>
<interests>
<interest index="1">
<name>Business and Industry</name>
</interest>
<interest index="2">
<name>Climate Action</name>
</interest>
<interest index="3">
<name>Consumers</name>
</interest>
<interest index="4">
<name>Economy, finance and the euro</name>
</interest>
<interest index="5">
<name>Employment and Social Affairs</name>
</interest>
<interest index="6">
<name>Energy</name>
</interest>
<interest index="7">
<name>Environment</name>
</interest>
<interest index="8">
<name>Food Safety</name>
</interest>
<interest index="9">
<name>Public Health</name>
</interest>
<interest index="10">
<name>Research and innovation</name>
</interest>
<interest index="11">
<name>Single market</name>
</interest>
<interest index="12">
<name>Trade</name>
</interest>
<interest index="13">
<name>Transport</name>
</interest>
</interests>
</interestRepresentative>
</resultList>
</ListOfIRPublicDetail>
# Load the register export. Abort with a readable message when the file
# cannot be opened or is not well-formed XML; any other error (e.g. a
# NameError from a missing REGISTER_XML) is left to surface normally.
try:
    register_tree = ET.parse(REGISTER_XML)
    register_root = register_tree.getroot()
except (OSError, ET.ParseError):
    sys.exit("""⚠️ Impossible d'ouvrir le registre. \n\n""")

# Get all companies name
# The path ends in "/", which selects every child of <resultList>, i.e. each
# <interestRepresentative>. "{*}" matches any namespace (Python 3.8+).
for representative in register_root.findall(".//{*}resultList/"):
    print(representative.find("{*}name/{*}originalName").text)
print('\n=============\n')

# Get categories of interest
for representative in register_root.findall(".//{*}resultList/"):
    for interest_name in representative.findall("{*}interests/{*}interest/{*}name"):
        print(interest_name.text)
    # Blank separator between two companies.
    print('\n')
print('\n=============\n')

# Filter categories
# NOTE: this path matches nothing. <interests> is a child of
# <interestRepresentative>, not of <resultList>, so the step
# "{*}resultList/{*}interests" never finds an element. Even with that level
# added, the result would be <interest> elements rather than companies.
for interest in register_root.findall(".//{*}resultList/{*}interests/{*}interest/[{*}name='Digital economy and society']"):
    print(interest)
F. Hoffmann-La Roche Ltd
Nickel Institute
=============
Budget
Business and Industry
Climate Action
Competition
Consumers
Digital economy and society
Economy, finance and the euro
Environment
External Relations
Institutional affairs
International co-operation and development
Justice and Fundamental Rights
Public Health
Research and innovation
Single market
Trade
Business and Industry
Climate Action
Consumers
Economy, finance and the euro
Employment and Social Affairs
Energy
Environment
Food Safety
Public Health
Research and innovation
Single market
Trade
Transport
=============
在这篇帖子(this post)的帮助下,可以这样做:
from xml.etree import ElementTree as ET
from io import StringIO
data = '''\
<?xml version='1.0' encoding='UTF-8'?>
<ListOfIRPublicDetail xmlns:ns2="http://www.w3.org/1999/xlink" xmlns="http://intragate.ec.europa.eu/transparencyregister/intws/20200626">
<metaData>
<exportDate>2021-01-21T12:20:00.122+01:00</exportDate>
<numberOfIR>12205</numberOfIR>
</metaData>
<resultList>
<interestRepresentative>
<name>
<originalName>F. Hoffmann-La Roche Ltd</originalName>
</name>
<interests>
<interest index="1">
<name>Budget</name>
</interest>
<interest index="2">
<name>Business and Industry</name>
</interest>
<interest index="3">
<name>Climate Action</name>
</interest>
<interest index="4">
<name>Competition</name>
</interest>
<interest index="5">
<name>Consumers</name>
</interest>
<interest index="6">
<name>Digital economy and society</name>
</interest>
<interest index="7">
<name>Economy, finance and the euro</name>
</interest>
<interest index="8">
<name>Environment</name>
</interest>
<interest index="9">
<name>External Relations</name>
</interest>
<interest index="10">
<name>Institutional affairs</name>
</interest>
<interest index="11">
<name>International co-operation and development</name>
</interest>
<interest index="12">
<name>Justice and Fundamental Rights</name>
</interest>
<interest index="13">
<name>Public Health</name>
</interest>
<interest index="14">
<name>Research and innovation</name>
</interest>
<interest index="15">
<name>Single market</name>
</interest>
<interest index="16">
<name>Trade</name>
</interest>
</interests>
</interestRepresentative>
<interestRepresentative>
<name>
<originalName>Nickel Institute</originalName>
</name>
<interests>
<interest index="1">
<name>Business and Industry</name>
</interest>
<interest index="2">
<name>Climate Action</name>
</interest>
<interest index="3">
<name>Consumers</name>
</interest>
<interest index="4">
<name>Economy, finance and the euro</name>
</interest>
<interest index="5">
<name>Employment and Social Affairs</name>
</interest>
<interest index="6">
<name>Energy</name>
</interest>
<interest index="7">
<name>Environment</name>
</interest>
<interest index="8">
<name>Food Safety</name>
</interest>
<interest index="9">
<name>Public Health</name>
</interest>
<interest index="10">
<name>Research and innovation</name>
</interest>
<interest index="11">
<name>Single market</name>
</interest>
<interest index="12">
<name>Trade</name>
</interest>
<interest index="13">
<name>Transport</name>
</interest>
</interests>
</interestRepresentative>
</resultList>
</ListOfIRPublicDetail>
'''
# Parse the sample document from memory; ET.parse() expects a file or a
# file-like object, hence the StringIO wrapper around the string.
f = StringIO(data)
tree = ET.parse(f)
# The document declares a default namespace, so every tag is qualified.
# Bind that namespace to a prefix that can be used inside the path.
ns = {'ns': 'http://intragate.ec.europa.eu/transparencyregister/intws/20200626'}
# 1. './/*[.="..."]' selects every element whose complete text equals the
#    category (here the <name> inside an <interest>); needs Python 3.7+.
# 2. '../../..' climbs name -> interest -> interests -> interestRepresentative.
# 3. 'ns:name/ns:originalName' descends to the company name of that entry.
for e in tree.findall('.//*[.="Digital economy and society"]../../../ns:name/ns:originalName', namespaces=ns):
    print(e.text)
我想用 Python 处理欧盟透明度登记册(European Transparency Register)的数据,但到目前为止遇到了一些麻烦。下面是一个大幅简化的 XML 版本(完整文件通常约有 160 万行)。
我想检索在 interests 中勾选了 Digital economy and society 类别的所有公司。
但是,我很难将 ElementTree 的 findall() 用于这种非常特殊的 XML 结构。下面是我的代码摘录,以及我目前能够得到的结果。最后的过滤器没有返回任何内容。
<?xml version='1.0' encoding='UTF-8'?>
<ListOfIRPublicDetail xmlns:ns2="http://www.w3.org/1999/xlink" xmlns="http://intragate.ec.europa.eu/transparencyregister/intws/20200626">
<metaData>
<exportDate>2021-01-21T12:20:00.122+01:00</exportDate>
<numberOfIR>12205</numberOfIR>
</metaData>
<resultList>
<interestRepresentative>
<name>
<originalName>F. Hoffmann-La Roche Ltd</originalName>
</name>
<interests>
<interest index="1">
<name>Budget</name>
</interest>
<interest index="2">
<name>Business and Industry</name>
</interest>
<interest index="3">
<name>Climate Action</name>
</interest>
<interest index="4">
<name>Competition</name>
</interest>
<interest index="5">
<name>Consumers</name>
</interest>
<interest index="6">
<name>Digital economy and society</name>
</interest>
<interest index="7">
<name>Economy, finance and the euro</name>
</interest>
<interest index="8">
<name>Environment</name>
</interest>
<interest index="9">
<name>External Relations</name>
</interest>
<interest index="10">
<name>Institutional affairs</name>
</interest>
<interest index="11">
<name>International co-operation and development</name>
</interest>
<interest index="12">
<name>Justice and Fundamental Rights</name>
</interest>
<interest index="13">
<name>Public Health</name>
</interest>
<interest index="14">
<name>Research and innovation</name>
</interest>
<interest index="15">
<name>Single market</name>
</interest>
<interest index="16">
<name>Trade</name>
</interest>
</interests>
</interestRepresentative>
<interestRepresentative>
<name>
<originalName>Nickel Institute</originalName>
</name>
<interests>
<interest index="1">
<name>Business and Industry</name>
</interest>
<interest index="2">
<name>Climate Action</name>
</interest>
<interest index="3">
<name>Consumers</name>
</interest>
<interest index="4">
<name>Economy, finance and the euro</name>
</interest>
<interest index="5">
<name>Employment and Social Affairs</name>
</interest>
<interest index="6">
<name>Energy</name>
</interest>
<interest index="7">
<name>Environment</name>
</interest>
<interest index="8">
<name>Food Safety</name>
</interest>
<interest index="9">
<name>Public Health</name>
</interest>
<interest index="10">
<name>Research and innovation</name>
</interest>
<interest index="11">
<name>Single market</name>
</interest>
<interest index="12">
<name>Trade</name>
</interest>
<interest index="13">
<name>Transport</name>
</interest>
</interests>
</interestRepresentative>
</resultList>
</ListOfIRPublicDetail>
# Load the register export. Abort with a readable message when the file
# cannot be opened or is not well-formed XML; any other error (e.g. a
# NameError from a missing REGISTER_XML) is left to surface normally.
try:
    register_tree = ET.parse(REGISTER_XML)
    register_root = register_tree.getroot()
except (OSError, ET.ParseError):
    sys.exit("""⚠️ Impossible d'ouvrir le registre. \n\n""")

# Get all companies name
# The path ends in "/", which selects every child of <resultList>, i.e. each
# <interestRepresentative>. "{*}" matches any namespace (Python 3.8+).
for representative in register_root.findall(".//{*}resultList/"):
    print(representative.find("{*}name/{*}originalName").text)
print('\n=============\n')

# Get categories of interest
for representative in register_root.findall(".//{*}resultList/"):
    for interest_name in representative.findall("{*}interests/{*}interest/{*}name"):
        print(interest_name.text)
    # Blank separator between two companies.
    print('\n')
print('\n=============\n')

# Filter categories
# NOTE: this path matches nothing. <interests> is a child of
# <interestRepresentative>, not of <resultList>, so the step
# "{*}resultList/{*}interests" never finds an element. Even with that level
# added, the result would be <interest> elements rather than companies.
for interest in register_root.findall(".//{*}resultList/{*}interests/{*}interest/[{*}name='Digital economy and society']"):
    print(interest)
F. Hoffmann-La Roche Ltd
Nickel Institute
=============
Budget
Business and Industry
Climate Action
Competition
Consumers
Digital economy and society
Economy, finance and the euro
Environment
External Relations
Institutional affairs
International co-operation and development
Justice and Fundamental Rights
Public Health
Research and innovation
Single market
Trade
Business and Industry
Climate Action
Consumers
Economy, finance and the euro
Employment and Social Affairs
Energy
Environment
Food Safety
Public Health
Research and innovation
Single market
Trade
Transport
=============
在这篇帖子(this post)的帮助下,可以这样做:
from xml.etree import ElementTree as ET
from io import StringIO
data = '''\
<?xml version='1.0' encoding='UTF-8'?>
<ListOfIRPublicDetail xmlns:ns2="http://www.w3.org/1999/xlink" xmlns="http://intragate.ec.europa.eu/transparencyregister/intws/20200626">
<metaData>
<exportDate>2021-01-21T12:20:00.122+01:00</exportDate>
<numberOfIR>12205</numberOfIR>
</metaData>
<resultList>
<interestRepresentative>
<name>
<originalName>F. Hoffmann-La Roche Ltd</originalName>
</name>
<interests>
<interest index="1">
<name>Budget</name>
</interest>
<interest index="2">
<name>Business and Industry</name>
</interest>
<interest index="3">
<name>Climate Action</name>
</interest>
<interest index="4">
<name>Competition</name>
</interest>
<interest index="5">
<name>Consumers</name>
</interest>
<interest index="6">
<name>Digital economy and society</name>
</interest>
<interest index="7">
<name>Economy, finance and the euro</name>
</interest>
<interest index="8">
<name>Environment</name>
</interest>
<interest index="9">
<name>External Relations</name>
</interest>
<interest index="10">
<name>Institutional affairs</name>
</interest>
<interest index="11">
<name>International co-operation and development</name>
</interest>
<interest index="12">
<name>Justice and Fundamental Rights</name>
</interest>
<interest index="13">
<name>Public Health</name>
</interest>
<interest index="14">
<name>Research and innovation</name>
</interest>
<interest index="15">
<name>Single market</name>
</interest>
<interest index="16">
<name>Trade</name>
</interest>
</interests>
</interestRepresentative>
<interestRepresentative>
<name>
<originalName>Nickel Institute</originalName>
</name>
<interests>
<interest index="1">
<name>Business and Industry</name>
</interest>
<interest index="2">
<name>Climate Action</name>
</interest>
<interest index="3">
<name>Consumers</name>
</interest>
<interest index="4">
<name>Economy, finance and the euro</name>
</interest>
<interest index="5">
<name>Employment and Social Affairs</name>
</interest>
<interest index="6">
<name>Energy</name>
</interest>
<interest index="7">
<name>Environment</name>
</interest>
<interest index="8">
<name>Food Safety</name>
</interest>
<interest index="9">
<name>Public Health</name>
</interest>
<interest index="10">
<name>Research and innovation</name>
</interest>
<interest index="11">
<name>Single market</name>
</interest>
<interest index="12">
<name>Trade</name>
</interest>
<interest index="13">
<name>Transport</name>
</interest>
</interests>
</interestRepresentative>
</resultList>
</ListOfIRPublicDetail>
'''
# Parse the sample document from memory; ET.parse() expects a file or a
# file-like object, hence the StringIO wrapper around the string.
f = StringIO(data)
tree = ET.parse(f)
# The document declares a default namespace, so every tag is qualified.
# Bind that namespace to a prefix that can be used inside the path.
ns = {'ns': 'http://intragate.ec.europa.eu/transparencyregister/intws/20200626'}
# 1. './/*[.="..."]' selects every element whose complete text equals the
#    category (here the <name> inside an <interest>); needs Python 3.7+.
# 2. '../../..' climbs name -> interest -> interests -> interestRepresentative.
# 3. 'ns:name/ns:originalName' descends to the company name of that entry.
for e in tree.findall('.//*[.="Digital economy and society"]../../../ns:name/ns:originalName', namespaces=ns):
    print(e.text)