使用 ElementTree 从 xml 标签获取所有 children
Get all the children from xml tag using ElementTree
我正在尝试使用 ElementTree 解析 XML 文件,有时我只得到第一个 child 而不是标签内的所有 children - 以下是我的 XML结构:-
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<sentences>
<sentence id="2339">
<text>I charge it at night and skip taking the cord with me because of the good battery life.</text>
<aspectTerms>
<aspectTerm term="cord" polarity="neutral" from="41" to="45"/>
<aspectTerm term="battery life" polarity="positive" from="74" to="86"/>
</aspectTerms>
</sentence>
<sentence id="812">
<text>I bought a HP Pavilion DV4-1222nr laptop and have had so many problems with the computer.</text>
</sentence>
<sentence id="1316">
<text>The tech guy then said the service center does not do 1-to-1 exchange and I have to direct my concern to the "sales" team, which is the retail shop which I bought my netbook from.</text>
<aspectTerms>
<aspectTerm term="service center" polarity="negative" from="27" to="41"/>
<aspectTerm term="&quot;sales&quot; team" polarity="negative" from="109" to="121"/>
<aspectTerm term="tech guy" polarity="neutral" from="4" to="12"/>
</aspectTerms>
</sentence>
</sentences>
我想在每个 'aspectTerm' 标签中得到 'term'。以下是我的代码:-
import xml.etree.ElementTree as ET
# Parse the training file and take the root <sentences> element.
tree = ET.parse('Laptops_Train.xml')
root = tree.getroot()
# NOTE(review): `pd` is not imported in this snippet (presumably
# `import pandas as pd`); `df` is not used anywhere below.
df = pd.DataFrame()


def getAspect(sentences):
    """Return the ``term`` of the first <aspectTerm> of the current sentence.

    NOTE(review): the ``sentences`` parameter is never read. The body uses
    the module-level name ``sentence`` that the loop at the bottom of the
    script binds before each call.

    Returns the ``term`` attribute of the first <aspectTerm> child of the
    first <aspectTerms> element, or ``None`` (implicitly) when the sentence
    contains no <aspectTerms> element.
    """
    reviewList = []
    text = sentence.find('text').text
    reviewList.append(text)  # built but never returned or used
    for aspectTerms in sentence.iter('aspectTerms'):
        #for aspectTerm in aspectTerms.iter('aspectTerm'):
        # find() yields only the FIRST matching child, so any further
        # <aspectTerm> siblings are never looked at.
        aspect = aspectTerms.find('aspectTerm').get('term')
        print(aspect)
        # NOTE(review): indentation was lost in the paste; the return is
        # placed inside the loop because that matches the reported output.
        return aspect


# Collect one result per <sentence>. The loop variable `sentence` is a
# module-level name, which is what getAspect actually reads.
aspectList = []
for sentences in root.iter('sentences'):
    for sentence in sentences.iter('sentence'):
        aspectList.append(getAspect(sentence))
实际结果:
cord
<class 'NoneType'>
service center
预期结果:
[cord, battery life]
[]
[service center,"sales" team, tech guy]
提前致谢
使用具有 xpath 的 lxml 库更容易做到这一点。
>>> from lxml import etree
>>> tree = etree.parse('Laptops_Train.xml')
>>> # For every <aspectTerms> element, select the term attribute of each <aspectTerm> child.
>>> for aspectTerms in tree.xpath('.//aspectTerms'):
...     aspectTerms.xpath('aspectTerm/@term')
...
['cord', 'battery life']
['service center', '"sales" team', 'tech guy']
还要注意,所有 aspectTerm 元素都有一个 term 属性;不存在会产生 None 的空元素。
编辑,受评论启发。
>>> from lxml import etree
>>> tree = etree.parse('Laptops_Train.xml')
>>> # Iterate per <sentence> so that a sentence without aspect terms yields an empty list.
>>> for sentence in tree.xpath('.//sentence'):
...     sentence.xpath('.//aspectTerm/@term')
...
['cord', 'battery life']
[]
['service center', '"sales" team', 'tech guy']
所以解决方案是使用“.findall”而不是 .find。因为 '.findall' 选择了所有 children。我的解决方法如下:-
def getAspect(sentences):
    """Return the ``term`` attribute of every <aspectTerm> in one sentence.

    Args:
        sentences: A single <sentence> element. The plural parameter name
            is kept only for backward compatibility with existing callers.

    Returns:
        A list with the ``term`` value of each <aspectTerm> found under
        every <aspectTerms> descendant, in document order. The list is
        empty when the sentence has no aspect terms.
    """
    # Use the argument itself rather than a module-level ``sentence``
    # variable, so the function works no matter how the caller names its
    # loop variable. findall() returns all matching children, whereas
    # find() would return only the first one.
    aspectList = [
        aspectElem.get('term')
        for aspectTerms in sentences.iter('aspectTerms')
        for aspectElem in aspectTerms.findall('aspectTerm')
    ]
    # Echo each term, as the original snippet did.
    for term in aspectList:
        print(term)
    return aspectList
# Append whatever getAspect returns for each <sentence>, one entry per
# sentence. `sentences` and `sentence` are bound here as module-level names.
aspectList = []
for sentences in root.iter('sentences'):
    for sentence in sentences.iter('sentence'):
        aspectList.append(getAspect(sentence))
我正在尝试使用 ElementTree 解析 XML 文件,有时我只得到第一个 child 而不是标签内的所有 children - 以下是我的 XML结构:-
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<sentences>
<sentence id="2339">
<text>I charge it at night and skip taking the cord with me because of the good battery life.</text>
<aspectTerms>
<aspectTerm term="cord" polarity="neutral" from="41" to="45"/>
<aspectTerm term="battery life" polarity="positive" from="74" to="86"/>
</aspectTerms>
</sentence>
<sentence id="812">
<text>I bought a HP Pavilion DV4-1222nr laptop and have had so many problems with the computer.</text>
</sentence>
<sentence id="1316">
<text>The tech guy then said the service center does not do 1-to-1 exchange and I have to direct my concern to the "sales" team, which is the retail shop which I bought my netbook from.</text>
<aspectTerms>
<aspectTerm term="service center" polarity="negative" from="27" to="41"/>
<aspectTerm term="&quot;sales&quot; team" polarity="negative" from="109" to="121"/>
<aspectTerm term="tech guy" polarity="neutral" from="4" to="12"/>
</aspectTerms>
</sentence>
</sentences>
我想在每个 'aspectTerm' 标签中得到 'term'。以下是我的代码:-
import xml.etree.ElementTree as ET
# Parse the training file and take the root <sentences> element.
tree = ET.parse('Laptops_Train.xml')
root = tree.getroot()
# NOTE(review): `pd` is not imported in this snippet (presumably
# `import pandas as pd`); `df` is not used anywhere below.
df = pd.DataFrame()


def getAspect(sentences):
    """Return the ``term`` of the first <aspectTerm> of the current sentence.

    NOTE(review): the ``sentences`` parameter is never read. The body uses
    the module-level name ``sentence`` that the loop at the bottom of the
    script binds before each call.

    Returns the ``term`` attribute of the first <aspectTerm> child of the
    first <aspectTerms> element, or ``None`` (implicitly) when the sentence
    contains no <aspectTerms> element.
    """
    reviewList = []
    text = sentence.find('text').text
    reviewList.append(text)  # built but never returned or used
    for aspectTerms in sentence.iter('aspectTerms'):
        #for aspectTerm in aspectTerms.iter('aspectTerm'):
        # find() yields only the FIRST matching child, so any further
        # <aspectTerm> siblings are never looked at.
        aspect = aspectTerms.find('aspectTerm').get('term')
        print(aspect)
        # NOTE(review): indentation was lost in the paste; the return is
        # placed inside the loop because that matches the reported output.
        return aspect


# Collect one result per <sentence>. The loop variable `sentence` is a
# module-level name, which is what getAspect actually reads.
aspectList = []
for sentences in root.iter('sentences'):
    for sentence in sentences.iter('sentence'):
        aspectList.append(getAspect(sentence))
实际结果:
cord
<class 'NoneType'>
service center
预期结果:
[cord, battery life]
[]
[service center,"sales" team, tech guy]
提前致谢
使用具有 xpath 的 lxml 库更容易做到这一点。
>>> from lxml import etree
>>> tree = etree.parse('Laptops_Train.xml')
>>> # For every <aspectTerms> element, select the term attribute of each <aspectTerm> child.
>>> for aspectTerms in tree.xpath('.//aspectTerms'):
...     aspectTerms.xpath('aspectTerm/@term')
...
['cord', 'battery life']
['service center', '"sales" team', 'tech guy']
还要注意,所有 aspectTerm 元素都有一个 term 属性;不存在会产生 None 的空元素。
编辑,受评论启发。
>>> from lxml import etree
>>> tree = etree.parse('Laptops_Train.xml')
>>> # Iterate per <sentence> so that a sentence without aspect terms yields an empty list.
>>> for sentence in tree.xpath('.//sentence'):
...     sentence.xpath('.//aspectTerm/@term')
...
['cord', 'battery life']
[]
['service center', '"sales" team', 'tech guy']
所以解决方案是使用“.findall”而不是 .find。因为 '.findall' 选择了所有 children。我的解决方法如下:-
def getAspect(sentences):
    """Return the ``term`` attribute of every <aspectTerm> in one sentence.

    Args:
        sentences: A single <sentence> element. The plural parameter name
            is kept only for backward compatibility with existing callers.

    Returns:
        A list with the ``term`` value of each <aspectTerm> found under
        every <aspectTerms> descendant, in document order. The list is
        empty when the sentence has no aspect terms.
    """
    # Use the argument itself rather than a module-level ``sentence``
    # variable, so the function works no matter how the caller names its
    # loop variable. findall() returns all matching children, whereas
    # find() would return only the first one.
    aspectList = [
        aspectElem.get('term')
        for aspectTerms in sentences.iter('aspectTerms')
        for aspectElem in aspectTerms.findall('aspectTerm')
    ]
    # Echo each term, as the original snippet did.
    for term in aspectList:
        print(term)
    return aspectList
# Append whatever getAspect returns for each <sentence>, one entry per
# sentence. `sentences` and `sentence` are bound here as module-level names.
aspectList = []
for sentences in root.iter('sentences'):
    for sentence in sentences.iter('sentence'):
        aspectList.append(getAspect(sentence))