如何使用 lxml 和 python 在特定的嵌套标签中查找文本?
How to find text in a specific nested tag with lxml and Python?
假设html来源如下:
<html><body>
<div class="aname">
<div class="bname">
<h5><a href="url_a0" class="cname">aTitle</a></h5>
</div>
<div class="">
<div><img src="url_a1"/>img_text<br /></div>
<div><strong>label_a1:</strong>text_a1<br /></div>
<div><strong>label_a2:</strong>text_a2<br /></div>
<div><strong>label_spe:<a href="url_a2">*</a>:</strong>
<span class="box-span" >spantext_a1</span>
<span class="box-span" >spantext_a2</span>
<span class="box-span" >spantext_a3</span><br />
</div>
</div>
</div>
<div class="aname">
<div class="bname">
<h5><a href="url_a0" class="cname">aTitle</a></h5>
</div>
<div class="">
<div><img src="url_a1"/>img_text<br /></div>
<div><strong>label_a1:</strong>text_b1<br /></div>
<div><strong>label_a2:</strong>text_b2<br /></div>
<div><strong>label_a3:</strong>text_b3<br /></div>
<div><strong>label_spe:<a href="url_b3">*</a>:</strong>
<span class="box-span" >spantext_b1</span>
<span class="box-span" >spantext_b2</span>
<span class="box-span" >spantext_b3</span>
<span class="box-span" >spantext_b4</span>
<span class="box-span" >spantext_b5</span>
<span class="box-span" >spantext_b6</span><br />
</div>
</div>
</div>
</body></html>
如果我想要输出是:
aTitle
url_a0
label_a1:
text_a1
label_a2:
text_a2
label_spe:
spantext_a1
spantext_a2
spantext_a3
aTitle
url_a0
label_a1:
text_b1
label_a2:
text_b2
label_a3:
text_b3
label_spe:
spantext_b1
spantext_b2
spantext_b3
spantext_b4
spantext_b5
spantext_b6
我想在 python 中使用 lxml!!请帮助我,我该怎么办?
由于 html 有多个 div,并且 span 的数量是可变的。
我已经尝试了很多次,但仍然无法获得正确的输出。
最后,我希望我能从这里得到一些有用的信息!!
我的代码如下:
# -*- coding:utf-8 -*-
import codecs
import lxml,re
import re
from lxml import etree
from lxml.html.clean import Cleaner
def main():
    """Placeholder entry point; the actual work is done at module level below."""
    pass


if __name__ == '__main__':
    main()

# Read the whole page, ignoring undecodable bytes. The ``with`` block makes
# sure the file handle is closed again.
with codecs.open('test.html', 'r', errors='ignore', encoding='utf-8') as ff:
    html0 = ff.read()

# Remove the <strong> wrappers and the box-span class attribute from the raw
# markup before it is parsed.
html1 = re.sub('<strong>', '', html0)
html2 = re.sub('</strong>', '', html1)
html = re.sub('class=\"box-span\"', '', html2)

# Eight-character prefixes that identify a "label:value" text node.
spelabels = ['img_text', 'label_a1', 'label_a2', 'label_a3']

root = lxml.html.fromstring(html)
contents = root.xpath('.//div[@class="aname"]/div[@class=""]/div/text()')
for content in contents:
    if content[0:8] in spelabels:
        # First eight characters are the label; the value starts after the
        # separator at index 8.
        print(content[0:8])
        print(content[9:])
    elif content == "label_spe:":
        print(content)
        # following-sibling::span only selects <span> elements that are
        # siblings of the matched <div>, not spans nested inside it.
        nestedcontents = root.xpath('.//div[@class="aname"]/div[@class=""]/div[text()="label_spe:"]/following-sibling::span/text()')
        print(nestedcontents)
        for nestedcontent in nestedcontents:
            # Fixed: the original printed the undefined name ``nestcontent``.
            print(nestedcontent)
输出:
img_text
label_a1
text_a1
label_a2
text_a2
label_spe:
[]
img_text
label_a1
text_b1
label_a2
text_b2
label_a3
text_b3
label_spe:
[]
似乎部分有效,但我不知道如何提取 url_a1。
span中的文字没有出现
这是我的尝试。它为您的样本输入提供所需的输出。我让它容忍某些标签更改,例如把 div 换成 span。
import xml.etree.ElementTree as etree  # or: from lxml import etree


def iter_fields(aname):
    """Yield the printable fields of one ``class="aname"`` element, in order.

    The sequence is: link title, link URL, then for every inner ``<div>``
    that contains a ``<strong>`` label, the label text followed by either
    the text directly after the label or, when there is none, the text of
    each ``class="box-span"`` child.

    Args:
        aname: the element whose ``class`` attribute is ``"aname"``.

    Yields:
        The field values as returned by ElementTree (``str`` or ``None``).
    """
    cname = aname.find('*[@class="bname"]//a[@class="cname"]')
    yield cname.text  # title
    yield cname.get('href')  # url
    for div in aname.iterfind('div[@class=""]/div'):
        strong = div.find('strong')
        if strong is None:
            # No label in this row (e.g. the image row): nothing to report.
            continue
        yield strong.text  # label
        # Use the label's own tail, and tolerate a missing tail (None) so a
        # label with nothing after it does not raise AttributeError.
        text = (strong.tail or '').strip()
        if text:
            yield text
        else:
            for box in div.iterfind('*[@class="box-span"]'):
                yield box.text


def main(path='test.html'):
    """Print the fields of every ``aname`` block in *path*.

    The file is read with an XML parser, so it must be well-formed. A blank
    line is printed after each block.
    """
    body = etree.parse(path).find('body')
    for aname in body.iterfind('*[@class="aname"]'):
        for field in iter_fields(aname):
            print(field)
        print('')


if __name__ == '__main__':
    main()
import re

# Text sitting directly between a '>' and the next '<': word characters
# optionally followed by colons (e.g. "aTitle", "label_a1:").
_TEXT_BETWEEN_TAGS = re.compile(r">(\w*:*)<")


def extract_texts(line):
    """Return the text fragments found between tags on one line of markup.

    Empty fragments (adjacent tags such as ``><``) and a lone ``":"`` are
    dropped.

    Args:
        line: one line of HTML text.

    Returns:
        A list of the matching fragments, in order of appearance.
    """
    return [text for text in _TEXT_BETWEEN_TAGS.findall(line)
            if text and text != ":"]


def main(path="input.txt"):
    """Print every fragment extracted from the file at *path*, one per line."""
    # ``with`` closes the file even if printing fails part-way through.
    with open(path, 'r') as file1:
        for lines in file1:
            for text in extract_texts(lines):
                print(text)


if __name__ == '__main__':
    main()
输出:
aTitle
img_text
label_a1:
text_a1
label_a2:
text_a2
label_spe:
spantext_a1
spantext_a2
spantext_a3
aTitle
img_text
label_a1:
text_b1
label_a2:
text_b2
label_a3:
text_b3
label_spe:
spantext_b1
spantext_b2
spantext_b3
spantext_b4
spantext_b5
spantext_b6
假设html来源如下:
<html><body>
<div class="aname">
<div class="bname">
<h5><a href="url_a0" class="cname">aTitle</a></h5>
</div>
<div class="">
<div><img src="url_a1"/>img_text<br /></div>
<div><strong>label_a1:</strong>text_a1<br /></div>
<div><strong>label_a2:</strong>text_a2<br /></div>
<div><strong>label_spe:<a href="url_a2">*</a>:</strong>
<span class="box-span" >spantext_a1</span>
<span class="box-span" >spantext_a2</span>
<span class="box-span" >spantext_a3</span><br />
</div>
</div>
</div>
<div class="aname">
<div class="bname">
<h5><a href="url_a0" class="cname">aTitle</a></h5>
</div>
<div class="">
<div><img src="url_a1"/>img_text<br /></div>
<div><strong>label_a1:</strong>text_b1<br /></div>
<div><strong>label_a2:</strong>text_b2<br /></div>
<div><strong>label_a3:</strong>text_b3<br /></div>
<div><strong>label_spe:<a href="url_b3">*</a>:</strong>
<span class="box-span" >spantext_b1</span>
<span class="box-span" >spantext_b2</span>
<span class="box-span" >spantext_b3</span>
<span class="box-span" >spantext_b4</span>
<span class="box-span" >spantext_b5</span>
<span class="box-span" >spantext_b6</span><br />
</div>
</div>
</div>
</body></html>
如果我想要输出是:
aTitle
url_a0
label_a1:
text_a1
label_a2:
text_a2
label_spe:
spantext_a1
spantext_a2
spantext_a3
aTitle
url_a0
label_a1:
text_b1
label_a2:
text_b2
label_a3:
text_b3
label_spe:
spantext_b1
spantext_b2
spantext_b3
spantext_b4
spantext_b5
spantext_b6
我想在 python 中使用 lxml!!请帮助我,我该怎么办? 由于 html 有多个 div,并且 span 的数量是可变的。 我已经尝试了很多次,但仍然无法获得正确的输出。 最后,我希望我能从这里得到一些有用的信息!! 我的代码如下:
# -*- coding:utf-8 -*-
import codecs
import lxml,re
import re
from lxml import etree
from lxml.html.clean import Cleaner
def main():
    """Placeholder entry point; the actual work is done at module level below."""
    pass


if __name__ == '__main__':
    main()

# Read the whole page, ignoring undecodable bytes. The ``with`` block makes
# sure the file handle is closed again.
with codecs.open('test.html', 'r', errors='ignore', encoding='utf-8') as ff:
    html0 = ff.read()

# Remove the <strong> wrappers and the box-span class attribute from the raw
# markup before it is parsed.
html1 = re.sub('<strong>', '', html0)
html2 = re.sub('</strong>', '', html1)
html = re.sub('class=\"box-span\"', '', html2)

# Eight-character prefixes that identify a "label:value" text node.
spelabels = ['img_text', 'label_a1', 'label_a2', 'label_a3']

root = lxml.html.fromstring(html)
contents = root.xpath('.//div[@class="aname"]/div[@class=""]/div/text()')
for content in contents:
    if content[0:8] in spelabels:
        # First eight characters are the label; the value starts after the
        # separator at index 8.
        print(content[0:8])
        print(content[9:])
    elif content == "label_spe:":
        print(content)
        # following-sibling::span only selects <span> elements that are
        # siblings of the matched <div>, not spans nested inside it.
        nestedcontents = root.xpath('.//div[@class="aname"]/div[@class=""]/div[text()="label_spe:"]/following-sibling::span/text()')
        print(nestedcontents)
        for nestedcontent in nestedcontents:
            # Fixed: the original printed the undefined name ``nestcontent``.
            print(nestedcontent)
输出:
img_text
label_a1
text_a1
label_a2
text_a2
label_spe:
[]
img_text
label_a1
text_b1
label_a2
text_b2
label_a3
text_b3
label_spe:
[]
似乎部分有效,但我不知道如何提取 url_a1。 span中的文字没有出现
这是我的尝试。它为您的样本输入提供所需的输出。我让它容忍某些标签更改,例如把 div 换成 span。
import xml.etree.ElementTree as etree  # or: from lxml import etree


def iter_fields(aname):
    """Yield the printable fields of one ``class="aname"`` element, in order.

    The sequence is: link title, link URL, then for every inner ``<div>``
    that contains a ``<strong>`` label, the label text followed by either
    the text directly after the label or, when there is none, the text of
    each ``class="box-span"`` child.

    Args:
        aname: the element whose ``class`` attribute is ``"aname"``.

    Yields:
        The field values as returned by ElementTree (``str`` or ``None``).
    """
    cname = aname.find('*[@class="bname"]//a[@class="cname"]')
    yield cname.text  # title
    yield cname.get('href')  # url
    for div in aname.iterfind('div[@class=""]/div'):
        strong = div.find('strong')
        if strong is None:
            # No label in this row (e.g. the image row): nothing to report.
            continue
        yield strong.text  # label
        # Use the label's own tail, and tolerate a missing tail (None) so a
        # label with nothing after it does not raise AttributeError.
        text = (strong.tail or '').strip()
        if text:
            yield text
        else:
            for box in div.iterfind('*[@class="box-span"]'):
                yield box.text


def main(path='test.html'):
    """Print the fields of every ``aname`` block in *path*.

    The file is read with an XML parser, so it must be well-formed. A blank
    line is printed after each block.
    """
    body = etree.parse(path).find('body')
    for aname in body.iterfind('*[@class="aname"]'):
        for field in iter_fields(aname):
            print(field)
        print('')


if __name__ == '__main__':
    main()
import re

# Text sitting directly between a '>' and the next '<': word characters
# optionally followed by colons (e.g. "aTitle", "label_a1:").
_TEXT_BETWEEN_TAGS = re.compile(r">(\w*:*)<")


def extract_texts(line):
    """Return the text fragments found between tags on one line of markup.

    Empty fragments (adjacent tags such as ``><``) and a lone ``":"`` are
    dropped.

    Args:
        line: one line of HTML text.

    Returns:
        A list of the matching fragments, in order of appearance.
    """
    return [text for text in _TEXT_BETWEEN_TAGS.findall(line)
            if text and text != ":"]


def main(path="input.txt"):
    """Print every fragment extracted from the file at *path*, one per line."""
    # ``with`` closes the file even if printing fails part-way through.
    with open(path, 'r') as file1:
        for lines in file1:
            for text in extract_texts(lines):
                print(text)


if __name__ == '__main__':
    main()
输出:
aTitle
img_text
label_a1:
text_a1
label_a2:
text_a2
label_spe:
spantext_a1
spantext_a2
spantext_a3
aTitle
img_text
label_a1:
text_b1
label_a2:
text_b2
label_a3:
text_b3
label_spe:
spantext_b1
spantext_b2
spantext_b3
spantext_b4
spantext_b5
spantext_b6