How to get text from checked boxes using bs4?
I'm trying to get all the labels (text) from the boxes that are checked (i.e. the answered questions) on the site below. However, I don't seem to get any text back.
The way I planned to do the scraping is to first collect all the links - on the right-hand side you can switch between pages. That list also seems to contain every link twice (see the de-duplication sketch after the output below).
Here is my current code (the link is in it, referred to as main_url):
import bs4 as bs
from splinter import Browser
import time

executable_path = {'executable_path': 'C:/users/chromedriver.exe'}
browser = Browser('chrome', **executable_path)

main_url = 'https://reporting.unpri.org/surveys/PRI-Reporting-Framework-2016/0ad07cdc-cfbc-4c5b-a79f-2b07e93d8521/79894dbc337a40828d895f9402aa63de/html/2/?lang=&a=1'

browser.visit(main_url)
source = browser.html
soup = bs.BeautifulSoup(source, 'lxml')

base_url = main_url[:-51]
urls = []
print(base_url)

for i in soup.find_all('div', class_='accordion-inner n-accordion-link'):
    for j in soup.find_all('a', class_='tooltiper'):
        urls.append(j['href'])

print(urls)

result = []
for k in urls:
    ext = k[8:]
    browser.visit(base_url + ext)
    source1 = browser.html
    soup1 = bs.BeautifulSoup(source1, 'lxml')
    temp_list = []
    print(browser.url)
    for img in soup1.find_all('img', class_='readradio'):
        for t in img['src']:
            if t == '/Style/img/checkedradio.png':
                for x in soup1.find_all('span', class_='title'):
                    txt = str(x.string)
                    temp_list.append(txt)
    result.append(temp_list)

print(result)
I get the following output for the result list, which should contain the text:
[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]
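As a side note on the duplicated links mentioned above: the inner soup.find_all re-scans the whole document for every accordion div, so every href gets appended once per div. A minimal sketch of collecting each link only once (an assumption about the intended fix: search inside each div and skip hrefs already seen):

urls = []
for div in soup.find_all('div', class_='accordion-inner n-accordion-link'):
    for a in div.find_all('a', class_='tooltiper'):  # search within this div, not the whole soup
        if a['href'] not in urls:                    # keep each link only once
            urls.append(a['href'])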
Updated code based on the suggestion:
import bs4 as bs
from splinter import Browser
import time

executable_path = {'executable_path': '/users/nichlasrasmussen/documents/webdrivers/phantomjs'}
browser = Browser('phantomjs', **executable_path)

main_url = 'https://reporting.unpri.org/surveys/PRI-Reporting-Framework-2016/0ad07cdc-cfbc-4c5b-a79f-2b07e93d8521/79894dbc337a40828d895f9402aa63de/html/2/?lang=&a=1'

browser.visit(main_url)
source = browser.html
soup = bs.BeautifulSoup(source, 'lxml')

base_url = main_url[:-51]
urls = []
print(base_url)

for i in soup.find_all('div', class_='accordion-inner n-accordion-link'):
    for j in soup.find_all('a', class_='tooltiper'):
        urls.append(j['href'])

print(urls)

result = []
for k in urls:
    ext = k[8:]
    browser.visit(base_url + ext)
    source1 = browser.html
    soup1 = bs.BeautifulSoup(source1, 'lxml')
    temp_list = []
    print(browser.url)
    for label in soup1.find_all('label', class_='radio'):
        t = label.find('img', class_='readradio')
        if 'checkedradio' in t['src']:
            content = soup1.find('span', class_='title')
            temp_list.append(content.text)
    result.append(temp_list)

print(result)
You can basically just work from the common parent of the img and span.title elements, which is label.radio, instead of running all those nested loops from the root (soup1). Note also why the original code returns empty lists: `for t in img['src']:` iterates over the characters of the src string, so the comparison with the full path never matches and nothing is ever appended; checking whether the path is contained in the src attribute avoids that.
Try this:
for label in soup1.find_all('label', class_='radio'):
    t = label.find('img', class_='readradio')
    if t and '/Style/img/checkedradio.png' in t.get('src'):
        content = label.find('span', class_='title')
        temp_list.append(content.text)
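For completeness, a sketch of how that snippet could slot into the per-page loop. The selectors and URL handling are the ones from the question; the extra guard on the title span is an assumption, in case a checked label has no span.title:

result = []
for k in urls:
    browser.visit(base_url + k[8:])
    soup1 = bs.BeautifulSoup(browser.html, 'lxml')
    temp_list = []
    for label in soup1.find_all('label', class_='radio'):
        img = label.find('img', class_='readradio')
        if img and 'checkedradio' in img.get('src', ''):
            title = label.find('span', class_='title')   # the title belonging to this label
            if title:                                     # guard (assumption): skip labels without a title span
                temp_list.append(title.get_text(strip=True))
    result.append(temp_list)
print(result)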