How to click multiple "+" buttons on a page one after another and extract data from them using Python?
I need to extract the phone number, website link, and the name and country of each university from the site. The site is https://www.whed.net/results_institutions.php?Chp2=Business%20Administration. The problem is that there is a + sign that has to be clicked for every university; the data then has to be extracted, the popup closed, and the script has to move on to the next one.
I have tried several approaches with Selenium, as follows:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
import pandas as pd

# opening the web browser (raw string so the backslashes in the Windows path are not treated as escapes)
browser = webdriver.Chrome(r'C:\Users\albert.malhotra\Desktop\Web Scrapings\Kentucky State\chromedriver')

# assigning the link to a variable
url = 'https://www.whed.net/results_institutions.php?Chp2=Business%20Administration'

# opening the url in the browser
browser.get(url)

dfs = []
dfss = []

for n in range(50):
    html = browser.page_source
    soup = BeautifulSoup(html, 'lxml')

    for data in soup.find_all('p', {'class': 'country'}):
        item = data.text

    for thead in soup.find_all('div', {'class': 'details'}):
        #data_2 = thead.find_all('a')
        data_2 = thead.select('h3')

        browser.find_element_by_link_text('More details').click()
        html_2 = browser.page_source
        soup_1 = BeautifulSoup(html_2, 'lxml')

        name = []
        for phone in soup_1.find_all('span', {'class': 'contenu'}):
            data_3 = phone.text
            name.append(data_3)

        browser.find_element_by_class_name("fancybox-item fancybox-close").click()
        dfss.append(data_2[0].text)
        dfs.append(item)
If you look at the page carefully, the + sign simply opens a URL in a popup. So in this case, instead of clicking the + button and traversing the popup, it is easier to open the popup's URL directly and traverse that page. Here is the code for doing that.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

siteURL = "https://www.whed.net/results_institutions.php?Chp2=Business%20Administration"
browser = webdriver.Chrome(executable_path='chromedriver.exe')
browser.get(siteURL)

# this returns all the popup link elements in a list
search = browser.find_elements_by_class_name('fancybox')

# for test purposes only the first link is used here
print(search[0].get_attribute("href"))

# this opens the page shown in the first popup; just parse the source code and get your data
browser.get(search[0].get_attribute("href"))

# you can run a loop to traverse the complete list of URLs
To get the number of URLs, you can use the length of that list, i.e. len(search).
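For example, a minimal sketch of that loop might look like the following. It reuses the search list collected in the snippet above; the #contenu span selector for the detail pages is an assumption borrowed from the requests-based answer further down, and BeautifulSoup is added here purely for parsing:

from bs4 import BeautifulSoup

# collect the hrefs first, since navigating away invalidates the element references
popup_urls = [link.get_attribute("href") for link in search]
print(len(popup_urls))  # number of detail-page URLs found on the results page

records = []
for popup_url in popup_urls:
    browser.get(popup_url)
    soup = BeautifulSoup(browser.page_source, 'lxml')
    # assumed selector: the labelled fields on the detail page sit in spans under #contenu
    fields = [span.get_text(strip=True) for span in soup.select('#contenu span')]
    records.append(fields)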
To extract the universities' website links from the site you don't need BeautifulSoup; Selenium alone can easily extract the required data with the following solution:

Code block:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

options = webdriver.ChromeOptions()
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
options.add_argument('--disable-extensions')
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\WebDrivers\chromedriver.exe')
driver.get('https://www.whed.net/results_institutions.php?Chp2=Business%20Administration')
elements = WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "a.detail.fancybox[title='More details']")))
for element in elements:
    WebDriverWait(driver, 30).until(EC.visibility_of(element)).click()
    WebDriverWait(driver, 10).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "iframe.fancybox-iframe")))
    print(WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "a.lien"))).get_attribute("innerHTML"))
    driver.switch_to_default_content()
    driver.find_element_by_css_selector("a.fancybox-item.fancybox-close").click()
driver.quit()
Console output:
http://www.uni-ruse.bg
http://www.vspu.hr
http://www.vfu.bg
http://www.uni-svishtov.bg
http://www.universitateagbaritiu.ro
http://www.shu-bg.net
http://universityecotesbenin.com
http://www.vps-libertas.hr
http://www.swu.bg
http://www.zrinski.org/nikola
Note: The remaining items, phone, name and country, can now be extracted easily in the same way.
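A hedged sketch of how those remaining fields could be read inside the same loop, right after the frame_to_be_available_and_switch_to_it(...) call above. The #contenu h2, .country and #contenu span selectors are assumptions carried over from the detail-page markup used in the requests-based answer below, not something verified against every record:

name = WebDriverWait(driver, 5).until(
    EC.visibility_of_element_located((By.CSS_SELECTOR, "#contenu h2"))).text
country = driver.find_element_by_css_selector(".country").text
website = driver.find_element_by_css_selector("a.lien").get_attribute("innerHTML")

# the phone number is assumed to sit in the span that follows the 'Tel.' label
phone = ''
spans = driver.find_elements_by_css_selector("#contenu span")
for idx, span in enumerate(spans[:-1]):
    if 'Tel.' in span.text:
        phone = spans[idx + 1].text
        break

print(name, country, phone, website)
# then switch back to the main document and close the popup exactly as in the block above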
You don't necessarily need Selenium. You can certainly use requests for this large result set. The page retrieves its data from a server that runs a SQL query with a record-count parameter, nbr_ref_pge, which you can adjust to however many results you want.

You can write a POST request that passes the required information, which is later fed to that SQL query. You can then work out how batching would need to look to get the total number of records you want, and check whether there is an offset parameter that allows it.
I don't have enough experience with asyncio, but I suspect it would be a good approach here, since the number of requests for the individual detail pages is high. My attempt with a Session is shown below. I took the retry syntax from an answer by @datashaman.
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

baseUrl = 'https://www.whed.net/'
searchTerm = 'Business Administration'
headers = {'Accept': 'application/json'}
params = {'Chp2': searchTerm}
url = 'https://www.whed.net/results_institutions.php'
data = {
    'where': "(FOS LIKE '%|" + searchTerm + "|%')",
    'requete': '(Fields of study=' + searchTerm + ')',
    'ret': 'home.php',
    'afftri': 'yes',
    'stat': 'Fields of study',
    'sort': 'InstNameEnglish,iBranchName',
    'nbr_ref_pge': '1000'
}

results = []

with requests.Session() as s:
    retries = Retry(total=5,
                    backoff_factor=0.1,
                    status_forcelist=[500, 502, 503, 504])
    # mount the retry adapter for both schemes; the site itself is served over https
    s.mount('http://', HTTPAdapter(max_retries=retries))
    s.mount('https://', HTTPAdapter(max_retries=retries))

    res = s.post(url, params=params, headers=headers, data=data)
    soup = bs(res.content, 'lxml')
    links = set([baseUrl + item['href'] for item in soup.select("[href*='detail_institution.php?']")])

    for link in links:
        res = s.get(link)
        soup = bs(res.content, 'lxml')
        items = soup.select('#contenu span')
        name = soup.select_one('#contenu h2').text.strip()
        country = soup.select_one('.country').text.strip()
        # phone and website default to empty strings in case the labels are missing on a page
        phone = website = ''
        for i, item in enumerate(items[:-1]):
            if 'Tel.' in item.text:
                phone = items[i + 1].text
            if 'WWW:' in item.text:
                website = items[i + 1].text
        results.append([name, country, phone, website])

df = pd.DataFrame(results)
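As a small follow-up, the column labels below are my own naming (the site does not define them); they just make the DataFrame easier to inspect and export:

df = pd.DataFrame(results, columns=['Name', 'Country', 'Phone', 'Website'])
df.to_csv('whed_business_administration.csv', index=False, encoding='utf-8-sig')
print(df.head())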