带分页的 Selenium 网页抓取网站
Selenium web scraping site with pagination
我一直在研究如何使用 Selenium 创建网络抓取工具。我正在努力解决的一件事是用分页抓取页面。我写了一个我认为会抓取每一页的脚本
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import getpass
import datetime
import pandas as pd
custom_options = webdriver.ChromeOptions()
custom_options.add_experimental_option('prefs', {'intl.accept_languages': 'en,en_US'})
driver = webdriver.Chrome(ChromeDriverManager().install(), options=custom_options)
driver.get("https://lr.caa.cz/letecky-rejstrik?lang=en")
data =[]
while(True):
try:
table_body = WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, "tbody")))
table_body_rows = table_body.find_elements_by_tag_name("tr")
button = WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.XPATH, '/html/body/app-root/div/main/div/div/app-avreg-list/nav/div/app-pagination/div/a[3]/i')))
for i in table_body_rows:
row_data = []
table_data = i.find_elements_by_tag_name("td")
for j in table_data:
row_data.append(j.text.strip())
data.append(row_data)
button.click()
except:
break
df = pd.DataFrame(data)
print(df)
driver.quit()
它抓取了第一页,但似乎没有超出这一页。这是我得到的结果:
0 1 2 3
0 Glider MDM-1 FOX OK-1213
1 Glider MDM-1 FOX OK-7801
2 Glider A 15 OK-7906
3 Powered glider SZD-45A OK-6902
4 Powered glider SZD-45A OK-8903
5 Hot-air balloon AB OK-9004
6 Hot-air balloon AB OK-4012
7 Hot-air balloon AB OK-4014
8 Hot-air balloon AB OK-7006
9 Hot-air balloon AB OK-7004
10 None None None
我查看了网站上分页按钮的 xpath,它在脚本中似乎是正确的。
有什么想法可能是错误的吗?
而不是 presence_of_element_located()
使用 element_to_be_clickable()
并跟随 css 选择器或 xpath 来识别元素。
button = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'app-pagination a:nth-of-type(3)')))
或
button = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, "(//app-pagination//a)[3]")))
我一直在研究如何使用 Selenium 创建网络抓取工具。我正在努力解决的一件事是用分页抓取页面。我写了一个我认为会抓取每一页的脚本
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import getpass
import datetime
import pandas as pd
custom_options = webdriver.ChromeOptions()
custom_options.add_experimental_option('prefs', {'intl.accept_languages': 'en,en_US'})
driver = webdriver.Chrome(ChromeDriverManager().install(), options=custom_options)
driver.get("https://lr.caa.cz/letecky-rejstrik?lang=en")
data =[]
while(True):
try:
table_body = WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, "tbody")))
table_body_rows = table_body.find_elements_by_tag_name("tr")
button = WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.XPATH, '/html/body/app-root/div/main/div/div/app-avreg-list/nav/div/app-pagination/div/a[3]/i')))
for i in table_body_rows:
row_data = []
table_data = i.find_elements_by_tag_name("td")
for j in table_data:
row_data.append(j.text.strip())
data.append(row_data)
button.click()
except:
break
df = pd.DataFrame(data)
print(df)
driver.quit()
它抓取了第一页,但似乎没有超出这一页。这是我得到的结果:
0 1 2 3
0 Glider MDM-1 FOX OK-1213
1 Glider MDM-1 FOX OK-7801
2 Glider A 15 OK-7906
3 Powered glider SZD-45A OK-6902
4 Powered glider SZD-45A OK-8903
5 Hot-air balloon AB OK-9004
6 Hot-air balloon AB OK-4012
7 Hot-air balloon AB OK-4014
8 Hot-air balloon AB OK-7006
9 Hot-air balloon AB OK-7004
10 None None None
我查看了网站上分页按钮的 xpath,它在脚本中似乎是正确的。
有什么想法可能是错误的吗?
而不是 presence_of_element_located()
使用 element_to_be_clickable()
并跟随 css 选择器或 xpath 来识别元素。
button = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'app-pagination a:nth-of-type(3)')))
或
button = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, "(//app-pagination//a)[3]")))