使用 selenium 抓取数据
scrape data using selenium
程序运行正常,但它只抓取了一个标题(one title)。
我希望它抓取页面中的所有标题。页面链接:https://www.eurobike.com/en/index-exhibitors/exhibitors/?
import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
# Build a headless Chrome configuration: every flag is applied in one pass.
options = webdriver.ChromeOptions()
for flag in (
    "--headless",
    "--no-sandbox",
    "--disable-gpu",
    "--window-size=1920x1080",
    "--disable-extensions",
):
    options.add_argument(flag)

# webdriver-manager downloads a matching chromedriver binary automatically.
chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)
def supplyvan_scraper():
    """Open the exhibitor listing, click the first exhibitor card, and
    print that detail page's <h1> title.

    NOTE(review): this scrapes only ONE title by design of the flow —
    it clicks a single ``div.card-exhibitor`` and reads the resulting
    page's ``h1.underlined`` text.
    """
    with chrome_driver as driver:
        # Implicit wait applies to every find_element call below.
        driver.implicitly_wait(15)
        URL = 'https://www.eurobike.com/en/index-exhibitors/exhibitors/?'
        driver.get(URL)
        time.sleep(3)
        # opt #1 visit first link, print the title uncomment to see
        # click the single link
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "div.card-exhibitor"))).click()
        time.sleep(2)
        # parse the h1 tag text
        title = driver.find_element(By.CSS_SELECTOR, 'h1.underlined').text
        print(title)
        driver.quit()


supplyvan_scraper()
试试这个:将 find_element 改为 find_elements(复数),这样就能获取所有匹配的元素:
# find_elements (plural) returns every node matching the selector.
for heading in driver.find_elements(By.CSS_SELECTOR, 'h1.underlined'):
    print(heading.text)
该网站完全由复杂的 JavaScript 渲染。首先,要显示该 url 中的列表必须接受 cookie,但接受并点击 cookie 按钮并非易事,因为该按钮位于 shadow root (open) 内,
而 Selenium 的 WebDriverWait 对 shadow root 无能为力;要访问 shadow root,需要使用 JavaScript 的 querySelector。
完整工作代码:
import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
# Configure a visible (non-headless) Chrome session.
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
# Keep the Chrome window open after the script ends so you can watch what
# happens; comment this out to let the browser close automatically.
options.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

URL = 'https://www.eurobike.com/en/index-exhibitors/exhibitors/?'
driver.get(URL)
time.sleep(5)

# The cookie banner lives inside an open shadow root, which Selenium locators
# cannot reach directly; pierce it with a JavaScript querySelector instead.
driver.execute_script('''return document.querySelector('div#usercentrics-root').shadowRoot.querySelector('button[data-testid="uc-accept-all-button"]')''').click()

# Collect every exhibitor-card URL first, then visit each detail page in turn.
links = [
    card.get_attribute('href')
    for card in WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, '.list__results > div > div > a')
        )
    )
]

for link in links:
    driver.get(link)
    time.sleep(5)
    # Parse the rendered page with BeautifulSoup to keep the per-page
    # extraction simple and fast.
    soup = BeautifulSoup(driver.page_source, 'lxml')
    heading = soup.select_one('h1.underlined')
    # Guard against detail pages that lack the expected <h1>; the original
    # code raised AttributeError on .get_text() when select_one returned None.
    if heading is not None:
        print(heading.get_text(strip=True))
输出:
ANGLE is
A&C Solutions
A&J International Co.,Ltd
(Taiwan Branch)
A-Pro Tech Co., LTD
A-Rim Ent. Co., Ltd.
Abbey Bike Tools
ABIMOTA
Associacao Nacional das Industrias
de Duas Rodas, Ferragens, Mobiliári
ABIMOTA
Associacao Nacional das Industrias
de Duas Rodas, Ferragens, Mobiliári
ABUS |August Bremicker Söhne KG
ABUS |August Bremicker Söhne KG
Accelerated Systems Inc. (ASI)
ACCORD ENTERPRISE CORP.
Acer Gadget Inc.
Acetrikes Industrial Co., Ltd.
ACT LAB LLC
ACTIA
Action Sports SRL
Activent 365 s.r.o.
ADAC e.V.
ADD-ONE
AddBike
AddRE-Mo
(Electric Bike Solutions GmbH)
ADFC e. V.
Adhestick Innovations Ltd. (Joe's No Flats)
ADViTEX GMBH
Äike
AER Electric Company Ltd
King Edward House
Aero Sensor Ltd
Aeroe Limited
Aforge Enterprise Co., Ltd
Agentura REPRO spol. s r.o.
程序运行正常,但它只抓取了一个标题(one title)。
我希望它抓取页面中的所有标题。页面链接:https://www.eurobike.com/en/index-exhibitors/exhibitors/?
import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
# Build a headless Chrome configuration: every flag is applied in one pass.
options = webdriver.ChromeOptions()
for flag in (
    "--headless",
    "--no-sandbox",
    "--disable-gpu",
    "--window-size=1920x1080",
    "--disable-extensions",
):
    options.add_argument(flag)

# webdriver-manager downloads a matching chromedriver binary automatically.
chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)
def supplyvan_scraper():
    """Open the exhibitor listing, click the first exhibitor card, and
    print that detail page's <h1> title.

    NOTE(review): this scrapes only ONE title by design of the flow —
    it clicks a single ``div.card-exhibitor`` and reads the resulting
    page's ``h1.underlined`` text.
    """
    with chrome_driver as driver:
        # Implicit wait applies to every find_element call below.
        driver.implicitly_wait(15)
        URL = 'https://www.eurobike.com/en/index-exhibitors/exhibitors/?'
        driver.get(URL)
        time.sleep(3)
        # opt #1 visit first link, print the title uncomment to see
        # click the single link
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "div.card-exhibitor"))).click()
        time.sleep(2)
        # parse the h1 tag text
        title = driver.find_element(By.CSS_SELECTOR, 'h1.underlined').text
        print(title)
        driver.quit()


supplyvan_scraper()
试试这个:将 find_element 改为 find_elements(复数),这样就能获取所有匹配的元素:
# find_elements (plural) returns every node matching the selector.
for heading in driver.find_elements(By.CSS_SELECTOR, 'h1.underlined'):
    print(heading.text)
该网站完全由复杂的 JavaScript 渲染。首先,要显示该 url 中的列表必须接受 cookie,但接受并点击 cookie 按钮并非易事,因为该按钮位于 shadow root (open) 内,
而 Selenium 的 WebDriverWait 对 shadow root 无能为力;要访问 shadow root,需要使用 JavaScript 的 querySelector。
完整工作代码:
import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
# Configure a visible (non-headless) Chrome session.
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
# Keep the Chrome window open after the script ends so you can watch what
# happens; comment this out to let the browser close automatically.
options.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

URL = 'https://www.eurobike.com/en/index-exhibitors/exhibitors/?'
driver.get(URL)
time.sleep(5)

# The cookie banner lives inside an open shadow root, which Selenium locators
# cannot reach directly; pierce it with a JavaScript querySelector instead.
driver.execute_script('''return document.querySelector('div#usercentrics-root').shadowRoot.querySelector('button[data-testid="uc-accept-all-button"]')''').click()

# Collect every exhibitor-card URL first, then visit each detail page in turn.
links = [
    card.get_attribute('href')
    for card in WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, '.list__results > div > div > a')
        )
    )
]

for link in links:
    driver.get(link)
    time.sleep(5)
    # Parse the rendered page with BeautifulSoup to keep the per-page
    # extraction simple and fast.
    soup = BeautifulSoup(driver.page_source, 'lxml')
    heading = soup.select_one('h1.underlined')
    # Guard against detail pages that lack the expected <h1>; the original
    # code raised AttributeError on .get_text() when select_one returned None.
    if heading is not None:
        print(heading.get_text(strip=True))
输出:
ANGLE is
A&C Solutions
A&J International Co.,Ltd
(Taiwan Branch)
A-Pro Tech Co., LTD
A-Rim Ent. Co., Ltd.
Abbey Bike Tools
ABIMOTA
Associacao Nacional das Industrias
de Duas Rodas, Ferragens, Mobiliári
ABIMOTA
Associacao Nacional das Industrias
de Duas Rodas, Ferragens, Mobiliári
ABUS |August Bremicker Söhne KG
ABUS |August Bremicker Söhne KG
Accelerated Systems Inc. (ASI)
ACCORD ENTERPRISE CORP.
Acer Gadget Inc.
Acetrikes Industrial Co., Ltd.
ACT LAB LLC
ACTIA
Action Sports SRL
Activent 365 s.r.o.
ADAC e.V.
ADD-ONE
AddBike
AddRE-Mo
(Electric Bike Solutions GmbH)
ADFC e. V.
Adhestick Innovations Ltd. (Joe's No Flats)
ADViTEX GMBH
Äike
AER Electric Company Ltd
King Edward House
Aero Sensor Ltd
Aeroe Limited
Aforge Enterprise Co., Ltd
Agentura REPRO spol. s r.o.