How to scrape product detail pages with Selenium
I am learning Selenium. Right now this code of mine can scrape all the product titles from the front page at the url https://www.daraz.com.bd/consumer-electronics/?spm=a2a0e.pdp.breadcrumb.1.4d20110bzkC0bn, but I want to click each product link on this page, which will take me to a product details page, so that I can scrape information from the product details page. Here is my code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
#argument for incognito Chrome
option = webdriver.ChromeOptions()
option.add_argument("--incognito")
browser = webdriver.Chrome()
browser.get("https://www.daraz.com.bd/consumer-electronics/?spm=a2a0e.pdp.breadcrumb.1.4d20110bzkC0bn")
# Wait 20 seconds for page to load
timeout = 20
try:
    WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, "//div[@class='c16H9d']")))
except TimeoutException:
    print("Timed out waiting for page to load")
    browser.quit()
# find_elements returns a list of selenium element objects.
titles_element = browser.find_elements(By.XPATH, "//div[@class='c16H9d']")
# use a list comprehension to get the actual product titles and not the selenium objects.
titles = [x.text for x in titles_element]
# print out all the titles.
print('titles:')
print(titles, '\n')
browser.quit()
You can use BeautifulSoup to make life easier.
I modified your code slightly to illustrate how you can navigate through all the individual product links on one page.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
#argument for incognito Chrome
option = Options()
option.add_argument("--incognito")
browser = webdriver.Chrome(options=option)
browser.get("https://www.daraz.com.bd/consumer-electronics/?spm=a2a0e.pdp.breadcrumb.1.4d20110bzkC0bn")
# Wait 20 seconds for page to load
timeout = 20
try:
    WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, "//div[@class='c16H9d']")))
except TimeoutException:
    print("Timed out waiting for page to load")
    browser.quit()
soup = BeautifulSoup(browser.page_source, "html.parser")
product_items = soup.find_all("div", attrs={"data-qa-locator": "product-item"})
for item in product_items:
    item_url = f"https:{item.find('a')['href']}"
    print(item_url)
    browser.get(item_url)
    item_soup = BeautifulSoup(browser.page_source, "html.parser")
    # Use the item_soup to find details about the item from its url.
browser.quit()
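For example, to pull the title out of a detail page with the same item_soup, something like the sketch below should work. The pdp-mod-product-badge-title class name is an assumption based on Daraz's current detail-page markup and may change:

# Sketch: extract the product title from the detail page soup.
# The class name below is an assumption and may change.
title_tag = item_soup.find(class_="pdp-mod-product-badge-title")
if title_tag:
    print(title_tag.get_text(strip=True))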
In short, it is exactly what arundeep chohan mentioned in the comments section. You can choose to create a new instance of browser, e.g. browser1 = webdriver.Chrome(), which can navigate all of the product URLs.
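A minimal sketch of that approach, reusing the imports, timeout, and locator from the snippets above; browser stays on the listing page while browser1 visits each product page:

# Sketch: a second driver instance visits the product pages.
elements = WebDriverWait(browser, timeout).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class='c16H9d']//a")))
hrefs = [element.get_attribute('href') for element in elements]
browser1 = webdriver.Chrome()
for href in hrefs:
    browser1.get(href)
    # scrape the detail page with browser1 here
browser1.quit()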
Also, I found that incognito mode does not work in your script. You need to define chrome_options and pass it as an argument to the webdriver.Chrome method.
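That is, roughly along the lines of the modified code in the previous answer:

option = webdriver.ChromeOptions()
option.add_argument("--incognito")
browser = webdriver.Chrome(options=option)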
Instead of clicking, as you meant, I suggest getting the hrefs and opening them one by one.
You need this locator: By.XPATH, "//div[@class='c16H9d']//a", and use .visibility_of_all_elements_located to wait for all of the elements instead of .visibility_of_element_located.
After that, get the href with this method: .get_attribute('href'), and open a new window with the specific href you already obtained.
browser.get("https://www.daraz.com.bd/consumer-electronics/?spm=a2a0e.pdp.breadcrumb.1.4d20110bzkC0bn")
# Wait 20 seconds for page to load
timeout = 20
elements = WebDriverWait(browser, timeout).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class='c16H9d']//a")))
for element in elements:
    # get href
    href = element.get_attribute('href')
    print(href)
    # open a new window with the specific href
    browser.execute_script("window.open('" + href + "');")
    # switch to the new window
    browser.switch_to.window(browser.window_handles[1])
    # ...now you are in the new window, scrape here
    # example: scrape the 'title' in the new window
    xx = WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.CLASS_NAME, "pdp-mod-product-badge-title")))
    print(xx.text)
    # close the new window
    browser.close()
    # back to the main window
    browser.switch_to.window(browser.window_handles[0])
browser.quit()
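If you would rather stay in a single window, an alternative sketch (same imports and locators as above) is to copy all the hrefs into a plain Python list first, because navigating away in the same window makes the WebElement references stale:

# Alternative sketch: collect hrefs first, then reuse one window.
hrefs = [element.get_attribute('href') for element in elements]
for href in hrefs:
    browser.get(href)
    title = WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.CLASS_NAME, "pdp-mod-product-badge-title")))
    print(title.text)
browser.quit()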