亚马逊搜索结果页面中没有 id 可以用 selenium 查找
No id in Amazon search result page to find with selenium
我正在使用以下 python 代码:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import re
import time
class AmazonBot(object):
"""Parses relevant information from a text file consisting of
Amazon links."""
def __init__(self, items):
"""Setup bot for Amazon URL."""
self.amazon_url = "https://www.amazon.ca/"
self.items = items
self.profile = webdriver.FirefoxProfile()
self.options = Options()
#self.options.add_argument("--headless")
self.driver = webdriver.Firefox(firefox_profile=self.profile,
firefox_options=self.options)
# Navigate to the Amazon URL.
self.driver.get(self.amazon_url)
# Obtain the source
self.html = self.driver.page_source
self.soup = BeautifulSoup(self.html, 'html.parser')
self.html = self.soup.prettify('utf-8')
def search_items(self):
"""Searches through the list of items obtained from spreadsheet and
obtains name, price, and URL information for each item."""
urls = []
prices = []
names = []
for item in self.items:
print(f"Searching for {item}...")
self.driver.get(self.amazon_url)
#select = Select(self.driver.find_element_by_id("searchDropdownDescription"))
#select.select_by_visible_text('All Departments')
search_input = self.driver.find_element_by_id("twotabsearchtextbox")
search_input.send_keys(item)
time.sleep(2)
#wait = WebDriverWait(self.driver, self.explicit_wait)
#wait.until(EC.presence_of_all_elements_located((By.ID, "twotabsearchtextbox")))
search_button = self.driver.find_element_by_xpath('//*[@id="nav-search"]/form/div[2]/div/input')
search_button.click()
time.sleep(2)
t = self.driver.find_element_by_id("result_0")
asin = t.get_attribute("data-asin")
url = "https://www.amazon.ca/dp/" + asin
price = self.get_product_price(url)
name = self.get_product_name(url)
prices.append(price)
urls.append(url)
names.append(name)
print(name)
print(price)
print(url)
time.sleep(2)
return prices, urls, names
def get_product_price(self, url):
"""Gets and cleans product price from Amazon page.
If HTML attribute priceblock_ourprice or priceblock_dealprice
is absent, the price is marked as Not Available."""
self.driver.get(url)
try:
price = self.driver.find_element_by_id("priceblock_ourprice").text
except:
pass
try:
price = self.driver.find_element_by_id("priceblock_dealprice").text
except:
pass
if price is None:
price = "Not available"
else:
non_decimal = re.compile(r'[^\d.]+')
price = non_decimal.sub('', price)
return price
def get_product_name(self, url):
"""Returns the product name of the Amazon URL."""
self.driver.get(url)
try:
product_name = self.driver.find_element_by_id("productTitle").text
except:
pass
if product_name is None:
product_name = "Not available"
return product_name
def close_session(self):
"""Close the browser session."""
self.driver.close()
items=["toothpaste"]
amazon_bot=AmazonBot(items)
amazon_bot.search_items()
从亚马逊网站搜索。
我不断收到 NoSuchElementException: Message: Unable to locate element: [id="result_0"]
我在结果页面中找不到任何 id
来查找结果项目并从中获取信息。
正如您在下图中看到的,唯一可用的 id 是 cell_widget_id="MAIN-SEARCH_RESULTS-0"
。但此代码也未将其识别为 ID。
据我所知,您使用了错误的搜索按钮定位器。
尝试使用
search_button = self.driver.find_element_by_xpath('//input[@id='nav-search-submit-button']')
而不是
search_button = self.driver.find_element_by_xpath('//*[@id="nav-search"]/form/div[2]/div/input')
但这不是这里的主要问题。
搜索结果的块元素可以通过 //div[@data-component-type="s-search-result"]
xpath
定位,但它不是从 0 索引开始的,如果您正在寻找结果描述或图像,您应该使用其他定位器。
我替换了你的
search_button = self.driver.find_element_by_xpath('//*[@id="nav-search"]/form/div[2]/div/input')
search_button.click()
和
self.driver.get(f"https://www.amazon.ca/s?k={item}") # Directly go to the item's link rather than locating a button
我替换了你的
t = self.driver.find_element_by_id("result_0")
和
t = self.driver.find_elements_by_class_name("s-result-item")[1] # The 1 is to get the first card. Use 2 for the second card, etc. 0 is not a card.
一共:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import re
import time
class AmazonBot(object):
"""Parses relevant information from a text file consisting of
Amazon links."""
def __init__(self, items):
"""Setup bot for Amazon URL."""
self.amazon_url = "https://www.amazon.ca/"
self.items = items
self.profile = webdriver.FirefoxProfile()
self.options = Options()
#self.options.add_argument("--headless")
self.driver = webdriver.Firefox(firefox_profile=self.profile,
firefox_options=self.options)
# Navigate to the Amazon URL.
self.driver.get(self.amazon_url)
# Obtain the source
self.html = self.driver.page_source
self.soup = BeautifulSoup(self.html, 'html.parser')
self.html = self.soup.prettify('utf-8')
def search_items(self):
"""Searches through the list of items obtained from spreadsheet and
obtains name, price, and URL information for each item."""
urls = []
prices = []
names = []
for item in self.items:
print(f"Searching for {item}...")
self.driver.get(self.amazon_url)
#select = Select(self.driver.find_element_by_id("searchDropdownDescription"))
#select.select_by_visible_text('All Departments')
search_input = self.driver.find_element_by_id("twotabsearchtextbox")
search_input.send_keys(item)
time.sleep(2)
#wait = WebDriverWait(self.driver, self.explicit_wait)
#wait.until(EC.presence_of_all_elements_located((By.ID, "twotabsearchtextbox")))
self.driver.get(f"https://www.amazon.ca/s?k={item}")
time.sleep(2)
t = self.driver.find_elements_by_class_name("s-result-item")[1]
asin = t.get_attribute("data-asin")
url = "https://www.amazon.ca/dp/" + asin
price = self.get_product_price(url)
name = self.get_product_name(url)
prices.append(price)
urls.append(url)
names.append(name)
print(name)
print(price)
print(url)
time.sleep(2)
return prices, urls, names
def get_product_price(self, url):
"""Gets and cleans product price from Amazon page.
If HTML attribute priceblock_ourprice or priceblock_dealprice
is absent, the price is marked as Not Available."""
self.driver.get(url)
try:
price = self.driver.find_element_by_id("priceblock_ourprice").text
except:
pass
try:
price = self.driver.find_element_by_id("priceblock_dealprice").text
except:
pass
if price is None:
price = "Not available"
else:
non_decimal = re.compile(r'[^\d.]+')
price = non_decimal.sub('', price)
return price
def get_product_name(self, url):
"""Returns the product name of the Amazon URL."""
self.driver.get(url)
try:
product_name = self.driver.find_element_by_id("productTitle").text
except:
pass
if product_name is None:
product_name = "Not available"
return product_name
def close_session(self):
"""Close the browser session."""
self.driver.close()
items=["toothpaste"]
amazon_bot=AmazonBot(items)
amazon_bot.search_items()
输出:
Searching for toothpaste...
Hello Goodbye Plaque-Hello Whitening Toothpaste, Fluoride Free, Vegan & Sls Free, Natural Peppermint with Tea Tree & Coconut Oil, 98 mL
5.97
https://www.amazon.ca/dp/B08PJNBKWJ
我正在使用以下 python 代码:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import re
import time
class AmazonBot(object):
"""Parses relevant information from a text file consisting of
Amazon links."""
def __init__(self, items):
"""Setup bot for Amazon URL."""
self.amazon_url = "https://www.amazon.ca/"
self.items = items
self.profile = webdriver.FirefoxProfile()
self.options = Options()
#self.options.add_argument("--headless")
self.driver = webdriver.Firefox(firefox_profile=self.profile,
firefox_options=self.options)
# Navigate to the Amazon URL.
self.driver.get(self.amazon_url)
# Obtain the source
self.html = self.driver.page_source
self.soup = BeautifulSoup(self.html, 'html.parser')
self.html = self.soup.prettify('utf-8')
def search_items(self):
"""Searches through the list of items obtained from spreadsheet and
obtains name, price, and URL information for each item."""
urls = []
prices = []
names = []
for item in self.items:
print(f"Searching for {item}...")
self.driver.get(self.amazon_url)
#select = Select(self.driver.find_element_by_id("searchDropdownDescription"))
#select.select_by_visible_text('All Departments')
search_input = self.driver.find_element_by_id("twotabsearchtextbox")
search_input.send_keys(item)
time.sleep(2)
#wait = WebDriverWait(self.driver, self.explicit_wait)
#wait.until(EC.presence_of_all_elements_located((By.ID, "twotabsearchtextbox")))
search_button = self.driver.find_element_by_xpath('//*[@id="nav-search"]/form/div[2]/div/input')
search_button.click()
time.sleep(2)
t = self.driver.find_element_by_id("result_0")
asin = t.get_attribute("data-asin")
url = "https://www.amazon.ca/dp/" + asin
price = self.get_product_price(url)
name = self.get_product_name(url)
prices.append(price)
urls.append(url)
names.append(name)
print(name)
print(price)
print(url)
time.sleep(2)
return prices, urls, names
def get_product_price(self, url):
"""Gets and cleans product price from Amazon page.
If HTML attribute priceblock_ourprice or priceblock_dealprice
is absent, the price is marked as Not Available."""
self.driver.get(url)
try:
price = self.driver.find_element_by_id("priceblock_ourprice").text
except:
pass
try:
price = self.driver.find_element_by_id("priceblock_dealprice").text
except:
pass
if price is None:
price = "Not available"
else:
non_decimal = re.compile(r'[^\d.]+')
price = non_decimal.sub('', price)
return price
def get_product_name(self, url):
"""Returns the product name of the Amazon URL."""
self.driver.get(url)
try:
product_name = self.driver.find_element_by_id("productTitle").text
except:
pass
if product_name is None:
product_name = "Not available"
return product_name
def close_session(self):
"""Close the browser session."""
self.driver.close()
items=["toothpaste"]
amazon_bot=AmazonBot(items)
amazon_bot.search_items()
从亚马逊网站搜索。
我不断收到 NoSuchElementException: Message: Unable to locate element: [id="result_0"]
我在结果页面中找不到任何 id
来查找结果项目并从中获取信息。
正如您在下图中看到的,唯一可用的 id 是 cell_widget_id="MAIN-SEARCH_RESULTS-0"
。但此代码也未将其识别为 ID。
据我所知,您使用了错误的搜索按钮定位器。
尝试使用
search_button = self.driver.find_element_by_xpath('//input[@id='nav-search-submit-button']')
而不是
search_button = self.driver.find_element_by_xpath('//*[@id="nav-search"]/form/div[2]/div/input')
但这不是这里的主要问题。
搜索结果的块元素可以通过 //div[@data-component-type="s-search-result"]
xpath
定位,但它不是从 0 索引开始的,如果您正在寻找结果描述或图像,您应该使用其他定位器。
我替换了你的
search_button = self.driver.find_element_by_xpath('//*[@id="nav-search"]/form/div[2]/div/input')
search_button.click()
和
self.driver.get(f"https://www.amazon.ca/s?k={item}") # Directly go to the item's link rather than locating a button
我替换了你的
t = self.driver.find_element_by_id("result_0")
和
t = self.driver.find_elements_by_class_name("s-result-item")[1] # The 1 is to get the first card. Use 2 for the second card, etc. 0 is not a card.
一共:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import re
import time
class AmazonBot(object):
"""Parses relevant information from a text file consisting of
Amazon links."""
def __init__(self, items):
"""Setup bot for Amazon URL."""
self.amazon_url = "https://www.amazon.ca/"
self.items = items
self.profile = webdriver.FirefoxProfile()
self.options = Options()
#self.options.add_argument("--headless")
self.driver = webdriver.Firefox(firefox_profile=self.profile,
firefox_options=self.options)
# Navigate to the Amazon URL.
self.driver.get(self.amazon_url)
# Obtain the source
self.html = self.driver.page_source
self.soup = BeautifulSoup(self.html, 'html.parser')
self.html = self.soup.prettify('utf-8')
def search_items(self):
"""Searches through the list of items obtained from spreadsheet and
obtains name, price, and URL information for each item."""
urls = []
prices = []
names = []
for item in self.items:
print(f"Searching for {item}...")
self.driver.get(self.amazon_url)
#select = Select(self.driver.find_element_by_id("searchDropdownDescription"))
#select.select_by_visible_text('All Departments')
search_input = self.driver.find_element_by_id("twotabsearchtextbox")
search_input.send_keys(item)
time.sleep(2)
#wait = WebDriverWait(self.driver, self.explicit_wait)
#wait.until(EC.presence_of_all_elements_located((By.ID, "twotabsearchtextbox")))
self.driver.get(f"https://www.amazon.ca/s?k={item}")
time.sleep(2)
t = self.driver.find_elements_by_class_name("s-result-item")[1]
asin = t.get_attribute("data-asin")
url = "https://www.amazon.ca/dp/" + asin
price = self.get_product_price(url)
name = self.get_product_name(url)
prices.append(price)
urls.append(url)
names.append(name)
print(name)
print(price)
print(url)
time.sleep(2)
return prices, urls, names
def get_product_price(self, url):
"""Gets and cleans product price from Amazon page.
If HTML attribute priceblock_ourprice or priceblock_dealprice
is absent, the price is marked as Not Available."""
self.driver.get(url)
try:
price = self.driver.find_element_by_id("priceblock_ourprice").text
except:
pass
try:
price = self.driver.find_element_by_id("priceblock_dealprice").text
except:
pass
if price is None:
price = "Not available"
else:
non_decimal = re.compile(r'[^\d.]+')
price = non_decimal.sub('', price)
return price
def get_product_name(self, url):
"""Returns the product name of the Amazon URL."""
self.driver.get(url)
try:
product_name = self.driver.find_element_by_id("productTitle").text
except:
pass
if product_name is None:
product_name = "Not available"
return product_name
def close_session(self):
"""Close the browser session."""
self.driver.close()
items=["toothpaste"]
amazon_bot=AmazonBot(items)
amazon_bot.search_items()
输出:
Searching for toothpaste...
Hello Goodbye Plaque-Hello Whitening Toothpaste, Fluoride Free, Vegan & Sls Free, Natural Peppermint with Tea Tree & Coconut Oil, 98 mL
5.97
https://www.amazon.ca/dp/B08PJNBKWJ