Valid XPath but empty div returned - Amazon scraping
I am having some difficulty retrieving information from an Amazon page with a small scraping script. Here is my code:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import random
import time
from bs4 import BeautifulSoup
sleep_time_min = 5
sleep_time_max = 10
### INFORMATION FOR PROXY, UA & INFO ROTATION ###
user_agent_list = ['Mozilla/5.0 CK={} (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)','Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13']
opts = Options()
user_agent = random.choice(user_agent_list)
opts.add_argument("user-agent="+user_agent)
driver = webdriver.Chrome(executable_path='XXXXXXXX', options=opts)
driver.get('https://www.amazon.com/gp/product/B00J4B0S4O')
soup = BeautifulSoup(driver.page_source, 'lxml')
sleep_time_range = range(sleep_time_min,sleep_time_max)
sleep_time = random.choice(sleep_time_range)
time.sleep(sleep_time)
#Extract seller rank & sales category
try:
    rank = driver.find_element_by_xpath('//div[@id="detailBullets_feature_div"]/ul[@class="a-unordered-list a-nostyle a-vertical a-spacing-none detail-bullet-list"]/li/span[@class="a-list-item"]/span[@class="a-text-bold"]')
    #rank = driver.find_element_by_xpath('//div[@id="detail-bullets_feature_div"]').text
    #rank = driver.find_element_by_xpath('//div[@id="a-page"]').text
except:
    rank = "NA"
print(rank)
driver.close()
So basically, I am trying to retrieve the following information from the page:
Best Sellers Rank: #711 in Grocery & Gourmet Food (See Top 100 in Grocery & Gourmet Food)
#1 in Grapeseed Oils
and then slice it so I can store the rank and the category in separate variables.
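For that slicing step, a regular expression is a robust way to pull both values out of the text at once. A minimal sketch; the pattern is an assumption based on the sample above, not a format Amazon guarantees:

import re

# Hypothetical pattern for "#<number> in <category>"; the category capture
# stops at "(" so the "(See Top 100 ...)" suffix is excluded.
rank_pattern = re.compile(r'#([\d,]+)\s+in\s+([^(#]+)')

text = ('Best Sellers Rank: #711 in Grocery & Gourmet Food '
        '(See Top 100 in Grocery & Gourmet Food) #1 in Grapeseed Oils')
for number, category in rank_pattern.findall(text):
    print(number, category.strip())
# 711 Grocery & Gourmet Food
# 1 Grapeseed Oils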
Here is my problem: despite all my efforts, my XPath keeps returning empty results. I have left the other XPaths I tried in the code (i.e. higher-level divs), but unfortunately the result is the same.
I do not understand why these results stay empty. Do you have any ideas?
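One detail worth flagging in the code above: page_source is parsed before the sleep runs, so dynamically rendered content may not be in the DOM yet; also, find_element_by_xpath returns a WebElement, so print(rank) shows the element object rather than its text. A minimal sketch of waiting explicitly for the detail bullets before reading them (the 15-second timeout is an arbitrary choice):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait (up to an arbitrary 15 seconds) for the detail bullets to be present
# before reading page_source or querying the live DOM.
wait = WebDriverWait(driver, 15)
bullets = wait.until(
    EC.presence_of_element_located((By.ID, 'detailBullets_feature_div'))
)
print(bullets.text)  # prints the element's text content, not the WebElement object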
Thanks a lot for your help.
I ended up doing it the ugly way; it is not pretty, but it works :)
import re

try:
    # Main rank: stringify every "a-list-item" span and slice out the
    # first "#<number> in <category>" chunk
    rank_main = soup.find_all('span', 'a-list-item')
    rank_main = str(rank_main)
    rank_main = rank_main.split("(<a href")[0]
    rank_main = rank_main.split("#")[1]
    rank_main = rank_main.replace("amp;", "")
    rank1bis_nb = rank_main.split('in')[0]
    rank1bis_cat = rank_main.split('in ')[1]
except:
    rank1bis_nb = "NA"
    rank1bis_cat = "NA"
try:
    # Sub-category rank: strip the markup from the "zg_hrsr" list and slice again
    rank_raw = soup.find_all('ul', 'a-unordered-list a-nostyle a-vertical zg_hrsr')
    rank_raw = str(rank_raw)
    rank_raw = rank_raw.replace("[", "")
    rank_raw = rank_raw.replace("]", "")
    rank_raw = rank_raw.replace("#", "")
    rank_raw = re.sub('<[^>]+>', '', rank_raw)  # drop any remaining HTML tags
    rank_raw = rank_raw.strip()
    rank_raw = rank_raw.replace("amp;", "")
    rank2bis_nb = rank_raw.split('in')[0]
    rank2bis_cat = rank_raw.split('in ')[1]
except:
    rank2bis_nb = "NA"
    rank2bis_cat = "NA"
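For reference, the same extraction can be written without the str()/split round-trips by walking the parsed tree directly. A sketch under the assumption that the class names used above still match the served page layout:

# Hedged alternative to the string slicing above; class names are copied
# from the code above and may vary with Amazon's page layout.
ranks = []
for ul in soup.find_all('ul', 'a-unordered-list a-nostyle a-vertical zg_hrsr'):
    for li in ul.find_all('li'):
        text = li.get_text(' ', strip=True)            # e.g. "#1 in Grapeseed Oils"
        m = re.search(r'#([\d,]+)\s+in\s+(.+)', text)
        if m:
            ranks.append((m.group(1), m.group(2).strip()))
print(ranks)  # e.g. [('1', 'Grapeseed Oils')]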