How to Load more/show more pagination with scrapy-selenium
Getting a response, but nothing is scraped!
import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep


class ProductSpider(scrapy.Spider):
    name = "card"
    allowed_domains = ['moneyfacts.co.uk']
    start_urls = ['https://moneyfacts.co.uk/credit-cards/balance-transfer-credit-cards/?fbclid=IwAR05-Sa1hIcYTRx8DXYYQd0UfDRjWF-jD2-u51jiLP-WKlkxSddKjzUcnWA']

    def __init__(self):
        self.driver = webdriver.Chrome()

    def parse(self, response):
        self.driver.get(response.url)
        actions = ActionChains(self.driver)
        while True:
            next = self.driver.find_elements_by_css_selector("button#show-more")
            if next:
                last_height = self.driver.execute_script("return document.body.scrollHeight")
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                actions.move_to_element(next[0]).click().perform()
                lists = Selector(text=self.driver.page_source)
                for list in lists.xpath('//ul[@id="finder-table"]/li'):
                    yield {
                        'Name': list.xpath('.//*[@class="table-item-heading-product-name"]/span/strong/text()').get(),
                        'Title': list.xpath('.//*[@class="table-item-heading-product-name"]/span/text()').get()
                    }
            else:
                break
        self.driver.close()
I think you need to scroll down to the "Show more" button before you can click it, because it is not in the visible part of the screen until you scroll down.
Also, it is better to locate the element by its class name rather than by its text.
Finally, your code would throw an exception once there is no "Show more" button left on the page. That is why I used find_elements instead of what you wrote, to get a list of matching elements. find_elements does not throw: if nothing is found it returns an empty list and your loop exits cleanly; if elements are found, you use the first one from the returned list.
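To illustrate those two points on their own, here is a minimal sketch (the helper name click_show_more_if_present is made up for this example, and scrollIntoView is just one way to bring the button into view; the full rewrite below scrolls the whole window instead):

from selenium import webdriver


def click_show_more_if_present(driver):
    """Click the 'Show more' button if it is still on the page; return False once it is gone."""
    buttons = driver.find_elements_by_css_selector("button#show-more")
    if not buttons:
        # find_elements returns an empty list instead of raising, so we can just stop here
        return False
    # bring the button itself into the viewport before clicking it
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", buttons[0])
    buttons[0].click()
    return True


# usage: keep clicking until the button disappears
# driver = webdriver.Chrome()
# driver.get('https://moneyfacts.co.uk/credit-cards/balance-transfer-credit-cards/')
# while click_show_more_if_present(driver):
#     pass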
This is what I ended up with after restructuring your code:
import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep


class ProductSpider(scrapy.Spider):
    name = "card"
    allowed_domains = ['moneyfacts.co.uk']
    start_urls = ['https://moneyfacts.co.uk/credit-cards/balance-transfer-credit-cards/?fbclid=IwAR05-Sa1hIcYTRx8DXYYQd0UfDRjWF-jD2-u51jiLP-WKlkxSddKjzUcnWA']

    def __init__(self):
        self.driver = webdriver.Chrome()

    def parse(self, response):
        self.driver.get(response.url)
        actions = ActionChains(self.driver)
        while True:
            # find_elements returns an empty list (no exception) once the button is gone
            next = self.driver.find_elements_by_css_selector("button#show-more")
            if next:
                # scroll to the bottom so the "Show more" button is in view, then click it
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                actions.move_to_element(next[0]).click().perform()
                # hand the rendered HTML to Scrapy's Selector so .xpath()/.get() work
                lists = Selector(text=self.driver.page_source)
                for item in lists.xpath('//ul[@id="finder-table"]/li'):
                    yield {
                        'Name': item.xpath('.//*[@class="table-item-heading-product-name"]/span/strong/text()').get(),
                        'Title': item.xpath('.//*[@class="table-item-heading-product-name"]/span/text()').get()
                    }
            else:
                break
        self.driver.close()
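Since the title mentions scrapy-selenium but both snippets above drive Chrome directly, here is a rough sketch of how the same "Show more" loop might look with scrapy-selenium's SeleniumRequest. It assumes SELENIUM_DRIVER_NAME, SELENIUM_DRIVER_EXECUTABLE_PATH and the scrapy_selenium.SeleniumMiddleware downloader middleware are already configured in settings.py; the spider name, the sleep(2) pause and parsing only after the click loop are my own choices for this sketch, not part of your code:

import scrapy
from scrapy.selector import Selector
from scrapy_selenium import SeleniumRequest
from time import sleep


class ProductSeleniumSpider(scrapy.Spider):
    name = "card_selenium"
    allowed_domains = ['moneyfacts.co.uk']

    def start_requests(self):
        # SeleniumRequest asks the scrapy-selenium middleware to render the page in a real browser
        yield SeleniumRequest(
            url='https://moneyfacts.co.uk/credit-cards/balance-transfer-credit-cards/',
            callback=self.parse,
        )

    def parse(self, response):
        # the middleware exposes the underlying webdriver on the request meta
        driver = response.request.meta['driver']

        # keep clicking "Show more" until the button disappears
        while True:
            buttons = driver.find_elements_by_css_selector("button#show-more")
            if not buttons:
                break
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", buttons[0])
            buttons[0].click()
            sleep(2)  # crude pause for the next batch of cards to render

        # parse the fully expanded page once with Scrapy's Selector
        page = Selector(text=driver.page_source)
        for card in page.xpath('//ul[@id="finder-table"]/li'):
            yield {
                'Name': card.xpath('.//*[@class="table-item-heading-product-name"]/span/strong/text()').get(),
                'Title': card.xpath('.//*[@class="table-item-heading-product-name"]/span/text()').get(),
            }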