使用 Scrapy 无限滚动 Ajax 提取数据
Extract data with infinite scrolling Ajax with Scrapy
我是 python 和 scrapy 的初学者。
我需要帮助:我需要提取一个产品列表,但该站点在列表末尾有一个 "view more" 按钮,它通过 Ajax 发出一个 text/html 请求,并加载包含更多产品的新 HTML。
import scrapy
from scrapy.http import Request
class ProdSpider(scrapy.Spider):
    """Page through Easy.com.ar's Ajax search endpoint.

    The "view more" button on the site issues an Ajax request whose only
    varying query parameter is ``beginIndex``; each page holds 12 products.
    This spider re-issues that request, advancing ``beginIndex`` by 12,
    until a page comes back with no products.
    """

    name = "easy"
    allowed_domains = ["easy.com.ar"]
    start_urls = ["https://www.easy.com.ar/webapp/wcs/stores/servlet/es/easyar/search/AjaxCatalogSearchResultContentView?searchTermScope=&searchType=1002&filterTerm=&orderBy=&maxPrice=&showResultsPage=true&langId=-5&sType=SimpleSearch&metaData=&pageSize=12&manufacturer=&resultCatEntryType=&catalogId=10051&pageView=image&searchTerm=&minPrice=&categoryId=39652&storeId=10151&beginIndex=12"]
    # Index of the first product on the page currently being requested.
    beginIndex_index = 12

    def parse(self, response):
        """Yield one dict per product, then request the next page if any
        products were found on this one."""
        SECTION_SELECTOR = '.thumb-product'
        n_products = 0  # how many products this page actually contained
        for soar in response.css(SECTION_SELECTOR):
            n_products += 1
            Link = 'div.dojoDndItem a ::attr(href)'
            Nombre = 'div.thumb-name a ::text'
            Price = './/span[@id="tarj-mas-edit"]/text()'
            yield {
                'Link': soar.css(Link).extract_first(),
                # re_first strips the leading newline/whitespace of the text node
                'Nombre': soar.css(Nombre).re_first(r'\n\s*(.*)'),
                'Price': soar.xpath(Price).re_first(r'\n\s*(.*)'),
            }
        # Advance to the next page. Stop as soon as a page is empty, which
        # ends parse() and ultimately stops the spider.
        self.beginIndex_index += 12
        # BUG FIX: the original tested the bare name `beginIndex_index`
        # (NameError — missing `self.`) and, once qualified, would always be
        # truthy, so the spider never stopped. Test the page content instead.
        if n_products:
            # BUG FIX: the original added 12 again on top of the `+= 12`
            # above, skipping every other page of results.
            yield Request(url="https://www.easy.com.ar/webapp/wcs/stores/servlet/es/easyar/search/AjaxCatalogSearchResultContentView?searchTermScope=&searchType=1002&filterTerm=&orderBy=&maxPrice=&showResultsPage=true&langId=-5&sType=SimpleSearch&metaData=&pageSize=12&manufacturer=&resultCatEntryType=&catalogId=10051&pageView=image&searchTerm=&minPrice=&categoryId=39652&storeId=10151&beginIndex=%s" % self.beginIndex_index,
                          callback=self.parse)
我尝试使用上面的代码,但只捕获了 12 个产品。url 中唯一变化的参数是 "beginIndex=12",我想要不断给它 +12,直到产品列表结束。我被这个问题困住了!
谢谢!
我建议您使用 selenium,这样您就可以在 "查看更多" 按钮上执行 'click',从而在您的蜘蛛中加载更多数据。
这里有一个spider的例子(我没测试过,但是大致思路):
import scrapy
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
class ProdSpider(scrapy.Spider):
    """Drive a real Firefox via Selenium and click the "view more" button
    repeatedly until it disappears, so all products are loaded in the page."""

    name = "easy"
    allowed_domains = ["easy.com.ar"]
    start_urls = ["https://www.easy.com.ar/webapp/wcs/stores/servlet/es/easyar/search/aditivos-y-lubricantes"]

    def __init__(self):
        super(ProdSpider, self).__init__()
        # Hard-coded Windows Firefox path — adjust for your machine.
        binary = FirefoxBinary('C:/Program Files (x86)/Mozilla Firefox/firefox.exe')
        self.wb = webdriver.Firefox(firefox_binary=binary)

    def parse(self, response):
        """Load the page in the browser and click "view more" until gone."""
        self.wb.get(response.url)
        while True:
            # BUG FIX: find_element_by_xpath raises NoSuchElementException
            # when the element is absent — it never returns a falsy value,
            # so the original `if not view_more_button:` could never fire.
            # find_elements_by_xpath (plural) returns [] instead.
            buttons = self.wb.find_elements_by_xpath('//*[@id="Search_Result_div"]/div[2]/div[9]/input')
            if not buttons:
                break
            buttons[0].click()
        # extract your data here...
你明白了!
我在您的 URL 中看到您还有一个名为 pageSize
的参数。我测试了它,网站允许你将它设置为最大 50。
要知道何时停止,您可以在产生另一个请求之前测试 response.css(SECTION_SELECTOR)
中是否有项目:
import scrapy
from scrapy.http import Request
from scrapy import Selector
class ProdSpider(scrapy.Spider):
    """Page through the Ajax search endpoint 50 products at a time.

    The site accepts pageSize up to 50, so we page in steps of 50 via the
    ``beginIndex`` parameter and stop when a page yields no products.
    """

    name = "easy"
    allowed_domains = ["easy.com.ar"]
    url = "https://www.easy.com.ar/webapp/wcs/stores/servlet/es/easyar/search/AjaxCatalogSearchResultContentView?searchTermScope=&searchType=1002&filterTerm=&orderBy=&maxPrice=&showResultsPage=true&langId=-5&sType=SimpleSearch&metaData=&pageSize=50&manufacturer=&resultCatEntryType=&catalogId=10051&pageView=image&searchTerm=&minPrice=&categoryId=39652&storeId=10151&beginIndex={pagenum}"
    # XPath for each field, evaluated against one product's HTML fragment.
    product_fields_xpath = {
        'Link': '//a[contains(@id, "CatalogEntry")]/@href',
        'Nombre': '//a[contains(@id, "CatalogEntry")]/text()',
        'Price': './/span[@class="thumb-price-e"]/text()'
    }
    section_selector = '//div[@class="thumb-product"]'
    begin_index = 0

    # BUG FIX: the method must be named start_requests (plural) — Scrapy
    # never calls a method named start_request, so the spider did nothing.
    def start_requests(self):
        # BUG FIX: `url` is a class attribute; the bare name was a NameError.
        yield Request(url=self.url.format(pagenum=self.begin_index), method='GET', callback=self.parse)

    def parse(self, response):
        """Yield one item per product, then request the next page while
        the current page is non-empty."""
        products = response.xpath(self.section_selector).extract()
        n_items = 0
        for product in products:
            n_items += 1
            sel = Selector(text=product)
            item = dict()
            # BUG FIX: dict.iteritems() is Python-2-only; items() works on both.
            for k, v in self.product_fields_xpath.items():
                item[k] = sel.xpath(v).extract_first()
            yield item
        self.begin_index += 50
        if n_items > 0:
            yield Request(url=self.url.format(pagenum=self.begin_index), method='GET', callback=self.parse)
我没有测试这段代码,但我希望你能明白我的意思。
我是 python 和 scrapy 的初学者。 我需要帮助:我需要提取一个产品列表,但该站点在列表末尾有一个 "view more" 按钮,它通过 Ajax 发出一个 text/html 请求,并加载包含更多产品的新 HTML。
import scrapy
from scrapy.http import Request
class ProdSpider(scrapy.Spider):
    """Page through Easy.com.ar's Ajax search endpoint.

    The "view more" button on the site issues an Ajax request whose only
    varying query parameter is ``beginIndex``; each page holds 12 products.
    This spider re-issues that request, advancing ``beginIndex`` by 12,
    until a page comes back with no products.
    """

    name = "easy"
    allowed_domains = ["easy.com.ar"]
    start_urls = ["https://www.easy.com.ar/webapp/wcs/stores/servlet/es/easyar/search/AjaxCatalogSearchResultContentView?searchTermScope=&searchType=1002&filterTerm=&orderBy=&maxPrice=&showResultsPage=true&langId=-5&sType=SimpleSearch&metaData=&pageSize=12&manufacturer=&resultCatEntryType=&catalogId=10051&pageView=image&searchTerm=&minPrice=&categoryId=39652&storeId=10151&beginIndex=12"]
    # Index of the first product on the page currently being requested.
    beginIndex_index = 12

    def parse(self, response):
        """Yield one dict per product, then request the next page if any
        products were found on this one."""
        SECTION_SELECTOR = '.thumb-product'
        n_products = 0  # how many products this page actually contained
        for soar in response.css(SECTION_SELECTOR):
            n_products += 1
            Link = 'div.dojoDndItem a ::attr(href)'
            Nombre = 'div.thumb-name a ::text'
            Price = './/span[@id="tarj-mas-edit"]/text()'
            yield {
                'Link': soar.css(Link).extract_first(),
                # re_first strips the leading newline/whitespace of the text node
                'Nombre': soar.css(Nombre).re_first(r'\n\s*(.*)'),
                'Price': soar.xpath(Price).re_first(r'\n\s*(.*)'),
            }
        # Advance to the next page. Stop as soon as a page is empty, which
        # ends parse() and ultimately stops the spider.
        self.beginIndex_index += 12
        # BUG FIX: the original tested the bare name `beginIndex_index`
        # (NameError — missing `self.`) and, once qualified, would always be
        # truthy, so the spider never stopped. Test the page content instead.
        if n_products:
            # BUG FIX: the original added 12 again on top of the `+= 12`
            # above, skipping every other page of results.
            yield Request(url="https://www.easy.com.ar/webapp/wcs/stores/servlet/es/easyar/search/AjaxCatalogSearchResultContentView?searchTermScope=&searchType=1002&filterTerm=&orderBy=&maxPrice=&showResultsPage=true&langId=-5&sType=SimpleSearch&metaData=&pageSize=12&manufacturer=&resultCatEntryType=&catalogId=10051&pageView=image&searchTerm=&minPrice=&categoryId=39652&storeId=10151&beginIndex=%s" % self.beginIndex_index,
                          callback=self.parse)
我尝试使用上面的代码,但只捕获了 12 个产品。url 中唯一变化的参数是 "beginIndex=12",我想要不断给它 +12,直到产品列表结束。我被这个问题困住了!
谢谢!
我建议您使用 selenium,这样您就可以在 "查看更多" 按钮上执行 'click',从而在您的蜘蛛中加载更多数据。 这里有一个spider的例子(我没测试过,但是大致思路):
import scrapy
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
class ProdSpider(scrapy.Spider):
    """Drive a real Firefox via Selenium and click the "view more" button
    repeatedly until it disappears, so all products are loaded in the page."""

    name = "easy"
    allowed_domains = ["easy.com.ar"]
    start_urls = ["https://www.easy.com.ar/webapp/wcs/stores/servlet/es/easyar/search/aditivos-y-lubricantes"]

    def __init__(self):
        super(ProdSpider, self).__init__()
        # Hard-coded Windows Firefox path — adjust for your machine.
        binary = FirefoxBinary('C:/Program Files (x86)/Mozilla Firefox/firefox.exe')
        self.wb = webdriver.Firefox(firefox_binary=binary)

    def parse(self, response):
        """Load the page in the browser and click "view more" until gone."""
        self.wb.get(response.url)
        while True:
            # BUG FIX: find_element_by_xpath raises NoSuchElementException
            # when the element is absent — it never returns a falsy value,
            # so the original `if not view_more_button:` could never fire.
            # find_elements_by_xpath (plural) returns [] instead.
            buttons = self.wb.find_elements_by_xpath('//*[@id="Search_Result_div"]/div[2]/div[9]/input')
            if not buttons:
                break
            buttons[0].click()
        # extract your data here...
你明白了!
我在您的 URL 中看到您还有一个名为 pageSize
的参数。我测试了它,网站允许你将它设置为最大 50。
要知道何时停止,您可以在产生另一个请求之前测试 response.css(SECTION_SELECTOR)
中是否有项目:
import scrapy
from scrapy.http import Request
from scrapy import Selector
class ProdSpider(scrapy.Spider):
    """Page through the Ajax search endpoint 50 products at a time.

    The site accepts pageSize up to 50, so we page in steps of 50 via the
    ``beginIndex`` parameter and stop when a page yields no products.
    """

    name = "easy"
    allowed_domains = ["easy.com.ar"]
    url = "https://www.easy.com.ar/webapp/wcs/stores/servlet/es/easyar/search/AjaxCatalogSearchResultContentView?searchTermScope=&searchType=1002&filterTerm=&orderBy=&maxPrice=&showResultsPage=true&langId=-5&sType=SimpleSearch&metaData=&pageSize=50&manufacturer=&resultCatEntryType=&catalogId=10051&pageView=image&searchTerm=&minPrice=&categoryId=39652&storeId=10151&beginIndex={pagenum}"
    # XPath for each field, evaluated against one product's HTML fragment.
    product_fields_xpath = {
        'Link': '//a[contains(@id, "CatalogEntry")]/@href',
        'Nombre': '//a[contains(@id, "CatalogEntry")]/text()',
        'Price': './/span[@class="thumb-price-e"]/text()'
    }
    section_selector = '//div[@class="thumb-product"]'
    begin_index = 0

    # BUG FIX: the method must be named start_requests (plural) — Scrapy
    # never calls a method named start_request, so the spider did nothing.
    def start_requests(self):
        # BUG FIX: `url` is a class attribute; the bare name was a NameError.
        yield Request(url=self.url.format(pagenum=self.begin_index), method='GET', callback=self.parse)

    def parse(self, response):
        """Yield one item per product, then request the next page while
        the current page is non-empty."""
        products = response.xpath(self.section_selector).extract()
        n_items = 0
        for product in products:
            n_items += 1
            sel = Selector(text=product)
            item = dict()
            # BUG FIX: dict.iteritems() is Python-2-only; items() works on both.
            for k, v in self.product_fields_xpath.items():
                item[k] = sel.xpath(v).extract_first()
            yield item
        self.begin_index += 50
        if n_items > 0:
            yield Request(url=self.url.format(pagenum=self.begin_index), method='GET', callback=self.parse)
我没有测试这段代码,但我希望你能明白我的意思。