Scraper not getting all of the data

I have a .py scraper. When it runs, it works, but it does not get 100% of the data. I get a lot of errors like this one:

2022-05-05 20:53:39 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.justforsport.com.ar/buzo-hombre-361-degrees-y2201my002a-urban-1-gris/p> (referer: https://www.justforsport.com.ar/hombre?page=3)
Traceback (most recent call last):
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\defer.py", line 120, in iter_errback
    yield next(it)
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
    return next(self.data)
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
    return next(self.data)
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
    for r in iterable:
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
    for x in result:
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
    for r in iterable:
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 342, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
    for r in iterable:
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 40, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
    for r in iterable:
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
    for r in iterable:
  File "c:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\just_for_sport\just_for_sport\spiders\jfs_hombre.py", line 41, in parse_article_detail
    precio0=response.css('span.vtex-product-price-1-x-currencyContainer.vtex-product-price-1-x-currencyContainer--product')[0]
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\parsel\selector.py", line 70, in __getitem__
    o = super(SelectorList, self).__getitem__(pos)
IndexError: list index out of range

This is my script:

import scrapy
from scrapy_splash import SplashRequest
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os

if os.path.exists('jfs_hombre.csv'):
    os.remove('jfs_hombre.csv')
    print("The file has been deleted successfully")
else:
    print("The file does not exist!")

class JfsSpider_hombre(scrapy.Spider):
    name = 'jfs_hombre'
    start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]

    def parse(self, response):
        total_products = int(int(response.css('div.vtex-search-result-3-x-totalProducts--layout.pv5.ph9.bn-ns.bt-s.b--muted-5.tc-s.tl.t-action--small span::text').get()) / 27) + 1
        for count in range(1, total_products):
            yield SplashRequest(url=f'https://www.justforsport.com.ar/hombre?page={count}',
                                callback=self.parse_links)

    def parse_links(self, response):
        links = response.css('a.vtex-product-summary-2-x-clearLink.vtex-product-summary-2-x-clearLink--shelf-product.h-100.flex.flex-column::attr(href)').getall()
        for link in links:
            yield SplashRequest(response.urljoin('https://www.justforsport.com.ar' + link), self.parse_article_detail)

    def parse_article_detail(self, response):
        precio0 = response.css('span.vtex-product-price-1-x-currencyContainer.vtex-product-price-1-x-currencyContainer--product')[0]
        yield {
            'Casa': 'Just_For_Sports',
            'Sku': response.css('span.vtex-product-identifier-0-x-product-identifier__value::text').get(),
            'Name': response.css('span.vtex-store-components-3-x-productBrand::text').get(),
            'precio': ''.join(precio0.css('span.vtex-product-price-1-x-currencyInteger.vtex-product-price-1-x-currencyInteger--product::text').getall()),
            'Link': response.url,
            'Date': datetime.today().strftime('%Y-%m-%d')
        }

process = CrawlerProcess(
    settings={
        'FEED_URI': 'jfs_hombre.csv',
        'FEED_FORMAT': 'csv',
        'FEED_EXPORT_ENCODING': 'utf-8',
        'CONCURRENT_REQUESTS': 16,
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_MAX_DELAY': 2,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)'
    })

process.crawl(JfsSpider_hombre)
process.start()

I don't understand what the error is... Why do I sometimes get 100% of the info and other times I get these messages? Is it related to the script, to the user_agent, or to the moment the process runs?

Thanks in advance!
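
First, the IndexError itself just means the price selector matched nothing on that response, so `[0]` on the empty SelectorList raised; this typically happens when a page comes back without the fully rendered product markup. A minimal guard for `parse_article_detail` (same selectors as in the question, only the indexing is hedged) could look like this:

    def parse_article_detail(self, response):
        # The price container is missing from some responses, so check for a
        # match before indexing instead of assuming at least one element.
        precio0 = response.css('span.vtex-product-price-1-x-currencyContainer.vtex-product-price-1-x-currencyContainer--product')
        if not precio0:
            self.logger.warning('Price block missing, skipping %s', response.url)
            return
        precio0 = precio0[0]
        yield {
            'Casa': 'Just_For_Sports',
            'Sku': response.css('span.vtex-product-identifier-0-x-product-identifier__value::text').get(),
            'Name': response.css('span.vtex-store-components-3-x-productBrand::text').get(),
            'precio': ''.join(precio0.css('span.vtex-product-price-1-x-currencyInteger.vtex-product-price-1-x-currencyInteger--product::text').getall()),
            'Link': response.url,
            'Date': datetime.today().strftime('%Y-%m-%d')
        }

That logs and skips the incomplete page instead of raising; you could also choose to retry the request rather than return.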

Second, the data is also served by an API that returns a JSON response to a plain GET request, which is the simplest and fastest way to pull every data point you want. A working example is given below.

import scrapy
from scrapy.crawler import CrawlerProcess

class JfsSpider_hombre(scrapy.Spider):
    name = 'jfs_hombre'
    #start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]
        
    def start_requests(self):

        yield scrapy.Request(
            url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
            callback=self.parse,
            method="GET"
        )

    def parse(self, response):
        resp = response.json()
        # NOTE: this loop re-yields the same 32-product window once per
        # offset (18 x 32 = 576 items); the real offset lives in the
        # base64-encoded `variables` parameter (see the note after the
        # output below).
        for item in range(0, 576, 32):
            resp['recordsFiltered'] = item
            for result in resp['data']['productSearch']['products']:
                yield {
                    'productName': result['productName']
                }
if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(JfsSpider_hombre)  # the spider class must be passed to crawl()
    process.start()

Output:

'downloader/response_status_count/200': 1,
 'item_scraped_count': 576,
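
Note that the `from`/`to` window of the search is base64-encoded in the `variables` field nested inside the `extensions` query parameter: the blob in the URL above decodes to JSON containing `"from":64,"to":95`, i.e. one 32-product window. To request genuinely different windows rather than re-reading one response, you could patch that blob per page. A minimal sketch with a hypothetical `page_url` helper, assuming the persisted-query URL format shown above:

import base64
import json
from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit

def page_url(original_url, start, end):
    # Decode the `extensions` JSON from the query string, decode the base64
    # `variables` blob nested inside it, patch the from/to window, and
    # re-encode everything back into a URL.
    parts = urlsplit(original_url)
    query = parse_qs(parts.query)
    extensions = json.loads(query['extensions'][0])
    variables = json.loads(base64.b64decode(extensions['variables']))
    variables['from'], variables['to'] = start, end
    extensions['variables'] = base64.b64encode(json.dumps(variables).encode()).decode()
    query['extensions'] = [json.dumps(extensions)]
    return urlunsplit(parts._replace(query=urlencode(query, doseq=True)))

In start_requests you would then yield one request per window, e.g. page_url(url, start, start + 31) for start in range(0, 576, 32).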