重复的数据抓取工具 json api

duplicated data scraper json api

我有这个脚本:

import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os
    
    
if os.path.exists('jfs_hombre.csv'):
    os.remove('jfs_hombre.csv')
    print("The file has been deleted successfully")
else:
    print("The file does not exist!")
    
    
class JfsSpider_hombre(scrapy.Spider):
    name = 'jfs_hombre'

    #start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]

    custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}} 

    def start_requests(self):
    
        yield scrapy.Request(
            url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
            callback=self.parse,
            method="GET"
        )
    
    def parse(self, response):
        resp = response.json()
        #print(resp)
        for item in range(0,576,32):
            resp['recordsFiltered']=item
           
            for result  in resp['data']['productSearch']['products']:
                yield {
                    'Casa':'Just_For_Sports',
                    'Sku' :result['productReference'],
                    'Name': result['productName'],
                    'precio': result['priceRange']['sellingPrice']['highPrice'],
                    'Link': 'https://www.justforsport.com.ar' + result['link'],
                    'Date':datetime.today().strftime('%Y-%m-%d')
                    }
    
    
  

if __name__ == "__main__":
    process =CrawlerProcess()
    process.crawl(JfsSpider_hombre)
    process.start()

它工作正常并获得 576 行,但问题是它们是重复的。当我删除重复的数据时,我只得到 32 个唯一值,我想我只从一页获取值(每页 32 个产品)我如何遍历所有我认为与行有关的元素:

for item in range(0,576,32):

提前致谢

您使用的 'Casa':'Just_For_Sports', 不正确,应该是 result['Just_For_Sports'],但最重要的是您从哪里得到 "Just_For_Sports"。我没有在产品列表中找到它。实际上,您不能包含 products 中不存在的密钥。 'Date':datetime.today().strftime('%Y-%m-%d') 您也不会在产品列表中找到密钥。现在你可以试试是否存在复制值

import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os
    
    
if os.path.exists('jfs_hombre.csv'):
    os.remove('jfs_hombre.csv')
    print("The file has been deleted successfully")
else:
    print("The file does not exist!")
    
    
class JfsSpider_hombre(scrapy.Spider):
    name = 'jfs_hombre'

    #start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]

    custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}} 

    def start_requests(self):
        headers = {"content-type": "application/json"}
        yield scrapy.Request(
            url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
            callback=self.parse,
            method="GET",
            headers=headers,
            dont_filter=True
        )
    
    def parse(self, response):
        resp = response.json()
        #print(resp)
        for item in range(0,576,32):
            resp['data']['productSearch']['recordsFiltered']=item
           
            for result  in resp['data']['productSearch']['products']:
                yield {
                    #'Casa':'Just_For_Sports',
                    'Sku' :result['productReference'],
                    'Name': result['productName'],
                    'precio': result['priceRange']['sellingPrice']['highPrice'],
                    'Link': 'https://www.justforsport.com.ar' + result['link'],
                    # 'Date':datetime.today().strftime('%Y-%m-%d')
                    }
    
    
if __name__ == "__main__":
    process =CrawlerProcess()
    
    process.crawl(JfsSpider_hombre)
    process.start()

由 set() 证明

import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os
    
    
if os.path.exists('jfs_hombre.csv'):
    os.remove('jfs_hombre.csv')
    print("The file has been deleted successfully")
else:
    print("The file does not exist!")
    
 
class JfsSpider_hombre(scrapy.Spider):
    name = 'jfs_hombre'
    unique_data = set() 

    #start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]

    custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}} 

    def start_requests(self):
        headers = {"content-type": "application/json"}
        yield scrapy.Request(
            url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables=%7B%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%226869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5%22%2C%22sender%22%3A%22vtex.store-resources%400.x%22%2C%22provider%22%3A%22vtex.search-graphql%400.x%22%7D%2C%22variables%22%3A%22eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9%22%7D',
            callback=self.parse,
            method="GET",
            headers=headers,
            dont_filter=True
        )
    
    def parse(self, response):
        resp = response.json()
        #print(resp)
        for item in range(0,576,32):
            resp['data']['productSearch']['recordsFiltered']=item
           
            for result  in resp['data']['productSearch']['products']:
                s=result['productReference']
                self.unique_data.add(s)
                yield {
                    #'Casa':'Just_For_Sports',
                    'Sku' :s,
                    'Name': result['productName'],
                    'precio': result['priceRange']['sellingPrice']['highPrice'],
                    'Link': 'https://www.justforsport.com.ar' + result['link'],
                    # 'Date':datetime.today().strftime('%Y-%m-%d')
                    }
    
    
if __name__ == "__main__":
    process =CrawlerProcess()
    
    process.crawl(JfsSpider_hombre)
    process.start()

输出:

'item_scraped_count': 576,