无法从 Scrapy 统计字典中获取值

Can't get value from Scrapy stats dictionary

我的 Scrapy 项目中有下面这个管道,我需要在其中读取 Scrapy 统计信息里的值

class MyPipeline(object):
    """Item pipeline that prints the ``item_scraped_count`` stat for each item."""

    def __init__(self, stats):
        # Stats object handed over by from_crawler() (crawler.stats).
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from ``crawler``, passing it ``crawler.stats``."""
        return cls(crawler.stats)

    def process_item(self, item, spider):           
        """Print the ``item_scraped_count`` stat and return ``item`` unchanged."""
        # NOTE: indexing the dict returned by get_stats() raises KeyError while
        # 'item_scraped_count' is absent from it (see the traceback below).
        print self.stats.get_stats()['item_scraped_count']
        return item

当我运行代码时,得到了下面这个错误:

Traceback (most recent call last):
  File "D:\Kerja\HIT\PYTHON~1\<project_name>\<project_name>\lib\site-packages\twisted\internet\defer.py", line 649, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "D:\Kerja\HIT\Python Projects\<project_name>\<project_name>\<project_name>\<project_name>\pipelines.py", line 35, in process_item
    print self.stats.get_stats()['item_scraped_count']
KeyError: 'item_scraped_count'

如果这不是获取统计值的正确方法,那我该怎么办?

找到答案了!终于!

不要使用 self.stats.get_stats()['item_scraped_count'],而应改用 self.stats.get_value('item_scraped_count')

https://doc.scrapy.org/en/latest/topics/stats.html

修复后的版本(使用 Scrapy 1.8):

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import datetime
import os

from scrapy.exporters import CsvItemExporter


class SplitFilePipeline(object):
    """Export scraped items to CSV, starting a new file every N items.

    The rollover decision is based on the crawler's ``item_scraped_count``
    stat: once it reaches the next threshold, the current CSV is finished
    and closed and a fresh one is opened.
    """

    def __init__(self, stats, split_limit=10000):
        """Store the stats collector and open the first CSV file.

        :param stats: stats collector exposing ``get_value(key)``.
        :param split_limit: number of scraped items per CSV file.
        """
        self.stats = stats
        self.base_filename = "crawls/output_{}.csv"
        # Rotate to a new file each time another ``split_limit`` items
        # have been scraped.
        self.next_split = self.split_limit = split_limit
        self.create_exporter()

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from ``crawler``, passing it ``crawler.stats``."""
        return cls(crawler.stats)

    def _next_filename(self):
        """Return a timestamped output path that does not exist yet.

        The timestamp only has minute resolution, so a second file created
        within the same minute gets a ``_1``, ``_2``, ... suffix instead of
        truncating the earlier file.
        """
        stamp = datetime.datetime.now().strftime("%Y%m%d%H%M")
        filename = self.base_filename.format(stamp)
        part = 1
        while os.path.exists(filename):
            filename = self.base_filename.format("{}_{}".format(stamp, part))
            part += 1
        return filename

    def _close_exporter(self):
        """Finish the current export and close its file."""
        try:
            self.exporter.finish_exporting()
        finally:
            # Release the file handle even if the exporter fails.
            self.file.close()

    def create_exporter(self):
        """Open a new output file and start a CSV exporter on it."""
        # CsvItemExporter writes bytes, hence the binary mode.
        self.file = open(self._next_filename(), 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        """Export ``item``, rolling over to a new file when the limit is hit.

        :returns: ``item`` unchanged, so later pipelines can process it.
        """
        # get_value() returns None until the stat has been set; treat as 0.
        scraped = self.stats.get_value('item_scraped_count') or 0
        if scraped >= self.next_split:
            self.next_split += self.split_limit
            self._close_exporter()
            self.create_exporter()
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish and close the last CSV file when the spider closes."""
        self._close_exporter()