无法从 Scrapy 统计字典中获取值
Can't get value from Scrapy stats dictionary
我的 scrapy 中有这个管道,我需要从 Scrapy 统计信息中获取信息
class MyPipeline(object):
    """Item pipeline that reports how many items have been scraped so far.

    Fix for the reported bug: ``stats.get_stats()['item_scraped_count']``
    raises ``KeyError`` before the first item has been counted, because the
    key does not exist yet. ``stats.get_value()`` returns a default instead.
    """

    def __init__(self, stats):
        # stats is the crawler's stats collector, injected via from_crawler.
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        # Standard Scrapy hook: construct the pipeline with the crawler stats.
        return cls(crawler.stats)

    def process_item(self, item, spider):
        # get_value() returns the supplied default when the key is missing,
        # unlike get_stats()[key] which raises KeyError on the first items.
        print(self.stats.get_value('item_scraped_count', 0))
        return item
当我运行代码时,得到如下错误
Traceback (most recent call last):
File "D:\Kerja\HIT\PYTHON~1\<project_name>\<project_name>\lib\site-packages\twisted\internet\defer.py", line 649, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "D:\Kerja\HIT\Python Projects\<project_name>\<project_name>\<project_name>\<project_name>\pipelines.py", line 35, in process_item
print self.stats.get_stats()['item_scraped_count']
KeyError: 'item_scraped_count'
如果这不是获取统计值的正确方法,那我该怎么办?
找到答案了!终于!
而不是 self.stats.get_stats()['item_scraped_count']
使用 self.stats.get_value('item_scraped_count')
修复后的版本:
使用 Scrapy 1.8
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import CsvItemExporter
import datetime
class SplitFilePipeline(object):
    """Export items to CSV, rotating to a new timestamped file every
    ``split_limit`` items, based on the crawler's ``item_scraped_count`` stat.
    """

    def __init__(self, stats):
        self.stats = stats
        self.base_filename = "crawls/output_{}.csv"
        # Rotate to a new CSV file every `split_limit` scraped items.
        # (The original comment claimed 50000, contradicting the value 10000.)
        self.next_split = self.split_limit = 10000
        self.create_exporter()

    @classmethod
    def from_crawler(cls, crawler):
        # Standard Scrapy hook: give the pipeline access to the crawler stats.
        return cls(crawler.stats)

    def create_exporter(self):
        # Open a new minute-timestamped output file and attach a fresh exporter.
        now = datetime.datetime.now()
        datetime_stamp = now.strftime("%Y%m%d%H%M")
        self.file = open(self.base_filename.format(datetime_stamp), 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        # item_scraped_count is absent until the first item is counted,
        # hence the `or 0` fallback.
        if (self.stats.get_value('item_scraped_count') or 0) >= self.next_split:
            self.next_split += self.split_limit
            self.exporter.finish_exporting()
            self.file.close()
            self.create_exporter()
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # Bug fix: the original never finalized the last file, leaving the
        # exporter's buffered rows unflushed and the file handle open.
        self.exporter.finish_exporting()
        self.file.close()
我的 scrapy 中有这个管道,我需要从 Scrapy 统计信息中获取信息
class MyPipeline(object):
    """Item pipeline that reports how many items have been scraped so far.

    Fix for the reported bug: ``stats.get_stats()['item_scraped_count']``
    raises ``KeyError`` before the first item has been counted, because the
    key does not exist yet. ``stats.get_value()`` returns a default instead.
    """

    def __init__(self, stats):
        # stats is the crawler's stats collector, injected via from_crawler.
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        # Standard Scrapy hook: construct the pipeline with the crawler stats.
        return cls(crawler.stats)

    def process_item(self, item, spider):
        # get_value() returns the supplied default when the key is missing,
        # unlike get_stats()[key] which raises KeyError on the first items.
        print(self.stats.get_value('item_scraped_count', 0))
        return item
当我运行代码时,得到如下错误
Traceback (most recent call last):
File "D:\Kerja\HIT\PYTHON~1\<project_name>\<project_name>\lib\site-packages\twisted\internet\defer.py", line 649, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "D:\Kerja\HIT\Python Projects\<project_name>\<project_name>\<project_name>\<project_name>\pipelines.py", line 35, in process_item
print self.stats.get_stats()['item_scraped_count']
KeyError: 'item_scraped_count'
如果这不是获取统计值的正确方法,那我该怎么办?
找到答案了!终于!
而不是 self.stats.get_stats()['item_scraped_count']
使用 self.stats.get_value('item_scraped_count')
修复后的版本(使用 Scrapy 1.8):
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import CsvItemExporter
import datetime
class SplitFilePipeline(object):
    """Export items to CSV, rotating to a new timestamped file every
    ``split_limit`` items, based on the crawler's ``item_scraped_count`` stat.
    """

    def __init__(self, stats):
        self.stats = stats
        self.base_filename = "crawls/output_{}.csv"
        # Rotate to a new CSV file every `split_limit` scraped items.
        # (The original comment claimed 50000, contradicting the value 10000.)
        self.next_split = self.split_limit = 10000
        self.create_exporter()

    @classmethod
    def from_crawler(cls, crawler):
        # Standard Scrapy hook: give the pipeline access to the crawler stats.
        return cls(crawler.stats)

    def create_exporter(self):
        # Open a new minute-timestamped output file and attach a fresh exporter.
        now = datetime.datetime.now()
        datetime_stamp = now.strftime("%Y%m%d%H%M")
        self.file = open(self.base_filename.format(datetime_stamp), 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        # item_scraped_count is absent until the first item is counted,
        # hence the `or 0` fallback.
        if (self.stats.get_value('item_scraped_count') or 0) >= self.next_split:
            self.next_split += self.split_limit
            self.exporter.finish_exporting()
            self.file.close()
            self.create_exporter()
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # Bug fix: the original never finalized the last file, leaving the
        # exporter's buffered rows unflushed and the file handle open.
        self.exporter.finish_exporting()
        self.file.close()