How to get stats from a scrapy run?
I'm running a scrapy spider from an external file, following the example in the scrapy docs. I want to grab the stats provided by the Core API and store them in a mysql table once the crawl has finished.
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from test.spiders.myspider import *
from scrapy.utils.project import get_project_settings
from test.pipelines import MySQLStorePipeline
import datetime

spider = MySpider()

def run_spider(spider):
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()

    # placeholder insert with dummy values; runs after reactor.run()
    # returns, i.e. once the crawl has finished
    mysql_insert = MySQLStorePipeline()
    mysql_insert.cursor.execute(
        'insert into crawler_stats(sites_id, start_time, end_time, '
        'page_scraped, finish_reason) values (%s, %s, %s, %s, %s)',
        (1, datetime.datetime.now(), datetime.datetime.now(), 100, 'test'))
    mysql_insert.conn.commit()

run_spider(spider)
How do I get the values of stats like start_time, end_time, pages_scraped and finish_reason in the code above?
Get them from the crawler.stats collector:
stats = crawler.stats.get_stats()
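The returned object is a plain dictionary. As a rough sketch of what it typically contains once a crawl has finished (the exact keys depend on your Scrapy version and which extensions are enabled, so treat the names below as the usual defaults rather than a guarantee):

# hypothetical inspection of the stats dict after a finished crawl
print(stats.get('start_time'))               # datetime the crawl started
print(stats.get('finish_time'))              # datetime the crawl finished
print(stats.get('finish_reason'))            # e.g. 'finished'
print(stats.get('item_scraped_count'))       # number of items scraped
print(stats.get('response_received_count'))  # number of responses fetched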
Example code (collecting the stats in a spider_closed signal handler):
def callback(spider, reason):
    stats = spider.crawler.stats.get_stats()  # stats is a dictionary
    # write stats to the database here
    reactor.stop()

def run_spider(spider):
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(callback, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()

run_spider(spider)
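As a minimal sketch of what the callback might look like with the database write filled in, reusing the MySQLStorePipeline and crawler_stats table from the question (the stat key names are the usual collector defaults and are an assumption, not guaranteed on every Scrapy version):

def callback(spider, reason):
    stats = spider.crawler.stats.get_stats()
    # assumes MySQLStorePipeline exposes cursor/conn as in the question
    mysql_insert = MySQLStorePipeline()
    mysql_insert.cursor.execute(
        'insert into crawler_stats(sites_id, start_time, end_time, '
        'page_scraped, finish_reason) values (%s, %s, %s, %s, %s)',
        (1,                                     # sites_id, hardcoded as in the question
         stats.get('start_time'),               # set when the crawl starts
         stats.get('finish_time'),              # set when the spider closes
         stats.get('response_received_count'),  # pages actually fetched
         reason))                               # same value as stats.get('finish_reason')
    mysql_insert.conn.commit()
    reactor.stop()

Note that spider_closed handler order is not guaranteed, so finish_time may not be populated yet when this callback fires; the reason argument passed to the handler is always available, though.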