Scrapy 蜘蛛扩展无法记录数据库管道错误
Scrapy spider extension fails to log DB pipeline errors
我正在尝试向我的 scrapy 扩展程序添加一个信号,以便在 spider_error
信号出现错误时向我发送电子邮件。即使管道中存在错误,蜘蛛信号似乎也没有记录这些错误,或者在项目被抓取并进入管道后蜘蛛不负责?有没有办法可以从扩展中记录这些?这是我的扩展程序的代码,它收集数据库中每个蜘蛛的统计信息,接下来我试图通过电子邮件发送错误,其信号似乎没有触发:
class StatsCollectorExtension(object):
    """Scrapy extension that stores per-spider run statistics in the
    database when the spider closes, and emails any collected errors.

    NOTE(review): the spider_error signal only fires for exceptions raised
    inside spider callbacks; failures in item pipelines (e.g. a DB error in
    process_item) do not trigger it, which is why pipeline errors never
    reach the email report.
    """

    def __init__(self, stats):
        self.stats = stats          # the crawler's StatsCollector
        self.num_errors = 0         # number of spider_error signals seen
        self.errors = []            # collected traceback strings

    @classmethod
    def from_crawler(cls, crawler):
        """Standard Scrapy extension factory: build the extension and
        connect its handlers to the crawler's signals."""
        ext = cls(crawler.stats)
        crawler.signals.connect(ext.spider_error, signal=signals.spider_error)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

    def spider_closed(self, spider):
        """
        When the spider closes then
        store the stats(start time, end time, items scraped,
        pages crawled) into the database for each scraper.
        Also send the errors through email if any.
        """
        # Use the public get_value() API instead of the private _stats
        # dict: a missing key (e.g. 'item_scraped_count' when the spider
        # scraped nothing) would otherwise raise KeyError and abort this
        # handler before anything is persisted or emailed.
        start_time = self.stats.get_value('start_time')
        finish_time = self.stats.get_value('finish_time')
        items_scraped_count = self.stats.get_value('item_scraped_count', 0)
        spider_name = spider.name
        pages_crawled_count = self.stats.get_value(
            'downloader/request_method_count/GET', 0)
        # add the scrapy stats to DB via SQL Alchemy object
        stats = ScrapyStats(scrapername=spider_name,
                            start_time=start_time,
                            finish_time=finish_time,
                            items_scraped=items_scraped_count,
                            pages_crawled=pages_crawled_count)
        db_session.add(stats)
        db_session.commit()
        if self.num_errors:
            # Mandrill mail client that sends me an email
            html = ''.join(self.errors)
            subject = '%s errors found' % self.num_errors
            # NOTE(review): both 'to_email' and 'to_mail' are passed here —
            # looks like a typo for a single recipient argument; confirm
            # against send_mail()'s signature.
            send_mail(subject, from_email, from_name,
                      html, to_email, to_mail, mandrill_key)

    def spider_error(self, failure, response, spider):
        """Record the traceback of each spider_error signal for the
        end-of-run email."""
        self.errors.append(failure.getTraceback())
        self.num_errors += 1
还有堆栈跟踪
2015-01-08 13:13:20-0500 [ferc-staff-reports] ERROR: Error processing {'additional_documents': None,
'ekwhere': 'Fed',
'id': 'FERCaeff76181cc2bc14651c693d30300b99a7673219',
'publishdate': datetime.datetime(2013, 1, 30, 0, 0),
'title': 'The IV Formulation and Linear Approximations of the AC Optimal Power Flow Problem: Optimal Power Flow Paper 2',
'type': 'FERC Staff Reports & Papers - Staff Papers',
'url': u'http://www.ferc.gov/industries/electric/indus-act/market-planning/opf-papers/acopf-2-iv-linearization.pdf'}
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/scrapy/middleware.py", line 62, in _process_chain
return process_chain(self.methods[methodname], obj, *args)
File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 65, in process_chain
d.callback(input)
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 382, in callback
self._startRunCallbacks(result)
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 490, in _startRunCallbacks
self._runCallbacks()
--- <exception caught here> ---
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 577, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/home/kiran/workspace/EK-source-scrapers/helpers/pipelines.py", line 88, in process_item
insert_item(item, spider.settings["table"])
File "/home/kiran/workspace/EK-source-scrapers/helpers/db_helper.py", line 54, in insert_item
db_session.commit()
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/scoping.py", line 149, in do
return getattr(self.registry(), name)(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/session.py", line 765, in commit
self.transaction.commit()
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/session.py", line 370, in commit
self._prepare_impl()
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/session.py", line 350, in _prepare_impl
self.session.flush()
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/session.py", line 1879, in flush
self._flush(objects)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/session.py", line 1997, in _flush
transaction.rollback(_capture_exception=True)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/util/langhelpers.py", line 57, in __exit__
compat.reraise(exc_type, exc_value, exc_tb)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/session.py", line 1961, in _flush
flush_context.execute()
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/unitofwork.py", line 370, in execute
rec.execute(self)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/unitofwork.py", line 523, in execute
uow
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/persistence.py", line 64, in save_obj
mapper, table, insert)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/persistence.py", line 562, in _emit_insert_statements
execute(statement, multiparams)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 717, in execute
return meth(self, multiparams, params)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/sql/elements.py", line 317, in _execute_on_connection
return connection._execute_clauseelement(self, multiparams, params)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 814, in _execute_clauseelement
compiled_sql, distilled_params
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 927, in _execute_context
context)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 1076, in _handle_dbapi_exception
exc_info
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/util/compat.py", line 185, in raise_from_cause
reraise(type(exception), exception, tb=exc_tb)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 920, in _execute_context
context)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/default.py", line 425, in do_execute
cursor.execute(statement, parameters)
File "/usr/lib/python2.7/dist-packages/MySQLdb/cursors.py", line 174, in execute
self.errorhandler(self, exc, value)
File "/usr/lib/python2.7/dist-packages/MySQLdb/connections.py", line 36, in defaulterrorhandler
raise errorclass, errorvalue
sqlalchemy.exc.OperationalError: (OperationalError) (1054, "Unknown column 'additional_documents' in 'field list'") 'INSERT INTO sourceferc (id, title, url, type, publishdate, scrapedate, ekwhere, summary, docket_no, additional_documents) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)' ('FERCaeff76181cc2bc14651c693d30300b99a7673219', 'The IV Formulation and Linear Approximations of the AC Optimal Power Flow Problem: Optimal Power Flow Paper 2', u'http://www.ferc.gov/industries/electric/indus-act/market-planning/opf-papers/acopf-2-iv-linearization.pdf', 'FERC Staff Reports & Papers - Staff Papers', datetime.datetime(2013, 1, 30, 0, 0), datetime.date(2015, 1, 8), 'Fed', None, None, None)
我认为这不是您可以使用内置蜘蛛信号捕捉到的东西,因为蜘蛛已经完成了它的工作而没有出现错误。稍后在处理项目并将其插入管道中的数据库时出现错误。
您应该考虑哪些选项:
- 修改管道并提高
DropItem("Database error %s" % error)
on error - in this case you can listen for item_dropped
signal
- 因为让数据库与您的 sqlalchemy 模型保持同步,严格来说是项目运行的前提条件(prerequisite/requirement)——最好在初始化步骤就检查这一点,如果有错误,就不允许蜘蛛开始爬取
希望这是有道理的。
我正在尝试向我的 scrapy 扩展程序添加一个信号,以便在 spider_error
信号出现错误时向我发送电子邮件。即使管道中存在错误,蜘蛛信号似乎也没有记录这些错误,或者在项目被抓取并进入管道后蜘蛛不负责?有没有办法可以从扩展中记录这些?这是我的扩展程序的代码,它收集数据库中每个蜘蛛的统计信息,接下来我试图通过电子邮件发送错误,其信号似乎没有触发:
class StatsCollectorExtension(object):
    """Extension that records per-spider run statistics to the database
    on close and emails any tracebacks gathered from spider_error."""

    def __init__(self, stats):
        self.stats = stats
        self.num_errors = 0
        self.errors = []

    @classmethod
    def from_crawler(cls, crawler):
        """Create the extension and wire its handlers onto the crawler."""
        extension = cls(crawler.stats)
        for handler, sig in ((extension.spider_error, signals.spider_error),
                             (extension.spider_closed, signals.spider_closed)):
            crawler.signals.connect(handler, signal=sig)
        return extension

    def spider_closed(self, spider):
        """
        On spider close: persist the run statistics (start time, finish
        time, items scraped, pages crawled) for this scraper, then send
        the collected errors by email if there are any.
        """
        raw = self.stats._stats
        # Persist the run stats via the SQLAlchemy model.
        record = ScrapyStats(
            scrapername=spider.name,
            start_time=raw['start_time'],
            finish_time=raw['finish_time'],
            items_scraped=raw['item_scraped_count'],
            pages_crawled=raw['downloader/request_method_count/GET'])
        db_session.add(record)
        db_session.commit()
        if self.num_errors:
            # Mandrill mail client that sends me an email
            send_mail('%s errors found' % self.num_errors,
                      from_email, from_name,
                      ''.join(self.errors),
                      to_email, to_mail, mandrill_key)

    def spider_error(self, failure, response, spider):
        """Accumulate one traceback per spider_error signal."""
        self.errors.append(failure.getTraceback())
        self.num_errors += 1
还有堆栈跟踪
2015-01-08 13:13:20-0500 [ferc-staff-reports] ERROR: Error processing {'additional_documents': None,
'ekwhere': 'Fed',
'id': 'FERCaeff76181cc2bc14651c693d30300b99a7673219',
'publishdate': datetime.datetime(2013, 1, 30, 0, 0),
'title': 'The IV Formulation and Linear Approximations of the AC Optimal Power Flow Problem: Optimal Power Flow Paper 2',
'type': 'FERC Staff Reports & Papers - Staff Papers',
'url': u'http://www.ferc.gov/industries/electric/indus-act/market-planning/opf-papers/acopf-2-iv-linearization.pdf'}
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/scrapy/middleware.py", line 62, in _process_chain
return process_chain(self.methods[methodname], obj, *args)
File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 65, in process_chain
d.callback(input)
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 382, in callback
self._startRunCallbacks(result)
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 490, in _startRunCallbacks
self._runCallbacks()
--- <exception caught here> ---
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 577, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/home/kiran/workspace/EK-source-scrapers/helpers/pipelines.py", line 88, in process_item
insert_item(item, spider.settings["table"])
File "/home/kiran/workspace/EK-source-scrapers/helpers/db_helper.py", line 54, in insert_item
db_session.commit()
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/scoping.py", line 149, in do
return getattr(self.registry(), name)(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/session.py", line 765, in commit
self.transaction.commit()
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/session.py", line 370, in commit
self._prepare_impl()
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/session.py", line 350, in _prepare_impl
self.session.flush()
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/session.py", line 1879, in flush
self._flush(objects)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/session.py", line 1997, in _flush
transaction.rollback(_capture_exception=True)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/util/langhelpers.py", line 57, in __exit__
compat.reraise(exc_type, exc_value, exc_tb)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/session.py", line 1961, in _flush
flush_context.execute()
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/unitofwork.py", line 370, in execute
rec.execute(self)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/unitofwork.py", line 523, in execute
uow
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/persistence.py", line 64, in save_obj
mapper, table, insert)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/persistence.py", line 562, in _emit_insert_statements
execute(statement, multiparams)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 717, in execute
return meth(self, multiparams, params)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/sql/elements.py", line 317, in _execute_on_connection
return connection._execute_clauseelement(self, multiparams, params)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 814, in _execute_clauseelement
compiled_sql, distilled_params
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 927, in _execute_context
context)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 1076, in _handle_dbapi_exception
exc_info
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/util/compat.py", line 185, in raise_from_cause
reraise(type(exception), exception, tb=exc_tb)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 920, in _execute_context
context)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/default.py", line 425, in do_execute
cursor.execute(statement, parameters)
File "/usr/lib/python2.7/dist-packages/MySQLdb/cursors.py", line 174, in execute
self.errorhandler(self, exc, value)
File "/usr/lib/python2.7/dist-packages/MySQLdb/connections.py", line 36, in defaulterrorhandler
raise errorclass, errorvalue
sqlalchemy.exc.OperationalError: (OperationalError) (1054, "Unknown column 'additional_documents' in 'field list'") 'INSERT INTO sourceferc (id, title, url, type, publishdate, scrapedate, ekwhere, summary, docket_no, additional_documents) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)' ('FERCaeff76181cc2bc14651c693d30300b99a7673219', 'The IV Formulation and Linear Approximations of the AC Optimal Power Flow Problem: Optimal Power Flow Paper 2', u'http://www.ferc.gov/industries/electric/indus-act/market-planning/opf-papers/acopf-2-iv-linearization.pdf', 'FERC Staff Reports & Papers - Staff Papers', datetime.datetime(2013, 1, 30, 0, 0), datetime.date(2015, 1, 8), 'Fed', None, None, None)
我认为这不是您可以使用内置蜘蛛信号捕捉到的东西,因为蜘蛛已经完成了它的工作而没有出现错误。稍后在处理项目并将其插入管道中的数据库时出现错误。
您应该考虑哪些选项:
- 修改管道并提高
DropItem("Database error %s" % error)
on error - in this case you can listen for item_dropped
signal
- 因为让数据库与您的 sqlalchemy 模型保持同步,严格来说是项目运行的前提条件(prerequisite/requirement)——最好在初始化步骤就检查这一点,如果有错误,就不允许蜘蛛开始爬取
希望这是有道理的。