How can I read all the logs in a middleware?
I have about 100 spiders on my server. Every morning they all start crawling and write their output to their log files. Occasionally a few of them throw an error. When a spider fails, I have to log in to the server and read its log file, but I would rather receive the logs by mail.

I have set up a dynamic mail sender as follows:
from scrapy import signals
from django.conf import settings as djsettings
from django.core.mail import send_mail


class FirstBotSpiderMiddleware:
    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        s = cls(crawler.stats)
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

    def spider_closed(self, spider, reason):
        error_count = self.stats.get_value('log_count/ERROR')
        counts = self.stats.get_value('item_scraped_count')
        count_403 = self.stats.get_value('downloader/response_status_count/403')
        count_404 = self.stats.get_value('downloader/response_status_count/404')
        robots_404 = self.stats.get_value('robotstxt/response_status_count/404')
        robots_403 = self.stats.get_value('robotstxt/response_status_count/403')
        duplicate_count = self.stats.get_value('item_dropped_count')
        # I want to read all logs here ('logs' below is the part I am missing)
        content = "some stat string"
        self.mailSender(spider.name, content, logs)

    def mailSender(self, spider, content, logs):
        send_mail(
            "Scrapy " + spider + " done",
            content,
            djsettings.EMAIL_HOST_USER,
            ['xxx@xxx.com'],
        )
I don't know how to read the error logs dynamically at spider_closed in the middleware. Do you have any suggestions?
I implemented a similar approach in my web-scraping module. Here is the implementation, which you can review and adapt.
import datetime
import gzip
from collections import defaultdict

from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.mail import MailSender
from scrapy.utils.serialize import ScrapyJSONEncoder

try:
    from cStringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3


def format_size(size):
    """Render a byte count as a human-readable string."""
    for x in ['bytes', 'KB', 'MB', 'GB']:
        if size < 1024.0:
            return "%3.1f %s" % (size, x)
        size /= 1024.0


class GzipCompressor(gzip.GzipFile):
    extension = '.gz'
    mimetype = 'application/gzip'

    def __init__(self):
        # Compress into an in-memory buffer so the result can be read
        # back and attached to the mail. Note: on Python 3, gzip writes
        # bytes, so PlainCompressor would need to be based on io.BytesIO
        # instead of StringIO for this class to work there.
        super(GzipCompressor, self).__init__(
            fileobj=PlainCompressor(), mode='w')
        self.read = self.fileobj.read


class PlainCompressor(StringIO):
    extension = ''
    mimetype = 'text/plain'

    def read(self, *args, **kwargs):
        self.seek(0)
        return StringIO.read(self, *args, **kwargs)

    @property
    def size(self):
        return len(self.getvalue())


class StatusMailer(object):
    """Collect per-spider logs and items in memory and mail them on close."""

    def __init__(self, recipients, mail, compressor, crawler):
        self.recipients = recipients
        self.mail = mail
        self.encoder = ScrapyJSONEncoder()
        self.files = defaultdict(compressor)
        self.num_items = 0
        self.num_errors = 0
        self.start_time = datetime.datetime.now()

    @classmethod
    def from_crawler(cls, crawler):
        recipients = crawler.settings.getlist('STATUSMAILER_RECIPIENTS')
        compression = crawler.settings.get('STATUSMAILER_COMPRESSION')

        if not compression:
            compressor = PlainCompressor
        elif compression.lower().startswith('gz'):
            compressor = GzipCompressor
        else:
            raise NotConfigured

        if not recipients:
            raise NotConfigured

        mail = MailSender.from_settings(crawler.settings)
        instance = cls(recipients, mail, compressor, crawler)

        crawler.signals.connect(instance.item_scraped,
                                signal=signals.item_scraped)
        crawler.signals.connect(instance.spider_error,
                                signal=signals.spider_error)
        crawler.signals.connect(instance.spider_closed,
                                signal=signals.spider_closed)

        return instance

    def item_scraped(self, item, response, spider):
        self.num_items += 1
        self.files[spider.name + '.log'].write(
            str(self.num_items) + " " + str(response.url) + '\n')
        self.files[spider.name + '-items.json'].write(
            self.encoder.encode(item))

    def spider_error(self, failure, response, spider):
        # Capture the full traceback of every spider callback error.
        self.files[spider.name + '.log'].write(failure.getTraceback())
        self.num_errors += 1

    def spider_closed(self, spider, reason):
        # Turn every in-memory buffer into a mail attachment.
        files = []
        for name, compressed in self.files.items():
            files.append((name + compressed.extension,
                          compressed.mimetype, compressed))

        try:
            size = self.files[spider.name + '-items.json'].size
        except KeyError:
            size = 0

        body = '''Crawl statistics:
 - Spider name: {0}
 - Spider started at: {1}
 - Spider finished at: {2}
 - Number of items scraped: {3}
 - Number of errors: {4}
 - Size of scraped items: {5}'''.format(
            spider.name,
            self.start_time,
            datetime.datetime.now(),
            self.num_items,
            self.num_errors,
            format_size(size)
        )

        return self.mail.send(
            to=self.recipients,
            subject='Crawler for %s: %s' % (spider.name, reason),
            body=body,
            attachs=files
        )
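To enable this in a project, register the class as a Scrapy extension and configure the mail settings. A minimal sketch, assuming the class lives in a hypothetical module mybot/statusmailer.py (the module path, recipients, and SMTP values are placeholders to adapt):

# settings.py
EXTENSIONS = {
    'mybot.statusmailer.StatusMailer': 500,
}

STATUSMAILER_RECIPIENTS = ['you@example.com']
STATUSMAILER_COMPRESSION = 'gzip'  # leave unset for plain-text attachments

# Standard Scrapy mail settings read by MailSender.from_settings()
MAIL_HOST = 'smtp.example.com'
MAIL_PORT = 587
MAIL_USER = 'smtp-user'
MAIL_PASS = 'smtp-password'
MAIL_FROM = 'scrapy@example.com'

Each spider then sends one mail when it closes, with the crawl statistics in the body and its log and item buffers as attachments, so you can see which of your spiders failed without logging in to the server.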
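If you want the actual log records rather than the per-signal tracebacks collected above, another option is to attach an in-memory logging handler when the spider opens and read it back in spider_closed. This is a minimal sketch of the idea, not part of my module; the name LogCollectorMiddleware and the ERROR-only filter are illustrative assumptions:

import logging
from io import StringIO

from scrapy import signals


class LogCollectorMiddleware:
    """Buffer every ERROR-level log record in memory while the spider runs."""

    @classmethod
    def from_crawler(cls, crawler):
        mw = cls()
        crawler.signals.connect(mw.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(mw.spider_closed, signal=signals.spider_closed)
        return mw

    def spider_opened(self, spider):
        self.buffer = StringIO()
        self.handler = logging.StreamHandler(self.buffer)
        self.handler.setLevel(logging.ERROR)
        # Attach to the root logger so records from the spider, the
        # downloader and the pipelines are all captured.
        logging.getLogger().addHandler(self.handler)

    def spider_closed(self, spider, reason):
        logging.getLogger().removeHandler(self.handler)
        logs = self.buffer.getvalue()  # everything logged at ERROR or above
        # 'logs' is the string the question's mailSender() was missing.
        return logs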