How to ignore robots.txt errors so they don't show up in the logs?
I'm working on a crawler and want to crawl politely by obeying robots.txt. Because it is a broad crawl, the log file grows large and becomes hard to work with, and most of the log entries come from robots.txt not being found on most of the sites.
So my question is: is there a way to ignore robots.txt-related errors and not log them? I don't need to know whether the file was found.
I already have an errback handler for my crawler's failed requests, but it doesn't apply to robots.txt, because that request is made by the Scrapy middleware.
Here is my code.
Spider:
import scrapy
from urllib.parse import urlparse
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TimeoutError, TCPTimedOutError


class MySpider(scrapy.Spider):
    name = 'mobile'

    def start_requests(self):
        urls = [
            'https://site1.com',
            'http://site2.com'
        ]
        for url in urls:
            safe_no = 'test'
            yield scrapy.Request(url=url, callback=self.parse,
                                 errback=self.handle_error, meta={'safe_no': safe_no})

    def parse(self, response):
        safe_no = response.meta['safe_no']
        html_doc = response.body
        text_data, contacts, keep_no = self.get_contact(html_doc, response.url)
        # print(contacts, keep_no)
        link_found = False
        data = []
        parsed_uri = urlparse(response.url)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        # Parse data and get contact....
        if contacts:
            yield {
                'safe_no': safe_no,
                'url': response.url,
                'contacts': contacts,
                # 'text_data': text_data
            }

    def handle_error(self, failure):
        if failure.check(HttpError):
            # these exceptions come from HttpError spider middleware
            # you can get the non-200 response
            response = failure.value.response
            self.logger.error('HttpError : "%s"', response.url)
        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.logger.error('DNSLookupError : "%s"', request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.error('TimeoutError : "%s"', request.url)
        else:
            request = failure.request
            self.logger.error('Can not connect : "%s"', request.url)
Here is the crawler's log:
2019-01-10 15:33:36 [scrapy.downloadermiddlewares.robotstxt] ERROR: Error downloading <GET http://www.site1.com/robots.txt>: DNS lookup failed: no results for hostname lookup: www.site1.com.
Traceback (most recent call last):
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\internet\defer.py", line 1416, in _inlineCallbacks
    result = result.throwExceptionIntoGenerator(g)
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\python\failure.py", line 491, in throwExceptionIntoGenerator
    return g.throw(self.type, self.value, self.tb)
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\core\downloader\middleware.py", line 43, in process_request
    defer.returnValue((yield download_func(request=request,spider=spider)))
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\internet\defer.py", line 654, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\internet\endpoints.py", line 975, in startConnectionAttempts
    "no results for hostname lookup: {}".format(self._hostStr)
As you can see in the log, the handle_error method is not applied to the /robots.txt request.
I did some research and found that the middleware can be configured to ignore certain errors, but so far I haven't had any success with it.
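(A sketch of one such configuration, assuming the only goal is to keep these entries out of the log: the messages come from the scrapy.downloadermiddlewares.robotstxt logger, whose name is visible in the log excerpt above, so raising that logger's level, e.g. in the spider's __init__, hides them without touching the middleware itself.)

import logging

# The stock robots.txt middleware logs through a logger named after its module;
# raising its level hides the "Error downloading <GET .../robots.txt>" entries
# while leaving all other Scrapy logging untouched.
logging.getLogger("scrapy.downloadermiddlewares.robotstxt").setLevel(logging.CRITICAL)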
Here is a small refactoring of your handle_error:
def handle_error(self, failure):
    # this is the original request
    request = failure.request
    if failure.check(DNSLookupError):
        self.logger.error('DNSLookupError : "%s"', request.url)
    elif request.url.endswith('/robots.txt'):
        pass
    elif failure.check(HttpError):
        # these exceptions come from HttpError spider middleware
        # you can get the non-200 response
        response = failure.value.response
        self.logger.error('HttpError : "%s"', response.url)
    elif failure.check(TimeoutError, TCPTimedOutError):
        self.logger.error('TimeoutError : "%s"', request.url)
    else:
        self.logger.error('Can not connect : "%s"', request.url)
Your log sample shows a DNS lookup error, which IMHO should be logged regardless of the specific URL: even if it only fails for robots.txt, it probably means the whole domain should be skipped.
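If you want to act on that, here is a minimal sketch of the idea; the dead_domains set and the should_follow helper are hypothetical additions, not part of the spider above:

import scrapy
from urllib.parse import urlparse
from twisted.internet.error import DNSLookupError


class MySpider(scrapy.Spider):
    name = 'mobile'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # hypothetical: domains whose DNS lookup has already failed once
        self.dead_domains = set()

    def handle_error(self, failure):
        request = failure.request
        if failure.check(DNSLookupError):
            # one failed lookup usually means the whole domain is unreachable,
            # so remember it and skip it for the rest of the crawl
            self.dead_domains.add(urlparse(request.url).netloc)
            self.logger.error('DNSLookupError : "%s"', request.url)

    def should_follow(self, url):
        # hypothetical helper: check this before yielding follow-up requests
        return urlparse(url).netloc not in self.dead_domains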
In case anyone else comes across this, the quick hack I settled on was to copy the base class and comment out the extra detail that was being printed:
import logging

from twisted.internet.defer import Deferred, maybeDeferred
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import load_object

logger = logging.getLogger(__name__)


class MycrawlerRobotsTxtMiddleware:
    DOWNLOAD_PRIORITY = 1000

    def __init__(self, crawler):
        if not crawler.settings.getbool("CUSTOM_ROBOTSTXT_OBEY"):
            raise NotConfigured
        self._default_useragent = crawler.settings.get("USER_AGENT", "Scrapy")
        self._robotstxt_useragent = crawler.settings.get("ROBOTSTXT_USER_AGENT", None)
        self.crawler = crawler
        self._parsers = {}
        self._parserimpl = load_object(crawler.settings.get("ROBOTSTXT_PARSER"))
        # check if parser dependencies are met, this should throw an error otherwise.
        self._parserimpl.from_crawler(self.crawler, b"")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        if request.meta.get("dont_obey_robotstxt"):
            return
        d = maybeDeferred(self.robot_parser, request, spider)
        d.addCallback(self.process_request_2, request, spider)
        return d

    def process_request_2(self, rp, request, spider):
        if rp is None:
            return
        useragent = self._robotstxt_useragent
        if not useragent:
            useragent = request.headers.get(b"User-Agent", self._default_useragent)
        if not rp.allowed(request.url, useragent):
            logger.debug(
                "Forbidden by robots.txt: %(request)s",
                {"request": request},
                extra={"spider": spider},
            )
            self.crawler.stats.inc_value("robotstxt/forbidden")
            raise IgnoreRequest("Forbidden by robots.txt")

    def robot_parser(self, request, spider):
        url = urlparse_cached(request)
        netloc = url.netloc

        if netloc not in self._parsers:
            self._parsers[netloc] = Deferred()
            robotsurl = f"{url.scheme}://{url.netloc}/robots.txt"
            robotsreq = Request(
                robotsurl,
                priority=self.DOWNLOAD_PRIORITY,
                meta={"dont_obey_robotstxt": True},
            )
            dfd = self.crawler.engine.download(robotsreq, spider)
            dfd.addCallback(self._parse_robots, netloc, spider)
            dfd.addErrback(self._logerror, robotsreq, spider)
            dfd.addErrback(self._robots_error, netloc)
            self.crawler.stats.inc_value("robotstxt/request_count")

        if isinstance(self._parsers[netloc], Deferred):
            d = Deferred()

            def cb(result):
                d.callback(result)
                return result

            self._parsers[netloc].addCallback(cb)
            return d
        else:
            return self._parsers[netloc]

    def _logerror(self, failure, request, spider):
        # if failure.type is not IgnoreRequest:
        #     logger.error(
        #         "Error downloading %(request)s: %(f_exception)s",
        #         {"request": request, "f_exception": failure.value},
        #         exc_info=failure_to_exc_info(failure),
        #         extra={"spider": spider},
        #     )
        if failure.type is not IgnoreRequest:
            logger.error(f"Error downloading robots.txt: {request}")
        return failure

    def _parse_robots(self, response, netloc, spider):
        self.crawler.stats.inc_value("robotstxt/response_count")
        self.crawler.stats.inc_value(
            f"robotstxt/response_status_count/{response.status}"
        )
        rp = self._parserimpl.from_crawler(self.crawler, response.body)
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = rp
        rp_dfd.callback(rp)

    def _robots_error(self, failure, netloc):
        if failure.type is not IgnoreRequest:
            key = f"robotstxt/exception_count/{failure.type}"
            self.crawler.stats.inc_value(key)
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = None
        rp_dfd.callback(None)
Then I added this to settings.py:
# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Custom one written so it doesn't log every 404 response
CUSTOM_ROBOTSTXT_OBEY = True

DOWNLOADER_MIDDLEWARES = {
    ...
    "mycrawler.middlewares.MycrawlerRobotsTxtMiddleware": 100,
}
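Priority 100 is the slot where the stock scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware normally sits, and with ROBOTSTXT_OBEY = False the stock middleware raises NotConfigured and stays disabled, so only the custom copy runs. If you prefer to make that explicit rather than rely on the setting alone, the stock middleware can also be switched off in the same dict; a small, admittedly redundant tweak:

DOWNLOADER_MIDDLEWARES = {
    # redundant while ROBOTSTXT_OBEY is False, but makes the intent obvious
    "scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": None,
    "mycrawler.middlewares.MycrawlerRobotsTxtMiddleware": 100,
}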