How to ignore robots.txt errors so they don't show up in the logs?

I am working on a crawler and want to crawl politely by obeying robots.txt. Since it is a broad crawl, the log file gets large and hard to work with, and most of the log entries are there only because robots.txt could not be found on most sites. So my question is: is there a way to ignore robots.txt-related errors and not log them, since I don't need to know whether the file was found?

I already have an errback handler for the failed requests of my crawler, but it does not apply to robots.txt, because that request is issued by a Scrapy middleware. Below is my code.

Spider:

import scrapy
from urllib.parse import urlparse

from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TimeoutError, TCPTimedOutError


class MySpider(scrapy.Spider):
    name = 'mobile'

    def start_requests(self):
        urls = [
            'https://site1.com',
            'http://site2.com'
        ]
        for url in urls:
            safe_no = 'test'
            yield scrapy.Request(url=url, callback=self.parse,
                                 errback=self.handle_error, meta={'safe_no': safe_no})

    def parse(self, response):
        safe_no = response.meta['safe_no']
        html_doc = response.body
        text_data, contacts, keep_no = self.get_contact(html_doc, response.url)
        # print(contacts, keep_no)
        link_found = False
        data = []
        parsed_uri = urlparse(response.url)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

        ### Parse data and get contact....

        if contacts:
            yield {
                'safe_no': safe_no,
                'url': response.url,
                'contacts': contacts,
                # 'text_data': text_data
            }

    def handle_error(self, failure):
        if failure.check(HttpError):
            # these exceptions come from HttpError spider middleware
            # you can get the non-200 response
            response = failure.value.response
            self.logger.error('HttpError : "%s"', response.url)

        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.logger.error('DNSLookupError : "%s"', request.url)

        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.error('TimeoutError : "%s"', request.url)

        else:
            request = failure.request
            self.logger.error('Can not connect : "%s" ', request.url)

Below is the log from the crawler:

    2019-01-10 15:33:36 [scrapy.downloadermiddlewares.robotstxt] ERROR: Error downloading <GET http://www.site1.com/robots.txt>: DNS lookup failed: no results for hostname lookup: www.site1.com.
Traceback (most recent call last):
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\internet\defer.py", line 1416, in _inlineCallbacks
    result = result.throwExceptionIntoGenerator(g)
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\python\failure.py", line 491, in throwExceptionIntoGenerator
    return g.throw(self.type, self.value, self.tb)
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\core\downloader\middleware.py", line 43, in process_request
    defer.returnValue((yield download_func(request=request,spider=spider)))
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\internet\defer.py", line 654, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\internet\endpoints.py", line 975, in startConnectionAttempts
    "no results for hostname lookup: {}".format(self._hostStr)

As you can see in the log, the handle_error method does not apply to the /robots.txt request. I did some research and found that the middleware can be configured to ignore certain errors, but I have had no success with that so far.

Here is a small refactoring of your handle_error:

def handle_error(self, failure):
    # this is the original request
    request = failure.request
    if failure.check(DNSLookupError):
        self.logger.error('DNSLookupError : "%s"', request.url)
    elif request.url.endswith('/robots.txt'):
        pass
    elif failure.check(HttpError):
        # these exceptions come from HttpError spider middleware
        # you can get the non-200 response
        response = failure.value.response
        self.logger.error('HttpError : "%s"', response.url)
    elif failure.check(TimeoutError, TCPTimedOutError):
        self.logger.error('TimeoutError : "%s"', request.url)
    else:
        self.logger.error('Can not connect : "%s" ', request.url)

Your log sample shows a DNS lookup error, which IMHO should be logged regardless of the specific URL (and even if not, a DNS failure for robots.txt probably means the whole domain should be skipped).
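
If you do want to go the "skip the whole domain" route, here is a rough sketch of how that could look as a small downloader middleware; the class name and the dead_domains set are purely illustrative, not something Scrapy ships with:

from urllib.parse import urlparse

from scrapy.exceptions import IgnoreRequest
from twisted.internet.error import DNSLookupError


class SkipDeadDomainsMiddleware:
    """Drop requests to hosts whose DNS lookup has already failed once."""

    def __init__(self):
        self.dead_domains = set()

    def process_request(self, request, spider):
        # Refuse anything aimed at a host we already know is dead.
        if urlparse(request.url).netloc in self.dead_domains:
            raise IgnoreRequest(f"Skipping dead domain: {request.url}")

    def process_exception(self, request, exception, spider):
        # Twisted raises DNSLookupError for unresolvable hosts; remember the
        # host so later requests to it (robots.txt included) are refused.
        if isinstance(exception, DNSLookupError):
            self.dead_domains.add(urlparse(request.url).netloc)
        # Returning None lets the failure propagate to the usual error handling.

Enable it in DOWNLOADER_MIDDLEWARES like any other downloader middleware.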

In case anyone else is reading this, the small hack I went with was to copy the base class and comment out the extra details that were being logged:


# Imports mirror Scrapy's built-in robots.txt middleware
import logging

from twisted.internet.defer import Deferred, maybeDeferred

from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import load_object

logger = logging.getLogger(__name__)


class MycrawlerRobotsTxtMiddleware:
    DOWNLOAD_PRIORITY = 1000

    def __init__(self, crawler):
        if not crawler.settings.getbool("CUSTOM_ROBOTSTXT_OBEY"):
            raise NotConfigured
        self._default_useragent = crawler.settings.get("USER_AGENT", "Scrapy")
        self._robotstxt_useragent = crawler.settings.get("ROBOTSTXT_USER_AGENT", None)
        self.crawler = crawler
        self._parsers = {}
        self._parserimpl = load_object(crawler.settings.get("ROBOTSTXT_PARSER"))

        # check if parser dependencies are met, this should throw an error otherwise.
        self._parserimpl.from_crawler(self.crawler, b"")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        if request.meta.get("dont_obey_robotstxt"):
            return
        d = maybeDeferred(self.robot_parser, request, spider)
        d.addCallback(self.process_request_2, request, spider)
        return d

    def process_request_2(self, rp, request, spider):
        if rp is None:
            return

        useragent = self._robotstxt_useragent
        if not useragent:
            useragent = request.headers.get(b"User-Agent", self._default_useragent)
        if not rp.allowed(request.url, useragent):
            logger.debug(
                "Forbidden by robots.txt: %(request)s",
                {"request": request},
                extra={"spider": spider},
            )
            self.crawler.stats.inc_value("robotstxt/forbidden")
            raise IgnoreRequest("Forbidden by robots.txt")

    def robot_parser(self, request, spider):
        url = urlparse_cached(request)
        netloc = url.netloc

        if netloc not in self._parsers:
            self._parsers[netloc] = Deferred()
            robotsurl = f"{url.scheme}://{url.netloc}/robots.txt"
            robotsreq = Request(
                robotsurl,
                priority=self.DOWNLOAD_PRIORITY,
                meta={"dont_obey_robotstxt": True},
            )
            dfd = self.crawler.engine.download(robotsreq, spider)
            dfd.addCallback(self._parse_robots, netloc, spider)
            dfd.addErrback(self._logerror, robotsreq, spider)
            dfd.addErrback(self._robots_error, netloc)
            self.crawler.stats.inc_value("robotstxt/request_count")

        if isinstance(self._parsers[netloc], Deferred):
            d = Deferred()

            def cb(result):
                d.callback(result)
                return result

            self._parsers[netloc].addCallback(cb)
            return d
        else:
            return self._parsers[netloc]

    def _logerror(self, failure, request, spider):
        # if failure.type is not IgnoreRequest:
        #     logger.error(
        #         "Error downloading %(request)s: %(f_exception)s",
        #         {"request": request, "f_exception": failure.value},
        #         exc_info=failure_to_exc_info(failure),
        #         extra={"spider": spider},
        #     )
        if failure.type is not IgnoreRequest:
            logger.error(f"Error downloading robots.txt: {request}")
        return failure

    def _parse_robots(self, response, netloc, spider):
        self.crawler.stats.inc_value("robotstxt/response_count")
        self.crawler.stats.inc_value(
            f"robotstxt/response_status_count/{response.status}"
        )
        rp = self._parserimpl.from_crawler(self.crawler, response.body)
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = rp
        rp_dfd.callback(rp)

    def _robots_error(self, failure, netloc):
        if failure.type is not IgnoreRequest:
            key = f"robotstxt/exception_count/{failure.type}"
            self.crawler.stats.inc_value(key)
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = None
        rp_dfd.callback(None)

Then I added this to settings.py:


# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Custom one written so it doesn't log every 404 response
CUSTOM_ROBOTSTXT_OBEY = True

DOWNLOADER_MIDDLEWARES = {
    ...
    "mycrawler.middlewares.MycrawlerRobotsTxtMiddleware": 100,
}
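
If all you want is to stop these messages rather than change how robots.txt is handled, a lighter-weight option is to raise the level of the stock middleware's logger. This is a minimal sketch, assuming the noisy records really come from the scrapy.downloadermiddlewares.robotstxt logger shown in the traceback above:

import logging

# The "Error downloading <GET .../robots.txt>" records are emitted at ERROR
# level by this logger, so raising its level to CRITICAL drops them while
# leaving the rest of the crawl's logging untouched.
logging.getLogger('scrapy.downloadermiddlewares.robotstxt').setLevel(logging.CRITICAL)

This can go, for example, in the spider's __init__ before the crawl starts.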