Scrapy: store broken external links and discard the rest
I want Scrapy to store only broken external links (response codes other than 200, 301 or 302), but I'm stuck: the script keeps storing every external link in the output file. This is what I'm using:
    @staticmethod
    def remote_file_to_array(url):
        # Read a remote plain-text URL list, one URL per line, skipping blank lines
        return filter(None, urllib2.urlopen(url).read().splitlines())

    @staticmethod
    def sitemap_to_array(url):
        # Collect every <loc> entry from a remote XML sitemap
        results = []
        body = urllib2.urlopen(url).read()
        sitemap = Sitemap(body)
        for item in sitemap:
            results.append(item['loc'])
        return results

    def start_requests(self):
        target_domain = self.arg_target_domain
        print 'Target domain: ', target_domain

        # Follow internal links; send external links to parse_item
        self.rules = (
            Rule(LinkExtractor(allow_domains=[target_domain], unique=True),
                 follow=True),
            Rule(LinkExtractor(unique=True),
                 callback='parse_item',
                 process_links='clean_links',
                 follow=False),
        )
        self._compile_rules()

        start_urls = []
        if self.arg_start_urls.endswith('.xml'):
            print 'Sitemap detected!'
            start_urls = self.sitemap_to_array(self.arg_start_urls)
        elif self.arg_start_urls.endswith('.txt'):
            print 'Remote url list detected!'
            start_urls = self.remote_file_to_array(self.arg_start_urls)
        else:
            start_urls = [self.arg_start_urls]

        print 'Start url count: ', len(start_urls)
        first_url = start_urls[0]
        print 'First url: ', first_url

        for url in start_urls:
            yield scrapy.Request(url, dont_filter=True)

    def clean_links(self, links):
        for link in links:
            # Drop the fragment part of each extracted link
            link.fragment = ''
            link.url = link.url.split('#')[0]
            yield link

    def parse_item(self, response):
        item = BrokenLinksItem()
        item['url'] = response.url
        item['status'] = response.status
        yield item
You need to pass an errback argument on the Request object. It works like callback, but it is invoked for responses whose status is not accepted.

I'm not sure whether this can also be done through rules; otherwise you will need to define your own behavior.
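A minimal sketch of that approach, assuming the spider keeps the BrokenLinksItem from the question; the handle_error name is illustrative. With Scrapy's default HttpErrorMiddleware enabled, a non-2xx response raises HttpError, which reaches the errback as a twisted Failure, and in recent Scrapy versions items yielded from an errback are processed like callback output:

import scrapy
from scrapy.spidermiddlewares.httperror import HttpError

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse_item,
                                 errback=self.handle_error, dont_filter=True)

    def handle_error(self, failure):
        # failure.request always holds the original request
        if failure.check(HttpError):
            # A response came back with a non-2xx status: record it as broken
            response = failure.value.response
            item = BrokenLinksItem()
            item['url'] = response.url
            item['status'] = response.status
            yield item
        else:
            # DNS errors, timeouts, etc. carry no response object
            self.logger.error('Request failed: %s', failure.request.url)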
Your best option is to use a downloader middleware to log the responses you are interested in.
from twisted.internet import defer
from twisted.internet.error import (ConnectError, ConnectionDone, ConnectionLost, ConnectionRefusedError,
                                    DNSLookupError, TCPTimedOutError, TimeoutError,)
from twisted.web.client import ResponseFailed


class BrokenLinkMiddleware(object):

    # Statuses that are considered fine and therefore ignored
    ignore_http_status_codes = [200, 301, 302]

    # Download exceptions that are worth recording
    exceptions_to_log = (ConnectError, ConnectionDone, ConnectionLost, ConnectionRefusedError, DNSLookupError,
                         IOError, ResponseFailed, TCPTimedOutError, TimeoutError, defer.TimeoutError)

    def process_response(self, request, response, spider):
        if response.status not in self.ignore_http_status_codes:
            # Do your logging here: response.url has the url,
            # response.status has the status.
            pass
        return response

    def process_exception(self, request, exception, spider):
        if isinstance(exception, self.exceptions_to_log):
            # Do your logging here
            pass
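To have Scrapy actually run it, the middleware also needs to be enabled in the project settings; the module path and priority below are assumptions for a typical project layout:

# settings.py (module path and priority are placeholders, adjust to your project)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.BrokenLinkMiddleware': 543,
}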
That handles some exceptions that may not indicate a broken link (such as ConnectError, TimeoutError and TCPTimedOutError), but you may still want to log them.