CrawlSpider to parse and add links from XML pages on the way
I have created a CrawlSpider for my needs and it works fine. However, some (not all) of the categories on the site I am scraping have .xml sitemaps. So I would like the ability to parse the .xml sitemap of those categories, pick up the links from it, and then leave it to the CrawlSpider to visit those links and go deeper.
I know there are SitemapSpider and XMLFeedSpider, but I need the functionality of CrawlSpider combined with that of XMLFeedSpider, and vice versa.
Any help is appreciated.
You can just add a rule to your current CrawlSpider and parse the XML yourself. You only need to add one rule at the top of rules and edit sitemap_xml_xpath in the callback:
import scrapy
import scrapy.linkextractors
import scrapy.spiders.crawl


class SmartlipoSpider(scrapy.spiders.crawl.CrawlSpider):
    name = "myspider"
    start_urls = ('http://example.com/',)
    rules = (
        scrapy.spiders.crawl.Rule(
            scrapy.linkextractors.LinkExtractor(
                allow=r'sitemap\.xml$',
            ),
            callback='parse_sitemap_xml', follow=True,
        ),
        # the other rules...
    )

    def parse_sitemap_xml(self, response):
        # sitemap files use the sitemaps.org namespace, so register it
        # before selecting the <loc> URLs
        response.selector.register_namespace(
            'sm', 'http://www.sitemaps.org/schemas/sitemap/0.9')
        sitemap_xml_xpath = '/sm:urlset/sm:url/sm:loc/text()'
        for url in response.xpath(sitemap_xml_xpath).extract():
            # without an explicit callback these requests go to
            # CrawlSpider.parse, so the other rules are applied to the
            # downloaded pages and the crawl continues deeper from there
            yield scrapy.Request(url)

    # your other callbacks...
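If it helps, here is a hedged sketch of what the "# the other rules..." slot might contain so that the URLs pulled from the sitemap actually get followed deeper. The /product/ pattern and the parse_product callback are placeholders I made up; substitute your real site structure. The fragment goes inside the same SmartlipoSpider class:

    rules = (
        scrapy.spiders.crawl.Rule(
            scrapy.linkextractors.LinkExtractor(
                allow=r'sitemap\.xml$',
            ),
            callback='parse_sitemap_xml', follow=True,
        ),
        # hypothetical rule for the deeper pages reached from the sitemap URLs;
        # adjust the allow pattern to match your own categories
        scrapy.spiders.crawl.Rule(
            scrapy.linkextractors.LinkExtractor(allow=r'/product/'),
            callback='parse_product', follow=True,
        ),
    )

    def parse_product(self, response):
        # placeholder callback: pull out whatever fields you need
        yield {
            'url': response.url,
            'title': response.css('title::text').get(),
        }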
To make the CrawlSpider use the URLs from the sitemaps, you can build a custom link extractor for XML responses, but it looks like CrawlSpider does not process XML responses, so you also need to override _requests_to_follow to accept them.
Here is an example spider I tried, starting from a sitemap.gz URL (a sitemap index):
from scrapy.spiders.crawl import CrawlSpider, Rule
from scrapy.link import Link
from scrapy.http import Request


class XmlLinkExtractor():
    def __init__(self, xpath, namespaces):
        self.xpath = xpath
        self.namespaces = namespaces

    def extract_links(self, response):
        selector = response.selector
        if self.namespaces:
            for prefix, ns in self.namespaces.items():
                selector.register_namespace(prefix, ns)
        for link in selector.xpath(self.xpath).extract():
            yield Link(link)


class ExampleSitemapCrawlSpider(CrawlSpider):
    name = "myspider"
    start_urls = (
        # link to a sitemap index file
        'http://www.example.com/sitemap.gz',
        # link to a sitemap file
        #'http://www.example.com/sitemaps/sitemap-general.xml',
    )
    rules = (
        # this handles sitemap indexes, following links to other sitemaps
        Rule(XmlLinkExtractor('/sm:sitemapindex/sm:sitemap/sm:loc/text()',
                              {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}),),
        # this is for "leaf" pages listed in sitemaps
        Rule(XmlLinkExtractor('/sm:urlset/sm:url/sm:loc/text()',
                              {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}),
             # here, defining the callback without follow=True
             # makes the crawler stop at this level of pages,
             # not following deeper links;
             # unset the callback if you want those pages
             # to go through the other rules once downloaded
             callback='parse_loc'),
        # ... other rules
    )

    def _requests_to_follow(self, response):
        # we need to override `_requests_to_follow` and comment out these
        # 2 lines, because they filter out non-HTML (e.g. XML) responses
        #if not isinstance(response, HtmlResponse):
        #    return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = Request(url=link.url, callback=self._response_downloaded)
                r.meta.update(rule=n, link_text=link.text)
                yield rule.process_request(r)

    def parse_loc(self, response):
        self.logger.debug("parsing %r" % response)
Depending on how you want to parse the pages coming from /urlset/url/loc, you may want to route different URLs to different callbacks: add separate rules and customize XmlLinkExtractor to allow filtering (or filter with XPath).
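For example, here is a minimal sketch of that idea, assuming a variant of the XmlLinkExtractor above with an extra allow regex argument (my own addition, not part of Scrapy) and made-up /category/ and /product/ URL patterns:

import re

from scrapy.link import Link
from scrapy.spiders.crawl import Rule


class FilteringXmlLinkExtractor:
    # same idea as XmlLinkExtractor above, plus an optional `allow` regex
    def __init__(self, xpath, namespaces, allow=None):
        self.xpath = xpath
        self.namespaces = namespaces
        self.allow = re.compile(allow) if allow else None

    def extract_links(self, response):
        selector = response.selector
        if self.namespaces:
            for prefix, ns in self.namespaces.items():
                selector.register_namespace(prefix, ns)
        for url in selector.xpath(self.xpath).extract():
            # keep only the URLs matching the filter, if one was given
            if self.allow is None or self.allow.search(url):
                yield Link(url)


SITEMAP_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}

# inside the spider: one rule per URL pattern, each with its own callback
# (the /category/ and /product/ patterns are hypothetical)
rules = (
    Rule(FilteringXmlLinkExtractor('/sm:urlset/sm:url/sm:loc/text()',
                                   SITEMAP_NS, allow=r'/category/'),
         callback='parse_category'),
    Rule(FilteringXmlLinkExtractor('/sm:urlset/sm:url/sm:loc/text()',
                                   SITEMAP_NS, allow=r'/product/'),
         callback='parse_product'),
)

Whichever variant you use, these rules only see the sitemap XML if _requests_to_follow is overridden as shown above, since that is what lets non-HTML responses reach the link extractors.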