使用 Scrapy 1.5 抓取多级菜单
Scrape multilevel menu using Scrapy 1.5
我正在尝试从多级菜单中获取所有链接。
start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']
import scrapy
from foodisgood.items import FoodisgoodItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst
class BbcSpider(CrawlSpider):
    """Crawl bbcgoodfood.com, following category and collection menu links."""

    name = 'bbc'
    allowed_domains = ['bbcgoodfood.com']
    start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']

    # Two rules: one for sub-category pages, one for collection pages.
    # Both restrict extraction to the menu grid articles and follow links.
    rules = (
        Rule(
            LinkExtractor(
                allow=(r'/recipes/category/[\w-]+$'),
                restrict_xpaths='//article[contains(@class, "cleargridindent")]',
            ),
            callback='parse_sub_categories',
            follow=True,
        ),
        Rule(
            LinkExtractor(
                allow=(r'/recipes/collection/[\w-]+$'),
                restrict_xpaths='//article[contains(@class, "cleargridindent")]',
            ),
            callback='parse_collections',
            follow=True,
        ),
    )

    def parse_sub_categories(self, response):
        """Yield an item with the sub-category page title and its URL."""
        loader = ItemLoader(item=FoodisgoodItem(), response=response)
        loader.default_output_processor = TakeFirst()
        loader.add_xpath('category_title', '//h1[@class="section-head--title"]/text()')
        loader.add_value('page_url', response.url)
        yield loader.load_item()

    def parse_collections(self, response):
        """Yield an item with the collection page title and its URL."""
        loader = ItemLoader(item=FoodisgoodItem(), response=response)
        loader.default_output_processor = TakeFirst()
        loader.add_xpath('collection_title', '//h1[@class="section-head--title"]/text()')
        loader.add_value('page_url', response.url)
        yield loader.load_item()
Results of menu scraping
但我无法理解如何在 collection 标题之前填充空的第一列。
现在我有:
空 | 牛排食谱 | https://www.bbcgoodfood.com/recipes/collection/steak
但我需要:
肉类 | 牛排食谱 | https://www.bbcgoodfood.com/recipes/collection/steak
有人可以告诉我需要做什么才能获得第一列中子类别的结果吗?
感谢大家)
使用 CrawlSpider 的规则实际上无法实现您想要的效果(至少无法以简单的方式实现)。
执行此操作的常用方法记录在 Scrapy 文档的 "Passing additional data to callback functions" 一节中。
您需要在第一个回调中提取类别,然后创建一个新请求,通过请求的 meta 字典传递此信息。
我正在尝试从多级菜单中获取所有链接。
start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']
import scrapy
from foodisgood.items import FoodisgoodItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst
class BbcSpider(CrawlSpider):
    """Crawl bbcgoodfood.com, following category and collection menu links."""

    name = 'bbc'
    allowed_domains = ['bbcgoodfood.com']
    start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']

    # Two rules: one for sub-category pages, one for collection pages.
    # Both restrict extraction to the menu grid articles and follow links.
    rules = (
        Rule(
            LinkExtractor(
                allow=(r'/recipes/category/[\w-]+$'),
                restrict_xpaths='//article[contains(@class, "cleargridindent")]',
            ),
            callback='parse_sub_categories',
            follow=True,
        ),
        Rule(
            LinkExtractor(
                allow=(r'/recipes/collection/[\w-]+$'),
                restrict_xpaths='//article[contains(@class, "cleargridindent")]',
            ),
            callback='parse_collections',
            follow=True,
        ),
    )

    def parse_sub_categories(self, response):
        """Yield an item with the sub-category page title and its URL."""
        loader = ItemLoader(item=FoodisgoodItem(), response=response)
        loader.default_output_processor = TakeFirst()
        loader.add_xpath('category_title', '//h1[@class="section-head--title"]/text()')
        loader.add_value('page_url', response.url)
        yield loader.load_item()

    def parse_collections(self, response):
        """Yield an item with the collection page title and its URL."""
        loader = ItemLoader(item=FoodisgoodItem(), response=response)
        loader.default_output_processor = TakeFirst()
        loader.add_xpath('collection_title', '//h1[@class="section-head--title"]/text()')
        loader.add_value('page_url', response.url)
        yield loader.load_item()
Results of menu scraping 但我无法理解如何在 collection 标题之前填充空的第一列。
现在我有:
空 | 牛排食谱 | https://www.bbcgoodfood.com/recipes/collection/steak
但我需要:
肉类 | 牛排食谱 | https://www.bbcgoodfood.com/recipes/collection/steak
有人可以告诉我需要做什么才能获得第一列中子类别的结果吗?
感谢大家)
使用 CrawlSpider 的规则实际上无法实现您想要的效果(至少无法以简单的方式实现)。
执行此操作的常用方法记录在 Scrapy 文档的 "Passing additional data to callback functions" 一节中。
您需要在第一个回调中提取类别,然后创建一个新请求,通过请求的 meta 字典传递此信息。