Scraping information from previous pages using LinkExtractors

I was wondering if it is possible to scrape information from previous pages using LinkExtractors. This question is related to my previous question.

I have taken the answer to that question and changed the xpath for the country. The xpath provided scrapes the countries from the first page.

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader

class ZooplasItem(scrapy.Item):
    stuff = Field()
    country = Field()

class ZooplasSpider(CrawlSpider):
    name = 'zooplas'
    allowed_domains = ['zoopla.co.uk']
    start_urls = ['https://www.zoopla.co.uk/overseas/']

    rules = (
        Rule(LinkExtractor(restrict_css='a.link-novisit'), follow=True), # follow the countries links
        Rule(LinkExtractor(restrict_css='div.paginate'), follow=True), # follow pagination links
        Rule(LinkExtractor(restrict_xpaths="//a[contains(@class,'listing-result')]"), callback='parse_item', follow=True), # follow the link to actual property listing
    )

    def parse_item(self, response):
        # here you are on the details page for each property
        loader = ItemLoader(ZooplasItem(), response=response)
        loader.default_output_processor = TakeFirst()
        loader.add_xpath("stuff", "//article[@class='dp-sidebar-wrapper__summary']//h1//text()")
        loader.add_xpath("country","(//ul[@class='list-inline list-unstyled'])[1]//li//a//text()")
        yield loader.load_item()

if __name__ == '__main__':
    process = CrawlerProcess(
        settings = {
            'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36',
            'FEEDS': {
                'zoopla.jl': {
                    'format': 'jsonlines'
                }
            }
        }
    )
    process.crawl(ZooplasSpider)
    process.start()

However, this prints out the following output:

'country':'(//ul[@class='list-inline list-unstyled'])[1]//li//a//text()'

CrawlSpider is intended for cases where you want to automatically follow links that match certain patterns. If you want to grab information from previous pages, you have to parse each page individually and pass the information along via the meta request argument or the cb_kwargs argument. You can add any information to the meta values in any of the parse methods.
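
For illustration, here is a minimal sketch of the cb_kwargs alternative. It assumes the country name is the text of the a.link-novisit links used in the first spider's rules; the spider and callback names are only illustrative:

import scrapy

class CbKwargsSketchSpider(scrapy.Spider):
    # illustrative name only
    name = 'cb_kwargs_sketch'
    allowed_domains = ['zoopla.co.uk']
    start_urls = ['https://www.zoopla.co.uk/overseas/']

    def parse(self, response):
        for link in response.css('a.link-novisit'):
            # assumption: the country name is the link text
            country = link.css('::text').get()
            # pass the value to the next callback as a keyword argument instead of via meta
            yield response.follow(link, callback=self.parse_country,
                                  cb_kwargs={'country': country})

    def parse_country(self, response, country):
        # the value arrives as a regular function argument
        self.logger.info('country passed from previous page: %s', country)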

I have refactored the code above to use a plain scrapy Spider class, pass the country value from the first page in the meta keyword, and then capture it in the subsequent parse methods.

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader

class ZooplasItem(scrapy.Item):
    stuff = Field()
    country = Field()

class ZooplasSpider(scrapy.Spider):
    name = 'zooplas'
    allowed_domains = ['zoopla.co.uk']
    start_urls = ['https://www.zoopla.co.uk/overseas/']

    def parse(self, response):
        # grab the countries links and follow them
        # you can do some other parsing here and pass the information down to the subsequent parse methods
        for link in response.xpath("(//ul[@class='list-inline list-unstyled'])[1]/li"):
            country = link.xpath("./h4/a/text()").get()
            url = link.xpath("./h4/a/@href").get()
            yield response.follow(url, meta={"country": country}, callback=self.parse_country)

    def parse_country(self, response):
        # follow link to individual listing and pass the country value in the meta object
        # you can pass any information in the meta dictionary
        for link in response.xpath("//a[contains(@class,'listing-result')]"):
            yield response.follow(link, meta={"country": response.meta.get("country")}, callback=self.parse_item)

        # follow pagination links
        next_page = response.xpath("//a[contains(text(),'Next')]/@href").get()
        if next_page:
            yield response.follow(next_page, meta={"country": response.meta.get("country")}, callback=self.parse_country) # yield the request and keep forwarding the country

    def parse_item(self, response):
        # here you are on the details page for each property
        loader = ItemLoader(ZooplasItem(), response=response)
        loader.default_output_processor = TakeFirst()
        loader.add_xpath("stuff", "//article[@class='dp-sidebar-wrapper__summary']//h1//text()")
        loader.add_value("country",response.meta.get('country')) # retrieve the country name from the 
        yield loader.load_item()

if __name__ == '__main__':
    process = CrawlerProcess(
        settings = {
            'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36',
            'FEEDS': {
                'zoopla.jl': {
                    'format': 'jsonlines'
                }
            }
        }
    )
    process.crawl(ZooplasSpider)
    process.start()
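
Note that the country value scraped on the very first page has to travel with every subsequent request: it is passed from parse to parse_country and again from parse_country to parse_item via meta, and it is also forwarded on the pagination request so listings beyond the first results page still carry it. The cb_kwargs sketch above would work the same way, with the value arriving as a callback keyword argument instead.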