Scrapy CrawlSpider: Getting data before extracting link

In a CrawlSpider, how can I scrape the field marked in the image ("4 days ago") before each link is extracted? The CrawlSpider below works fine, but in 'parse_item' I want to add a new field called 'Add posted', which is the value marked in the image.

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class PropertySpider(CrawlSpider):
    name = 'property'
    
    allowed_domains = ['www.openrent.co.uk']
    start_urls = [
        'https://www.openrent.co.uk/properties-to-rent/london?term=London&skip='+ str(x) for x in range(0, 5, 20)
        ]

    rules = (
        Rule(LinkExtractor(restrict_xpaths="//div[@id='property-data']/a"), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        yield {
            'Title': response.xpath("//h1[@class='property-title']/text()").get(),
            'Price': response.xpath("//h3[@class='perMonthPrice price-title']/text()").get(),
            'Links': response.url,
            'Add posted': None,  # <-- how do I get the "4 days ago" value here?
        }

To loop over the listings, you can pick up that data point with the following xpath:

# Inside a callback that receives the listing (search results) page:
x = response.xpath('//div[@class="timeStamp"]')
for i in x:
    yield {'result': i.xpath("./i/following-sibling::text()").get().strip()}
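
As a minimal sketch of where that fragment would sit, assuming a plain Spider and the same listing URL as above (the spider name and class are illustrative), it runs inside the callback that receives the search-results page:

import scrapy


class TimeStampSpider(scrapy.Spider):
    # Illustrative name; any spider name works here.
    name = 'timestamps'
    start_urls = ['https://www.openrent.co.uk/properties-to-rent/london?term=London']

    def parse(self, response):
        # Each listing on the search page carries a "timeStamp" div; the text
        # after the <i> icon holds the age of the listing, e.g. "4 days ago".
        for stamp in response.xpath('//div[@class="timeStamp"]'):
            yield {'result': (stamp.xpath("./i/following-sibling::text()").get() or "").strip()}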

When you use the Rule object of a scrapy crawl spider, the text of the extracted link is saved in a meta field of the request named link_text. You can retrieve this value in the parse_item method and pull out the time information with a regular expression. You can read more about this in the docs. See the example below.

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re

class PropertySpider(CrawlSpider):
    name = 'property'
    
    allowed_domains = ['www.openrent.co.uk']
    start_urls = [
        'https://www.openrent.co.uk/properties-to-rent/london?term=London&skip='+ str(x) for x in range(0, 5, 20)
        ]

    rules = (
        Rule(LinkExtractor(restrict_xpaths="//div[@id='property-data']/a"), callback='parse_item', follow=True),
    )


    def parse_item(self, response):
        # The Rule's LinkExtractor stores the text of the followed link in the
        # request meta under "link_text".
        link_text = response.request.meta.get("link_text", "")
        posted = None
        m = re.search(r"(Last Updated.*ago)", link_text)
        if m:
            # Normalize the non-breaking spaces, e.g. "Last Updated 4 days ago".
            posted = m.group(1).replace("\xa0", " ")

        yield {
            'Title': response.xpath("//h1[@class='property-title']/text()").get(),
            'Price': response.xpath("//h3[@class='perMonthPrice price-title']/text()").get(),
            'Links': response.url,
            "Add posted": posted
        }
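
For illustration, assuming the anchor text stored in link_text looks roughly like the string below (the exact wording depends on the listing markup at crawl time), the regex isolates the "Last Updated … ago" portion and the replace() cleans up the non-breaking spaces:

import re

# Hypothetical anchor text as it might appear in request.meta["link_text"].
link_text = "2 Bed Flat, Whitechapel, E1\xa0\xa0Last Updated 4\xa0days ago"

m = re.search(r"(Last Updated.*ago)", link_text)
posted = m.group(1).replace("\xa0", " ") if m else None
print(posted)  # -> Last Updated 4 days ago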