Scrapy CrawlSpider:在提取链接之前获取数据
Scrapy CrawlSpider: Getting data before extracting link
在 CrawlSpider 中,如何在提取每个 link 之前抓取图像中标记的字段“4 天前”?
下面的 CrawlSpider 可以正常工作。但我想在 'parse_item' 中添加一个名为 'Add posted' 的新字段,其值就是图片中标记的那个字段。
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# Spider from the question: starts from the OpenRent London search page and
# follows the property links found on it.
class PropertySpider(CrawlSpider):
name = 'property'
allowed_domains = ['www.openrent.co.uk']
# range(0, 5, 20) produces only 0, so exactly one start URL (skip=0) is built.
start_urls = [
'https://www.openrent.co.uk/properties-to-rent/london?term=London&skip='+ str(x) for x in range(0, 5, 20)
]
# Extract links matching //div[@id='property-data']/a and pass each fetched
# page to parse_item; follow=True keeps applying the rule to those pages too.
rules = (
Rule(LinkExtractor(restrict_xpaths="//div[@id='property-data']/a"), callback='parse_item', follow=True),
)
# Yields one dict per fetched property page with its title, price and URL.
def parse_item(self, response):
yield {
'Title': response.xpath("//h1[@class='property-title']/text()").get(),
'Price': response.xpath("//h3[@class='perMonthPrice price-title']/text()").get(),
'Links': response.url,
# Placeholder from the question: the value to put here is what is being
# asked for; `?` is not valid Python.
'Add posted': ?
}
要遍历各条结果,您可以使用以下 XPath 获取该数据点:
x = response.xpath('//div[@class="timeStamp"]')
for i in x:
yield {'result': i.xpath("./i/following-sibling::text()").get().strip() }
使用 Scrapy CrawlSpider 的 Rule 对象时,所提取链接的文本会保存在请求 meta 中名为 link_text 的字段里。您可以在 parse_item 方法中读取该值,并用正则表达式提取时间信息。更多细节请参阅 Scrapy 文档。请参阅下面的示例。
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
class PropertySpider(CrawlSpider):
    """Crawl OpenRent London search results and emit one item per property.

    Each followed link's anchor text is read back from the request meta
    (key ``link_text``) so that the "Last Updated ... ago" text shown next to
    the link can be attached to the scraped item.
    """

    name = 'property'
    allowed_domains = ['www.openrent.co.uk']
    # range(0, 5, 20) yields only 0, so a single start URL (skip=0) is built;
    # raise the stop value to request further pages in steps of 20.
    start_urls = [
        'https://www.openrent.co.uk/properties-to-rent/london?term=London&skip=' + str(x)
        for x in range(0, 5, 20)
    ]

    rules = (
        Rule(
            LinkExtractor(restrict_xpaths="//div[@id='property-data']/a"),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield a dict with the title, price, URL and posted text of a page.

        ``'Add posted'`` is ``None`` when the request carries no link text or
        the text contains no "Last Updated ... ago" fragment.
        """
        # The meta key may be absent; fall back to an empty string so that
        # re.search never receives None.
        link_text = response.request.meta.get("link_text") or ""
        match = re.search(r"(Last Updated.*ago)", link_text)
        # Always bind `posted`: previously it was only assigned on a match,
        # which raised UnboundLocalError at the yield otherwise.
        posted = match.group(1).replace("\xa0", " ") if match else None
        yield {
            'Title': response.xpath("//h1[@class='property-title']/text()").get(),
            'Price': response.xpath("//h3[@class='perMonthPrice price-title']/text()").get(),
            'Links': response.url,
            "Add posted": posted,
        }
在 CrawlSpider 中,如何在提取每个链接之前抓取图片中标记的字段“4 天前”?下面的 CrawlSpider 可以正常工作。但我想在 'parse_item' 中添加一个名为 'Add posted' 的新字段,其值就是图片中标记的那个字段。
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# Spider from the question: starts from the OpenRent London search page and
# follows the property links found on it.
class PropertySpider(CrawlSpider):
name = 'property'
allowed_domains = ['www.openrent.co.uk']
# range(0, 5, 20) produces only 0, so exactly one start URL (skip=0) is built.
start_urls = [
'https://www.openrent.co.uk/properties-to-rent/london?term=London&skip='+ str(x) for x in range(0, 5, 20)
]
# Extract links matching //div[@id='property-data']/a and pass each fetched
# page to parse_item; follow=True keeps applying the rule to those pages too.
rules = (
Rule(LinkExtractor(restrict_xpaths="//div[@id='property-data']/a"), callback='parse_item', follow=True),
)
# Yields one dict per fetched property page with its title, price and URL.
def parse_item(self, response):
yield {
'Title': response.xpath("//h1[@class='property-title']/text()").get(),
'Price': response.xpath("//h3[@class='perMonthPrice price-title']/text()").get(),
'Links': response.url,
# Placeholder from the question: the value to put here is what is being
# asked for; `?` is not valid Python.
'Add posted': ?
}
要遍历各条结果,您可以使用以下 XPath 获取该数据点:
x = response.xpath('//div[@class="timeStamp"]')
for i in x:
yield {'result': i.xpath("./i/following-sibling::text()").get().strip() }
使用 Scrapy CrawlSpider 的 Rule 对象时,所提取链接的文本会保存在请求 meta 中名为 link_text 的字段里。您可以在 parse_item 方法中读取该值,并用正则表达式提取时间信息。更多细节请参阅 Scrapy 文档。请参阅下面的示例。
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
class PropertySpider(CrawlSpider):
    """Crawl OpenRent London search results and emit one item per property.

    Each followed link's anchor text is read back from the request meta
    (key ``link_text``) so that the "Last Updated ... ago" text shown next to
    the link can be attached to the scraped item.
    """

    name = 'property'
    allowed_domains = ['www.openrent.co.uk']
    # range(0, 5, 20) yields only 0, so a single start URL (skip=0) is built;
    # raise the stop value to request further pages in steps of 20.
    start_urls = [
        'https://www.openrent.co.uk/properties-to-rent/london?term=London&skip=' + str(x)
        for x in range(0, 5, 20)
    ]

    rules = (
        Rule(
            LinkExtractor(restrict_xpaths="//div[@id='property-data']/a"),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield a dict with the title, price, URL and posted text of a page.

        ``'Add posted'`` is ``None`` when the request carries no link text or
        the text contains no "Last Updated ... ago" fragment.
        """
        # The meta key may be absent; fall back to an empty string so that
        # re.search never receives None.
        link_text = response.request.meta.get("link_text") or ""
        match = re.search(r"(Last Updated.*ago)", link_text)
        # Always bind `posted`: previously it was only assigned on a match,
        # which raised UnboundLocalError at the yield otherwise.
        posted = match.group(1).replace("\xa0", " ") if match else None
        yield {
            'Title': response.xpath("//h1[@class='property-title']/text()").get(),
            'Price': response.xpath("//h3[@class='perMonthPrice price-title']/text()").get(),
            'Links': response.url,
            "Add posted": posted,
        }