使用 LinkExtractors 从以前的页面中抓取信息
Scraping information from previous pages using LinkExtractors
我想知道是否可以使用 LinkExtractors 从以前的页面中抓取信息。这个问题与我之前的问题有关。
我已经上传了该问题的答案,并更改了国家/地区的 xpath。提供的 xpath 从第一页抓取国家。
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader
class ZooplasItem(scrapy.Item):
    """Item holding the values scraped for a single property listing."""

    # Filled by the spider's ``parse_item`` from the listing page's <h1> text.
    stuff = Field()
    # Filled by the spider's ``parse_item`` with the country value.
    country = Field()
class ZooplasSpider(CrawlSpider):
    """Rule-driven crawler for the Zoopla overseas section.

    Link following is configured entirely through ``rules``; only responses
    reached through the listing rule are handed to ``parse_item``.
    """

    name = 'zooplas'
    allowed_domains = ['zoopla.co.uk']
    start_urls = ['https://www.zoopla.co.uk/overseas/']

    rules = (
        # Country links: followed, no callback.
        Rule(LinkExtractor(restrict_css='a.link-novisit'), follow=True),
        # Pagination links: followed, no callback.
        Rule(LinkExtractor(restrict_css='div.paginate'), follow=True),
        # Links to the individual property listings: parsed by ``parse_item``.
        Rule(
            LinkExtractor(restrict_xpaths="//a[contains(@class,'listing-result')]"),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Build one ``ZooplasItem`` from a property details response.

        Both XPath expressions are evaluated against ``response`` (the
        details page itself), not against any previously visited page.
        """
        stuff_xpath = "//article[@class='dp-sidebar-wrapper__summary']//h1//text()"
        country_xpath = "(//ul[@class='list-inline list-unstyled'])[1]//li//a//text()"

        item_loader = ItemLoader(item=ZooplasItem(), response=response)
        # Keep only the first extracted value for every field.
        item_loader.default_output_processor = TakeFirst()
        item_loader.add_xpath("stuff", stuff_xpath)
        item_loader.add_xpath("country", country_xpath)
        yield item_loader.load_item()
if __name__ == '__main__':
    # Run the spider in-process; scraped items are exported to ``zoopla.jl``
    # in the JSON-lines feed format.
    feed_settings = {'zoopla.jl': {'format': 'jsonlines'}}
    process = CrawlerProcess(
        settings={
            'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36',
            'FEEDS': feed_settings,
        }
    )
    process.crawl(ZooplasSpider)
    process.start()
但是,这会打印出以下输出:
'country':'(//ul[@class='list-inline list-unstyled'])[1]//li//a//text()'
CrawlSpider 适用于您希望自动跟踪与特定模式匹配的链接的情况。如果您想从以前的页面中获取信息,您必须单独解析(parse)每个页面,并通过请求的 meta 参数或 cb_kwargs 参数传递信息。您可以在任何解析方法中向 meta 添加任何信息。
我重构了上面的代码以使用普通的 scrapy Spider 类,并在 meta 关键字中传递第一页的国家值,然后在后续的解析方法中捕获它。
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader
class ZooplasItem(scrapy.Item):
    """Item holding the values scraped for a single property listing."""

    # Filled by the spider's ``parse_item`` from the listing page's <h1> text.
    stuff = Field()
    # Filled by the spider's ``parse_item`` with the country value.
    country = Field()
class ZooplasSpider(scrapy.Spider):
    """Crawl Zoopla's overseas section, tagging every listing with its country.

    The country name is read on the start page and carried through each
    follow-up request in ``meta`` so that ``parse_item`` can attach it to the
    item scraped from the property details page.
    """

    name = 'zooplas'
    allowed_domains = ['zoopla.co.uk']
    start_urls = ['https://www.zoopla.co.uk/overseas/']

    def parse(self, response):
        """Yield one request per country link found on the start page.

        Other values parsed here can be passed down to the subsequent parse
        methods in the same way as ``country``.
        """
        for link in response.xpath("(//ul[@class='list-inline list-unstyled'])[1]/li"):
            country = link.xpath("./h4/a/text()").get()
            url = link.xpath("./h4/a/@href").get()
            if not url:
                # Entry without a link: nothing to follow, and
                # response.follow() does not accept a missing URL.
                continue
            yield response.follow(url, meta={"country": country}, callback=self.parse_country)

    def parse_country(self, response):
        """Yield requests for each listing on a results page and for the next page.

        The country value received in ``response.meta`` is forwarded to every
        request created here; any other information can be passed the same way.
        """
        country = response.meta.get("country")

        # Follow the link to each individual listing.
        for link in response.xpath("//a[contains(@class,'listing-result')]"):
            yield response.follow(link, meta={"country": country}, callback=self.parse_item)

        # Follow the pagination link. The request must be yielded to be
        # scheduled, and must carry the country so later pages keep it.
        next_page = response.xpath("//a[contains(text(),'Next')]/@href").get()
        if next_page:
            yield response.follow(next_page, meta={"country": country}, callback=self.parse_country)

    def parse_item(self, response):
        """Build one ``ZooplasItem`` from a property details response."""
        loader = ItemLoader(ZooplasItem(), response=response)
        # Keep only the first extracted value for every field.
        loader.default_output_processor = TakeFirst()
        loader.add_xpath("stuff", "//article[@class='dp-sidebar-wrapper__summary']//h1//text()")
        # Retrieve the country name from the request meta set by the earlier callbacks.
        loader.add_value("country", response.meta.get('country'))
        yield loader.load_item()
if __name__ == '__main__':
    # Run the spider in-process; scraped items are exported to ``zoopla.jl``
    # in the JSON-lines feed format.
    feed_settings = {'zoopla.jl': {'format': 'jsonlines'}}
    process = CrawlerProcess(
        settings={
            'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36',
            'FEEDS': feed_settings,
        }
    )
    process.crawl(ZooplasSpider)
    process.start()
我想知道是否可以使用 LinkExtractors 从以前的页面中抓取信息。这个问题与我之前的问题有关。
我已经上传了该问题的答案,并更改了国家/地区的 xpath。提供的 xpath 从第一页抓取国家。
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader
class ZooplasItem(scrapy.Item):
    """Item holding the values scraped for a single property listing."""

    # Filled by the spider's ``parse_item`` from the listing page's <h1> text.
    stuff = Field()
    # Filled by the spider's ``parse_item`` with the country value.
    country = Field()
class ZooplasSpider(CrawlSpider):
    """Rule-driven crawler for the Zoopla overseas section.

    Link following is configured entirely through ``rules``; only responses
    reached through the listing rule are handed to ``parse_item``.
    """

    name = 'zooplas'
    allowed_domains = ['zoopla.co.uk']
    start_urls = ['https://www.zoopla.co.uk/overseas/']

    rules = (
        # Country links: followed, no callback.
        Rule(LinkExtractor(restrict_css='a.link-novisit'), follow=True),
        # Pagination links: followed, no callback.
        Rule(LinkExtractor(restrict_css='div.paginate'), follow=True),
        # Links to the individual property listings: parsed by ``parse_item``.
        Rule(
            LinkExtractor(restrict_xpaths="//a[contains(@class,'listing-result')]"),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Build one ``ZooplasItem`` from a property details response.

        Both XPath expressions are evaluated against ``response`` (the
        details page itself), not against any previously visited page.
        """
        stuff_xpath = "//article[@class='dp-sidebar-wrapper__summary']//h1//text()"
        country_xpath = "(//ul[@class='list-inline list-unstyled'])[1]//li//a//text()"

        item_loader = ItemLoader(item=ZooplasItem(), response=response)
        # Keep only the first extracted value for every field.
        item_loader.default_output_processor = TakeFirst()
        item_loader.add_xpath("stuff", stuff_xpath)
        item_loader.add_xpath("country", country_xpath)
        yield item_loader.load_item()
if __name__ == '__main__':
    # Run the spider in-process; scraped items are exported to ``zoopla.jl``
    # in the JSON-lines feed format.
    feed_settings = {'zoopla.jl': {'format': 'jsonlines'}}
    process = CrawlerProcess(
        settings={
            'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36',
            'FEEDS': feed_settings,
        }
    )
    process.crawl(ZooplasSpider)
    process.start()
但是,这会打印出以下输出:
'country':'(//ul[@class='list-inline list-unstyled'])[1]//li//a//text()'
CrawlSpider 适用于您希望自动跟踪与特定模式匹配的链接的情况。如果您想从以前的页面中获取信息,您必须单独解析(parse)每个页面,并通过请求的 meta 参数或 cb_kwargs 参数传递信息。您可以在任何解析方法中向 meta 添加任何信息。
我重构了上面的代码以使用普通的 scrapy Spider 类,并在 meta 关键字中传递第一页的国家值,然后在后续的解析方法中捕获它。
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader
class ZooplasItem(scrapy.Item):
    """Item holding the values scraped for a single property listing."""

    # Filled by the spider's ``parse_item`` from the listing page's <h1> text.
    stuff = Field()
    # Filled by the spider's ``parse_item`` with the country value.
    country = Field()
class ZooplasSpider(scrapy.Spider):
    """Crawl Zoopla's overseas section, tagging every listing with its country.

    The country name is read on the start page and carried through each
    follow-up request in ``meta`` so that ``parse_item`` can attach it to the
    item scraped from the property details page.
    """

    name = 'zooplas'
    allowed_domains = ['zoopla.co.uk']
    start_urls = ['https://www.zoopla.co.uk/overseas/']

    def parse(self, response):
        """Yield one request per country link found on the start page.

        Other values parsed here can be passed down to the subsequent parse
        methods in the same way as ``country``.
        """
        for link in response.xpath("(//ul[@class='list-inline list-unstyled'])[1]/li"):
            country = link.xpath("./h4/a/text()").get()
            url = link.xpath("./h4/a/@href").get()
            if not url:
                # Entry without a link: nothing to follow, and
                # response.follow() does not accept a missing URL.
                continue
            yield response.follow(url, meta={"country": country}, callback=self.parse_country)

    def parse_country(self, response):
        """Yield requests for each listing on a results page and for the next page.

        The country value received in ``response.meta`` is forwarded to every
        request created here; any other information can be passed the same way.
        """
        country = response.meta.get("country")

        # Follow the link to each individual listing.
        for link in response.xpath("//a[contains(@class,'listing-result')]"):
            yield response.follow(link, meta={"country": country}, callback=self.parse_item)

        # Follow the pagination link. The request must be yielded to be
        # scheduled, and must carry the country so later pages keep it.
        next_page = response.xpath("//a[contains(text(),'Next')]/@href").get()
        if next_page:
            yield response.follow(next_page, meta={"country": country}, callback=self.parse_country)

    def parse_item(self, response):
        """Build one ``ZooplasItem`` from a property details response."""
        loader = ItemLoader(ZooplasItem(), response=response)
        # Keep only the first extracted value for every field.
        loader.default_output_processor = TakeFirst()
        loader.add_xpath("stuff", "//article[@class='dp-sidebar-wrapper__summary']//h1//text()")
        # Retrieve the country name from the request meta set by the earlier callbacks.
        loader.add_value("country", response.meta.get('country'))
        yield loader.load_item()
if __name__ == '__main__':
    # Run the spider in-process; scraped items are exported to ``zoopla.jl``
    # in the JSON-lines feed format.
    feed_settings = {'zoopla.jl': {'format': 'jsonlines'}}
    process = CrawlerProcess(
        settings={
            'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36',
            'FEEDS': feed_settings,
        }
    )
    process.crawl(ZooplasSpider)
    process.start()