使用 Scrapy 从两个页面中提取数据
Extract data from two pages with Scrapy
我有一个议程作为起始页。此页面包含活动的开始时间和标题以及指向每个活动的详细信息页面的链接。
我的蜘蛛在每个事件的详细信息页面上提取所有事件详细信息(描述、位置等),除了开始时间我必须在我的开始页面上提取。
如何从开始页面和每个详细信息页面上的其他数据中提取开始时间?
好斗的方法是什么?使用 meta['item'] ?我不明白...
这是我的蜘蛛现在。非常感谢任何帮助!
class LuSpider(scrapy.Spider):
name = "lu"
allowed_domains = ["example.com"]
start_urls = ["http://www.example.com/agenda"]
def parse(self, response):
for href in response.css("div.toggle_container_show > div > a::attr('href')"):
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_agenda_contents)
def parse_agenda_contents(self, response):
for sel in response.xpath('//div[@class="container"]'):
item = LuItem()
item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
yield item
编辑:
我尝试使用 request.meta['item']
从开始页面中提取开始时间,并在每个事件的开始页面中获取 所有 开始时间的列表。如何获取 each 事件的开始时间?
有人可以告诉我正确的方向吗?
class LuSpider(scrapy.Spider):
name = "lu"
allowed_domains = ["example.com"]
start_urls = ["http://www.example.com/agenda"]
def parse(self, response):
item = LuItem()
item['StartTime'] = response.xpath('//div[contains(., "H")]/span/text()').extract()
for href in response.css("div.toggle_container_show > div > a::attr('href')"):
url = response.urljoin(href.extract())
request = scrapy.Request(url, callback=self.parse_agenda_contents)
request.meta['item'] = item
yield request
def parse_agenda_contents(self, response):
for sel in response.xpath('//div[@class="container"]'):
item = response.meta['item']
item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
yield item
你是对的。在你的情况下使用 meta 会做到这一点。请在此处查看官方文档:http://doc.scrapy.org/en/latest/topics/request-response.html#passing-additional-data-to-callback-functions
def parse_page1(self, response):
item = MyItem()
item['main_url'] = response.url
request = scrapy.Request("http://www.example.com/some_page.html",
callback=self.parse_page2)
request.meta['item'] = item
return request
def parse_page2(self, response):
item = response.meta['item']
item['other_url'] = response.url
return item
这有效:
class LuSpider(scrapy.Spider):
name = "lu"
allowed_domains = ["example.com"]
start_urls = ["http://www.example.com/agenda"]
def parse(self, response):
StartTimes = response.xpath('//div[@class="toggle_container_show"]/div/span/text()').extract()
urls =response.xpath('//div[@class="toggle_container_show"]/div/a/@href').extract()
for StartTime,url in zip(StartTimes,urls):
item = LuItem()
item['StartTime'] = StartTime
request = Request(url,callback = self.parse_agenda_contents)
request.meta['item'] = item
yield request
def parse_agenda_contents(self, response):
for sel in response.xpath('//div[@class="container"]'):
item = response.meta['item']
item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
yield item
我有一个议程作为起始页。此页面包含活动的开始时间和标题以及指向每个活动的详细信息页面的链接。
我的蜘蛛在每个事件的详细信息页面上提取所有事件详细信息(描述、位置等),除了开始时间我必须在我的开始页面上提取。
如何从开始页面和每个详细信息页面上的其他数据中提取开始时间? 好斗的方法是什么?使用 meta['item'] ?我不明白... 这是我的蜘蛛现在。非常感谢任何帮助!
class LuSpider(scrapy.Spider):
name = "lu"
allowed_domains = ["example.com"]
start_urls = ["http://www.example.com/agenda"]
def parse(self, response):
for href in response.css("div.toggle_container_show > div > a::attr('href')"):
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_agenda_contents)
def parse_agenda_contents(self, response):
for sel in response.xpath('//div[@class="container"]'):
item = LuItem()
item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
yield item
编辑:
我尝试使用 request.meta['item']
从开始页面中提取开始时间,并在每个事件的开始页面中获取 所有 开始时间的列表。如何获取 each 事件的开始时间?
有人可以告诉我正确的方向吗?
class LuSpider(scrapy.Spider):
name = "lu"
allowed_domains = ["example.com"]
start_urls = ["http://www.example.com/agenda"]
def parse(self, response):
item = LuItem()
item['StartTime'] = response.xpath('//div[contains(., "H")]/span/text()').extract()
for href in response.css("div.toggle_container_show > div > a::attr('href')"):
url = response.urljoin(href.extract())
request = scrapy.Request(url, callback=self.parse_agenda_contents)
request.meta['item'] = item
yield request
def parse_agenda_contents(self, response):
for sel in response.xpath('//div[@class="container"]'):
item = response.meta['item']
item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
yield item
你是对的。在你的情况下使用 meta 会做到这一点。请在此处查看官方文档:http://doc.scrapy.org/en/latest/topics/request-response.html#passing-additional-data-to-callback-functions
def parse_page1(self, response):
item = MyItem()
item['main_url'] = response.url
request = scrapy.Request("http://www.example.com/some_page.html",
callback=self.parse_page2)
request.meta['item'] = item
return request
def parse_page2(self, response):
item = response.meta['item']
item['other_url'] = response.url
return item
这有效:
class LuSpider(scrapy.Spider):
name = "lu"
allowed_domains = ["example.com"]
start_urls = ["http://www.example.com/agenda"]
def parse(self, response):
StartTimes = response.xpath('//div[@class="toggle_container_show"]/div/span/text()').extract()
urls =response.xpath('//div[@class="toggle_container_show"]/div/a/@href').extract()
for StartTime,url in zip(StartTimes,urls):
item = LuItem()
item['StartTime'] = StartTime
request = Request(url,callback = self.parse_agenda_contents)
request.meta['item'] = item
yield request
def parse_agenda_contents(self, response):
for sel in response.xpath('//div[@class="container"]'):
item = response.meta['item']
item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
yield item