Scrapy, Parse items data from page then follow link to get additional items data
I'm having trouble scraping additional fields on other pages after scraping data from the first page. For example:
Here is my code:
from scrapy.selector import HtmlXPathSelector
from scrapy.http import HtmlResponse
from IMDB_Frompage.items import ImdbFrompageItem
from scrapy.http import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

URL = "http://www.imdb.com/search/title?count=100&ref_=nv_ch_mm_1&start=1&title_type=feature,tv_series,tv_movie"

class MySpider(CrawlSpider):
    name = "imdb"
    allowed_domains = ["imdb.com"]
    start_urls = [URL]
    DOWNLOAD_DELAY = 0.5
    rules = (Rule(SgmlLinkExtractor(allow=('100&ref'), restrict_xpaths=('//span[@class="pagination"]/a[contains(text(),"Next")]')), callback='parse_page', follow=True),)

    def parse_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = ImdbFrompageItem()
        links = hxs.select("//td[@class='title']")
        items = []
        for link in links:
            item = ImdbFrompageItem()
            item['link'] = link.select("a/@href").extract()[0]
            item['new_link'] = 'http://www.imdb.com' + item['link']
            new_links = ''.join(item['new_link'])
            request = Request(new_links, callback=self.parsepage2)
            request.meta['item'] = item
            yield request
            yield item

    def parsepage2(self, response):
        item = response.meta['item']
        hxs = HtmlXPathSelector(response)
        blocks = hxs.select("//td[@id='overview-top']")
        for block in blocks:
            item = ImdbFrompageItem()
            item["title"] = block.select("h1[@class='header']/span[@itemprop='name']/text()").extract()
            item["year"] = block.select("h1[@class='header']/span[@class='nobr']").extract()
            item["description"] = block.select("p[@itemprop='description']/text()").extract()
            yield item
Part of the results look like this:
{"link": , "new_link": }
{"link": , "new_link": }
{"link": , "new_link": }
{"link": , "new_link": }
....
{"link": , "new_link": }
{"title": , "description":}
{"title": , "description":}
next page
{"link": , "new_link": }
{"link": , "new_link": }
{"link": , "new_link": }
{"title": , "description":}
My results don't contain all of the data ({"title": , "description":}) for every link, but what I want is something like this:
{"link": , "new_link": }
{"title": , "description":}
{"link": , "new_link": }
{"title": , "description":}
{"link": , "new_link": }
{"title": , "description":}
{"link": , "new_link": }
....
{"link": , "new_link": }
{"title": , "description":}
next page
{"link": , "new_link": }
{"title": , "description":}
{"link": , "new_link": }
{"title": , "description":}
{"link": , "new_link": }
{"title": , "description":}
Any suggestions as to what I'm doing wrong?
Scrapy does not guarantee that all requests are parsed in the order they are issued; they complete out of order.
The execution order may look like this:
- parse_page() is called;
- parse_page() is called;
- parse_page() is called;
- parsepage2() is called;
- .....
Perhaps you can change your code like this to get what you want:
def parse_page(self, response):
    hxs = HtmlXPathSelector(response)
    links = hxs.select("//td[@class='title']")
    for link in links:
        href = link.select("a/@href").extract()[0]
        new_link = 'http://www.imdb.com' + href
        request = Request(new_link, callback=self.parsepage2)
        # Carry the values over to the second callback via meta
        # instead of yielding a separate, partial item here.
        request.meta['link'] = href
        request.meta['new_link'] = new_link
        yield request

def parsepage2(self, response):
    hxs = HtmlXPathSelector(response)
    blocks = hxs.select("//td[@id='overview-top']")
    for block in blocks:
        item = ImdbFrompageItem()
        # Read back the values passed along in parse_page.
        item["link"] = response.meta["link"]
        item["new_link"] = response.meta["new_link"]
        item["title"] = block.select("h1[@class='header']/span[@itemprop='name']/text()").extract()
        item["year"] = block.select("h1[@class='header']/span[@class='nobr']").extract()
        item["description"] = block.select("p[@itemprop='description']/text()").extract()
        yield item
Since each item is now built and yielded in a single callback, you get results like this:
{"link": , "new_link": ,"title": , "description":}
I'm not sure my code will run as-is; it's just meant to give you an idea of how to achieve what you want.
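As an aside, HtmlXPathSelector, SgmlLinkExtractor, and the scrapy.contrib modules used above are deprecated and have been removed from modern Scrapy releases. If you are on a recent version, a rough equivalent of the fixed spider might look like the sketch below. This is only a sketch, assuming Scrapy 1.7 or newer (for cb_kwargs); the IMDb XPaths are kept from the question and may no longer match the live site:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from IMDB_Frompage.items import ImdbFrompageItem

URL = "http://www.imdb.com/search/title?count=100&ref_=nv_ch_mm_1&start=1&title_type=feature,tv_series,tv_movie"

class MySpider(CrawlSpider):
    name = "imdb"
    allowed_domains = ["imdb.com"]
    start_urls = [URL]
    custom_settings = {"DOWNLOAD_DELAY": 0.5}

    rules = (
        Rule(
            LinkExtractor(
                allow=('100&ref',),
                restrict_xpaths='//span[@class="pagination"]/a[contains(text(),"Next")]',
            ),
            callback='parse_page',
            follow=True,
        ),
    )

    def parse_page(self, response):
        for link in response.xpath("//td[@class='title']"):
            href = link.xpath("a/@href").get()
            # response.follow resolves relative URLs, and cb_kwargs
            # (Scrapy >= 1.7) passes values straight into the callback.
            yield response.follow(
                href,
                callback=self.parse_detail,
                cb_kwargs={"link": href, "new_link": response.urljoin(href)},
            )

    def parse_detail(self, response, link, new_link):
        for block in response.xpath("//td[@id='overview-top']"):
            item = ImdbFrompageItem()
            item["link"] = link
            item["new_link"] = new_link
            item["title"] = block.xpath("h1[@class='header']/span[@itemprop='name']/text()").getall()
            item["year"] = block.xpath("h1[@class='header']/span[@class='nobr']").getall()
            item["description"] = block.xpath("p[@itemprop='description']/text()").getall()
            yield item

Because response.follow joins relative URLs for you, the manual 'http://www.imdb.com' + href concatenation is no longer needed; the principle is the same as the fix above, though: carry the first page's values into the detail callback and yield one complete item there.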