Scrapy keyerror 和下一页 url 不工作
Scrapy keyerror and next page url not working
我正在尝试 抓取 使用此页面作为开始 url:https://www.imdb.com/lists/tt0237478?ref_=tt_rls_sm
此页面有 3 个列表,其中一个列表有 100 多个项目。
我的代码 scrapes 只有 100 个项目并且没有从下一页获取数据。请检查代码有什么问题。
import scrapy
from urllib.parse import urljoin
class lisTopSpider(scrapy.Spider):
name= 'ImdbListsSpider'
allowed_domains = ['imdb.com']
start_urls = [
'https://www.imdb.com/lists/tt0237478'
]
def parse(self, response):
listsLinks = response.xpath('//div[2]/strong')
for link in listsLinks:
list_url = response.urljoin(link.xpath('.//a/@href').get())
yield scrapy.Request(list_url, callback=self.parse_list, meta={'list_url': list_url})
next_page_url = response.xpath('//a[@class="flat-button next-page "]/@href').get()
if next_page_url is not None:
next_page_url = response.urljoin(next_page_url)
yield scrapy.Request(next_page_url, callback=self.parse)
def parse_list(self, response):
list_url = response.meta['list_url']
titles = response.xpath('//h3/a/@href').getall()
next_page_url = response.xpath('//a[@class="flat-button lister-page-next next-page"]/@href').get()
if next_page_url is not None:
next_page_url = urljoin('https://www.imdb.com',next_page_url)
print('here is next page url')
print(next_page_url)
yield scrapy.Request(next_page_url, callback=self.parse_list)
yield{
'listurl': list_url,
'titles': titles,
}
这里是错误
2020-05-06 21:09:29 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.imdb.com/list/ls055923961/?page=2> (referer: https://www.imdb.com/list/ls055923961/)
Traceback (most recent call last):
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\utils\defer.py", line 117, in iter_errback
yield next(it)
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
return next(self.data)
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
return next(self.data)
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 338, in <genexpr>
return (_set_referer(r) for r in result or ())
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37,
in <genexpr>
return (r for r in result or () if _filter(r))
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "C:\Python Projects\Scrapy\imdb_project\imdb_project\spiders\TopLists.py", line 29, in parse_list
list_url = response.meta['list_url']
KeyError: 'list_url'
您正在使用 Request.meta
在您的 parse-
方法中为您的 parse_list-method 提供 list_url
,但您忘记在 parse_list 用于下一页。
只需将 meta={'list_url': list_url}
添加到 parse_list
内的 Request
,它应该可以正常工作。
因此 parse_list 中下一页的处理应如下所示:
if next_page_url is not None:
next_page_url = urljoin('https://www.imdb.com', next_page_url)
yield scrapy.Request(next_page_url, callback=self.parse_list, meta={'list_url': list_url})
顺便说一句:在 Scrapy 1.7 之后,处理用户信息的首选方式现在是 Request.cb_kwargs
(see "Caution"-part in the official docu here)
我正在尝试 抓取 使用此页面作为开始 url:https://www.imdb.com/lists/tt0237478?ref_=tt_rls_sm 此页面有 3 个列表,其中一个列表有 100 多个项目。 我的代码 scrapes 只有 100 个项目并且没有从下一页获取数据。请检查代码有什么问题。
import scrapy
from urllib.parse import urljoin
class lisTopSpider(scrapy.Spider):
name= 'ImdbListsSpider'
allowed_domains = ['imdb.com']
start_urls = [
'https://www.imdb.com/lists/tt0237478'
]
def parse(self, response):
listsLinks = response.xpath('//div[2]/strong')
for link in listsLinks:
list_url = response.urljoin(link.xpath('.//a/@href').get())
yield scrapy.Request(list_url, callback=self.parse_list, meta={'list_url': list_url})
next_page_url = response.xpath('//a[@class="flat-button next-page "]/@href').get()
if next_page_url is not None:
next_page_url = response.urljoin(next_page_url)
yield scrapy.Request(next_page_url, callback=self.parse)
def parse_list(self, response):
list_url = response.meta['list_url']
titles = response.xpath('//h3/a/@href').getall()
next_page_url = response.xpath('//a[@class="flat-button lister-page-next next-page"]/@href').get()
if next_page_url is not None:
next_page_url = urljoin('https://www.imdb.com',next_page_url)
print('here is next page url')
print(next_page_url)
yield scrapy.Request(next_page_url, callback=self.parse_list)
yield{
'listurl': list_url,
'titles': titles,
}
这里是错误
2020-05-06 21:09:29 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.imdb.com/list/ls055923961/?page=2> (referer: https://www.imdb.com/list/ls055923961/)
Traceback (most recent call last):
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\utils\defer.py", line 117, in iter_errback
yield next(it)
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
return next(self.data)
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
return next(self.data)
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 338, in <genexpr>
return (_set_referer(r) for r in result or ())
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37,
in <genexpr>
return (r for r in result or () if _filter(r))
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "C:\Python Projects\Scrapy\imdb_project\imdb_project\spiders\TopLists.py", line 29, in parse_list
list_url = response.meta['list_url']
KeyError: 'list_url'
您正在使用 Request.meta
在您的 parse-
方法中为您的 parse_list-method 提供 list_url
,但您忘记在 parse_list 用于下一页。
只需将 meta={'list_url': list_url}
添加到 parse_list
内的 Request
,它应该可以正常工作。
因此 parse_list 中下一页的处理应如下所示:
if next_page_url is not None:
next_page_url = urljoin('https://www.imdb.com', next_page_url)
yield scrapy.Request(next_page_url, callback=self.parse_list, meta={'list_url': list_url})
顺便说一句:在 Scrapy 1.7 之后,处理用户信息的首选方式现在是 Request.cb_kwargs
(see "Caution"-part in the official docu here)