Nested yield requests in Scrapy
I am trying to recursively scrape 10 pages of reviews for the beer_items I am modeling on ratebeer.com. I want to collect all the reviews from those 10 pages (for each specific beer) and associate the resulting text with a single item. Here is my spider:
import scrapy
from scrapy.http import FormRequest, Request
from beerscraping.items import BeerscrapingItem


class BeerSpider(scrapy.Spider):
    name = 'beerspider'
    allowed_domains = ['www.ratebeer.com']
    start_urls = ['http://www.ratebeer.com/search.php', ]

    def parse(self, response):
        import string
        for c in ' ' + string.ascii_lowercase:
            formdata = {'BeerName': ' ',
                        'BeerStyles': '17',
                        'CountryID': '213',
                        'SortBy': '1'}
            formname = 'myform2'
            yield FormRequest.from_response(response,
                                            formname=formname,
                                            formdata=formdata,
                                            callback=self.parse_request)

    def parse_request(self, response):
        xpath = "//td/span/a/@href"
        for link in response.xpath(xpath):
            url = 'http://www.ratebeer.com' + link.extract()
            request = Request(url, callback=self.parse_beer_data)
            request.meta['beer_id'] = url.split('/')[-2]  # Beer ID from URL
            yield request

    def parse_name(self, data):
        result = data.extract()[0].split('(')[0]
        return result.strip("'")

    def parse_abv(self, data):
        return data.extract()[-1]

    def parse_rating(self, data):
        return data.extract()[0]

    def parse_100_reviews(self, response):
        beer_item = BeerscrapingItem(response.meta['beer_item'])
        print "RESPONSE URL: ", response.url
        print "beer_item['keywords']: ", beer_item['keywords']
        path_to_reviews = "//div[contains(@style, 'padding: 20px 10px 20px 0px')]/text()"
        for review in response.xpath(path_to_reviews):
            print "RESPONSE URL: ", response.url
            print "beer_item['keywords']: ", beer_item['keywords']
            beer_item['keywords'] += review.extract()

        page_no = int(response.url[-2])
        print "page_no: ", page_no
        if page_no < 3:  # If we haven't crawled over 10 pages of reviews yet
            print "DO WE ENTER THE LOOP????!?!?!?!"
            url = response.url[:-2] + str(page_no + 1) + '/'
            print "NEW URL: ", url
            request = Request(url, callback=self.parse_100_reviews)
            request.meta['beer_item'] = beer_item
            yield request
        yield beer_item

    def parse_beer_data(self, response):
        path_to_name = "//div[contains(@class, 'user-header')]/h1/text()"
        path_to_brewery = "//big['Brewed By']/b/a/text()"
        path_to_abv = "//td/div/div/small/big/strong/text()"
        path_to_rating = "//div/span[contains(@itemprop, 'average')]/text()"

        beer_item = BeerscrapingItem()
        beer_item['id'] = response.meta['beer_id']
        beer_item['name'] = self.parse_name(response.xpath(path_to_name))
        beer_item['brewery'] = self.parse_name(response.xpath(path_to_brewery))
        beer_item['abv'] = self.parse_abv(response.xpath(path_to_abv))
        beer_item['rb_rating'] = self.parse_rating(response.xpath(path_to_rating))
        beer_item['keywords'] = ''

        request = Request(response.url + '1/1/', callback=self.parse_100_reviews)
        request.meta['beer_item'] = beer_item
        return request
So I create an item in parse_beer_data() and hand it off to parse_100_reviews (there are 10 reviews per page). The review pages have URLs of the form www.ratebeer.com/beer/(brand)/(brand_id)/1/(page_no)/, so I step through the 10 URLs with (page_no) = 1, 2, ..., 10.
My spider should not leave that chain until all 10 pages have been crawled, but that is not what happens.
Instead, the item is yielded after only one pass. I added all those print statements to verify this: "page_no: 1" is the only page_no that ever gets printed.
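For concreteness, a quick sketch of the URL arithmetic the page-to-page chain relies on; the beer slug and ID below are made up for illustration:

    # Hypothetical review-page URL for one beer; only the trailing page number changes
    url = 'http://www.ratebeer.com/beer/some-pale-ale/12345/1/1/'
    page_no = int(url[-2])                        # -> 1
    next_url = url[:-2] + str(page_no + 1) + '/'
    # -> 'http://www.ratebeer.com/beer/some-pale-ale/12345/1/2/'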
Any help is much appreciated.
"else".
That is what parse_100_reviews is missing: as written, yield beer_item runs on every page, not only the last one.
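A minimal sketch of the corrected callback, keeping everything else (the page_no < 3 cut-off, the meta key, the review XPath) exactly as in the question; only the control flow around yield beer_item changes:

    def parse_100_reviews(self, response):
        # Rebuild the partially filled item passed along in request.meta
        beer_item = BeerscrapingItem(response.meta['beer_item'])
        path_to_reviews = "//div[contains(@style, 'padding: 20px 10px 20px 0px')]/text()"
        for review in response.xpath(path_to_reviews):
            beer_item['keywords'] += review.extract()

        page_no = int(response.url[-2])
        if page_no < 3:
            # Not on the last page yet: request the next page and pass the item along
            url = response.url[:-2] + str(page_no + 1) + '/'
            request = Request(url, callback=self.parse_100_reviews)
            request.meta['beer_item'] = beer_item
            yield request
        else:
            # Only the last review page yields the finished item
            yield beer_item

Without the else, every call to parse_100_reviews yields beer_item, so a partially filled copy of the item reaches the pipeline after each page instead of one complete item after the last page.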