Nested yield requests in Scrapy

I want to recursively crawl 10 pages of reviews for the beer_items I'm modelling on ratebeer.com. I want to collect all the reviews from those 10 pages (for each particular beer) and associate the resulting text with a single item. Here is my spider:

import scrapy
from scrapy.http import FormRequest, Request
from beerscraping.items import BeerscrapingItem


class BeerSpider(scrapy.Spider):
    name = 'beerspider'
    allowed_domains = ['www.ratebeer.com']
    start_urls = ['http://www.ratebeer.com/search.php', ]

    def parse(self, response):
        import string
        for c in ' ' + string.ascii_lowercase:
            formdata = {'BeerName': ' ',
                        'BeerStyles': '17',
                        'CountryID': '213',
                        'SortBy': '1'}
            formname = 'myform2'
            yield FormRequest.from_response(response,
                                            formname=formname,
                                            formdata=formdata,
                                            callback=self.parse_request)

    def parse_request(self, response):
        xpath = "//td/span/a/@href"
        for link in response.xpath(xpath):
            url = 'http://www.ratebeer.com' + link.extract()
            request = Request(url, callback=self.parse_beer_data)
            request.meta['beer_id'] = url.split('/')[-2]  # Beer ID from URL
            yield request

    def parse_name(self, data):
        result = data.extract()[0].split('(')[0]
        return result.strip("'")

    def parse_abv(self, data):
        return data.extract()[-1]

    def parse_rating(self, data):
        return data.extract()[0]

    def parse_100_reviews(self, response):
        beer_item = BeerscrapingItem(response.meta['beer_item'])
        print "RESPONSE URL: ", response.url
        print "beer_item['keywords']:  ", beer_item['keywords']
        path_to_reviews = "//div[contains(@style, 'padding: 20px 10px 20px 0px')]/text()"
        for review in response.xpath(path_to_reviews):
            print "RESPONSE URL: ", response.url
            print "beer_item['keywords']: ", beer_item['keywords']
            beer_item['keywords'] += review.extract()
        page_no = int(response.url[-2])
        print "page_no: ", page_no
        if page_no < 3:  # If we haven't crawled over 10 pages of reviews yet
            print "DO WE ENTER THE LOOP????!?!?!?!"
            url = response.url[:-2] + str(page_no + 1) + '/'
            print "NEW URL: ", url
            request = Request(url, callback=self.parse_100_reviews)
            request.meta['beer_item'] = beer_item
            yield request

        yield beer_item

    def parse_beer_data(self, response):
        path_to_name = "//div[contains(@class, 'user-header')]/h1/text()"
        path_to_brewery = "//big['Brewed By']/b/a/text()"
        path_to_abv = "//td/div/div/small/big/strong/text()"
        path_to_rating = "//div/span[contains(@itemprop, 'average')]/text()"

        beer_item = BeerscrapingItem()
        beer_item['id'] = response.meta['beer_id']
        beer_item['name'] = self.parse_name(response.xpath(path_to_name))
        beer_item['brewery'] = self.parse_name(response.xpath(path_to_brewery))
        beer_item['abv'] = self.parse_abv(response.xpath(path_to_abv))
        beer_item['rb_rating'] = self.parse_rating(response.xpath(path_to_rating))
        beer_item['keywords'] = ''
        request = Request(response.url + '1/1/', callback=self.parse_100_reviews)
        request.meta['beer_item'] = beer_item
        return request
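For completeness, BeerscrapingItem (imported from beerscraping.items, not shown here) needs to declare at least the fields the spider assigns; a minimal sketch of what it contains:

import scrapy


class BeerscrapingItem(scrapy.Item):
    # Fields referenced by the spider above.
    id = scrapy.Field()
    name = scrapy.Field()
    brewery = scrapy.Field()
    abv = scrapy.Field()
    rb_rating = scrapy.Field()
    keywords = scrapy.Field()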

So I create an item in parse_beer_data() and hand it off to parse_100_reviews (10 reviews per page). The review pages have URLs of the form www.ratebeer.com/beer/(brand)/(brand_id)/1/(page_no)/, so I effectively set up a loop over the 10 URLs with (page_no) = 1, 2, ..., 10. My spider shouldn't leave that loop until all 10 pages have been scraped, but that is not what happens. Instead, the item is yielded after only one pass through the loop. I added all those print statements to verify this: "page_no: 1" is the only page_no that ever gets printed.
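To make the pagination concrete, this is the string manipulation the spider relies on to step from one review page to the next (just an illustration with a made-up beer URL):

url = 'http://www.ratebeer.com/beer/some-beer/12345/1/1/'   # hypothetical example URL
page_no = int(url[-2])                        # trailing (page_no) segment -> 1
next_url = url[:-2] + str(page_no + 1) + '/'  # -> .../1/2/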

Any help is much appreciated.

"else".

That's what parse_100_reviews is missing.
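That is, the final yield beer_item should sit in an else branch so the item is only emitted once the last review page has been appended; without it, a partially filled item escapes on the very first page. Roughly (same method as in the question, debug prints dropped):

def parse_100_reviews(self, response):
    beer_item = BeerscrapingItem(response.meta['beer_item'])
    path_to_reviews = "//div[contains(@style, 'padding: 20px 10px 20px 0px')]/text()"
    for review in response.xpath(path_to_reviews):
        beer_item['keywords'] += review.extract()
    page_no = int(response.url[-2])
    if page_no < 10:  # the posted code uses 3 while testing; use whatever page cap you intend
        # Not done yet: queue the next review page and pass the item along with it.
        url = response.url[:-2] + str(page_no + 1) + '/'
        request = Request(url, callback=self.parse_100_reviews)
        request.meta['beer_item'] = beer_item
        yield request
    else:
        # Only now is the item complete, so this is the only place it gets yielded.
        yield beer_item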