Scrapy pipeline only saves one page of results
I have a spider that crawls CourseTalk, and it has a pipeline that saves two types of items:
moocs.csv, which contains the course data.
moocs_review.csv, which contains the review data.
This is my spider code:
import scrapy
from scrapy import Request
from scrapy.loader import ItemLoader
from urlparse import urljoin

from moocs.items import MoocsItem, MoocsReviewItem


class MoocsSpiderSpider(scrapy.Spider):
    name = "moocs_spider"
    #allowed_domains = ["https://www.coursetalk.com/subjects/data-science/courses"]
    start_urls = (
        'https://www.coursetalk.com/subjects/data-science/courses',
    )

    def parse(self, response):
        courses_xpath = '//*[@class="course-listing-card"]//a[contains(@href, "/courses/")]/@href'
        courses_url = [urljoin(response.url, relative_url) for relative_url in response.xpath(courses_xpath).extract()]
        for course_url in courses_url[0:3]:
            print course_url
            yield Request(url=course_url, callback=self.parse_reviews)

        next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract()
        yield Request(url=next_page_url, callback=self.parse)

    def parse_reviews(self, response):
        #print response.body
        l = ItemLoader(item=MoocsItem(), response=response)
        l.add_xpath('course_title', '//*[@class="course-header-ng__main-info__name__title"]//text()')
        l.add_xpath('course_description', '//*[@class="course-info__description"]//p/text()')
        l.add_xpath('course_instructors', '//*[@class="course-info__instructors__names"]//text()')
        l.add_xpath('course_key_concepts', '//*[@class="key-concepts__labels"]//text()')
        l.add_value('course_link', response.url)
        l.add_value('course_provider', response.url)
        l.add_xpath('course_cost', '//*[@class="course-details-panel__course-cost"]//text()')
        l.add_xpath('university', '//*[@class="course-info__school__name"]//text()[2]')
        #'//*[@class="course-info__school__name"]'
        item = l.load_item()

        for review in response.xpath('//*[@class="review-body"]'):
            r = ItemLoader(item=MoocsReviewItem(), response=response, selector=review)
            r.add_value('course_title', item['course_title'])
            r.add_xpath('review_body', './/div[@class="review-body__content"]//text()')
            r.add_xpath('course_stage', './/*[@class="review-body-info__course-stage--completed"]//text()')
            r.add_xpath('user_name', './/*[@class="review-body__username"]//text()')
            r.add_xpath('review_date', './/*[@itemprop="datePublished"]/@datetime')
            r.add_xpath('score', './/*[@class="sr-only"]//text()')
            yield r.load_item()
        yield item
It goes into each course page and saves the details into the corresponding item. I get the pagination here:
next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract()
The spider goes to the next page, but the results are not saved in the output file.
I guess the problem is in the pipeline that creates the files:
class MultiCSVItemPipeline(object):
    CSVDir = '/moocs/scripts/moocs/moocs/'
    SaveTypes = ['moocs', 'moocsreview']

    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.files = dict([(name, open(CSVDir + name + '.csv', 'w+b')) for name in self.SaveTypes])
        self.exporters = dict([(name, CsvItemExporter(self.files[name])) for name in self.SaveTypes])
        [e.start_exporting() for e in self.exporters.values()]

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        what = item_type(item)
        if what in set(self.SaveTypes):
            self.exporters[what].export_item(item)
        return item
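For completeness: the pipeline snippet relies on a few names that are not shown in the post. A minimal sketch of what it presumably needs is below; the import paths and the item_type() helper are assumptions on my part, not code from the original project.

# Assumed imports for the pipeline above (Scrapy 1.x / Python 2 era paths).
from scrapy import signals
from scrapy.exporters import CsvItemExporter        # scrapy.contrib.exporter in very old versions
from scrapy.xlib.pydispatch import dispatcher       # deprecated; newer code hooks signals via from_crawler

# Assumed helper: derives the SaveTypes key from the item class name,
# e.g. MoocsItem -> 'moocs', MoocsReviewItem -> 'moocsreview'.
def item_type(item):
    return type(item).__name__.replace('Item', '').lower()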
This also works if, instead of the pipeline, you use -t csv:
scrapy crawl moocs -t csv -o moocs.csv --loglevel=INFO
This will automatically create a file in the spider folder.
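If you prefer not to type the options on every run, the same feed export can be configured in settings.py. A minimal sketch using the legacy FEED_URI / FEED_FORMAT settings of that Scrapy generation (newer releases replace them with the FEEDS dict):

# settings.py -- legacy feed-export settings (replaced by FEEDS in Scrapy 2.1+)
FEED_FORMAT = 'csv'
FEED_URI = 'moocs.csv'   # written relative to the directory you run scrapy crawl from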
Are you sure the spider is paginating correctly?
When you do this:
next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract()
extract() returns a list of results, which you then pass into the url argument of Request:
yield Request(url=next_page_url, callback=self.parse)
But url must be a string or unicode value, so doing that produces the following error:
TypeError: Request url must be str or unicode, got list:
This can be solved with the extract_first() method; I would also check that the value is not None:
next_page_url = response.xpath('//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href').extract_first()
if next_page_url:
    yield Request(url=next_page_url)  # parse is the callback by default
Please try this and tell me whether it solves your problem.
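For reference, here is a sketch of the parse() method with that fix applied (the debug print is dropped; everything else follows the spider posted above):

    def parse(self, response):
        courses_xpath = '//*[@class="course-listing-card"]//a[contains(@href, "/courses/")]/@href'
        courses_url = [urljoin(response.url, relative_url)
                       for relative_url in response.xpath(courses_xpath).extract()]
        for course_url in courses_url[0:3]:
            yield Request(url=course_url, callback=self.parse_reviews)

        # extract_first() returns a single href string (or None), never a list
        next_page_url = response.xpath(
            '//*[@class="js-course-pagination"]//a[contains(@aria-label,"Next")]/@href'
        ).extract_first()
        if next_page_url:
            yield Request(url=next_page_url)  # parse is the callback by default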