Scrapy output file that recursively runs all the yield requests - how to
So I have a Scrapy spider as follows:
import scrapy

class CoursesSpider(scrapy.Spider):
    name = "courses"
    start_urls = [
        'http://example.com'
    ]

    def parse(self, response):
        for subject in response.css('subject'):
            subject_name = subject.css('subject::text').extract_first().strip()
            subject_link = subject.css('subject::attr(href)').extract_first().strip()
            subject_id = subject.css('subject::attr(id)').extract_first().strip()
            if subject_link is not None:
                subject_data = scrapy.Request(subject_link, callback=self.parse_course)
                yield {
                    'subject_name': subject_name,
                    'subject_link': subject_link,
                    'subject_id': subject_id,
                    'subject_data': subject_data,
                }

    def parse_course(self, response):
        subject_id = response.css('::attr(id)').extract_first().strip()
        for course in response.css('course'):
            course_name = course.css('course::text').extract_first().strip()
            course_link = course.css('course::attr(href)').extract_first().strip()
            course_id = course.css('course::attr(id)').extract_first().strip()
            if course_link is not None:
                course_data = scrapy.Request(course_link, callback=self.parse_class)
                yield {
                    'course_name': course_name,
                    'course_link': course_link,
                    'course_id': subject_id + " " + course_id,
                    'course_data': course_data,
                }

    def parse_class(self, response):
        course_id = response.css('::attr(id)').extract_first().strip()
        for section in response.css('section'):
            section_name = section.css('section::text').extract_first().strip()
            section_link = section.css('section::attr(href)').extract_first().strip()
            yield {
                'section_name': section_name,
                'section_link': section_link,
                'course_id': course_id,
            }
I want the output JSON file to have a tree structure like this:
{"subject_id": "...", "subject_name": "...", "subject_link": "...", "subject_data":
{"course_id": "...", "course_link": "...", "course_name": "...", "course_data":
{"course_id": "...", "section_link": "...", "section_name": "..."}
}
}
However, all I get is this:
{"subject_id": "...", "subject_data": "<Request GET http://example.com/something>", "subject_name": "...", "subject_link": "..."}
As I understand it, this is because the yielded requests have not actually been executed, so they just get serialized as strings. How would I run the equivalent of "scrapy crawl courses -o courses.json" in a way that fully follows all the requests? If that's not possible out of the box, how could I do it myself? Could I later import the JSON in a Python file and crawl http://example.com/something and the following pages myself?
I know it's a lot of code, but it should clarify things.
Thanks for your help!
I see 2 ways of doing this:

- either build the data incrementally, passing it to each callback using the Request.meta dict (see Passing additional data to callback functions),
- or use something like scrapy-inline-requests (to be tested).
Approach 1.
import scrapy

class CoursesSpider(scrapy.Spider):
    name = "courses"
    start_urls = [
        'http://example.com'
    ]

    def parse(self, response):
        for subject in response.css('subject'):
            subject_name = subject.css('subject::text').extract_first().strip()
            subject_link = subject.css('subject::attr(href)').extract_first().strip()
            subject_id = subject.css('subject::attr(id)').extract_first().strip()
            if subject_link is not None:
                subject_data = scrapy.Request(subject_link, callback=self.parse_course)

                # build a dict with the info we have so far
                subject_info = {
                    'subject_name': subject_name,
                    'subject_link': subject_link,
                    'subject_id': subject_id,
                }
                # add this to the Request's meta dict
                subject_data.meta['subject_info'] = subject_info

                # ask Scrapy to fetch additional data
                yield subject_data

    def parse_course(self, response):
        # get back the data that was passed previously
        subject_info = response.request.meta['subject_info']

        subject_id = response.css('::attr(id)').extract_first().strip()
        for course in response.css('course'):
            course_name = course.css('course::text').extract_first().strip()
            course_link = course.css('course::attr(href)').extract_first().strip()
            course_id = course.css('course::attr(id)').extract_first().strip()
            if course_link is not None:
                course_data = scrapy.Request(course_link, callback=self.parse_class)

                # build a dict with the data in this page
                # + the data scraped previously
                course_info = {
                    'course_name': course_name,
                    'course_link': course_link,
                    'course_id': subject_id + " " + course_id,
                    'subject_info': subject_info,
                }
                # pass that data to the next callback
                course_data.meta['course_info'] = course_info

                # fetch the class page
                yield course_data

    def parse_class(self, response):
        # get course data from previous callbacks
        course_info = response.request.meta['course_info']

        course_id = response.css('::attr(id)').extract_first().strip()
        for section in response.css('section'):
            section_name = section.css('section::text').extract_first().strip()
            section_link = section.css('section::attr(href)').extract_first().strip()
            yield {
                'section_name': section_name,
                'section_link': section_link,
                'course_id': course_id,
                'course_info': course_info,
            }
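A side note on the meta pattern above: if you are on Scrapy 1.7 or later, the same hand-off can be done with the Request's cb_kwargs argument, which delivers the values to the callback as plain keyword arguments instead of going through meta. A minimal sketch of the first hop only (the rest follows the same pattern):

import scrapy

class CoursesSpider(scrapy.Spider):
    name = "courses"
    start_urls = ['http://example.com']

    def parse(self, response):
        for subject in response.css('subject'):
            subject_info = {
                'subject_name': subject.css('subject::text').extract_first(),
                'subject_link': subject.css('subject::attr(href)').extract_first(),
                'subject_id': subject.css('subject::attr(id)').extract_first(),
            }
            if subject_info['subject_link'] is not None:
                yield scrapy.Request(
                    subject_info['subject_link'],
                    callback=self.parse_course,
                    # handed to parse_course as a keyword argument
                    cb_kwargs={'subject_info': subject_info},
                )

    def parse_course(self, response, subject_info):
        # subject_info arrives as a regular parameter, no meta lookup needed
        pass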
So instead of subjects that contain courses, which themselves contain sections, you get sections, each of which carries information about the course it belongs to, which in turn carries information about the subject it covers.
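If you really want the nested tree, one option is to rebuild it from this flat output in a short post-processing step. A minimal sketch, assuming Approach 1 was run with "scrapy crawl courses -o courses.json" and each item carries the course_info/subject_info dicts shown above (the file names here are just examples):

import json

with open('courses.json') as f:
    sections = json.load(f)

subjects = {}
for item in sections:
    course_info = item['course_info']
    subject_info = course_info['subject_info']

    # create the subject node the first time we see its id
    subject = subjects.setdefault(subject_info['subject_id'], {
        **subject_info,
        'subject_data': {},
    })
    # create the course node under its subject the first time we see its id
    course = subject['subject_data'].setdefault(course_info['course_id'], {
        'course_name': course_info['course_name'],
        'course_link': course_info['course_link'],
        'course_id': course_info['course_id'],
        'course_data': [],
    })
    # attach the section itself
    course['course_data'].append({
        'section_name': item['section_name'],
        'section_link': item['section_link'],
        'course_id': item['course_id'],
    })

# turn the course lookup dicts back into lists for the final tree
tree = list(subjects.values())
for subject in tree:
    subject['subject_data'] = list(subject['subject_data'].values())

with open('courses_tree.json', 'w') as f:
    json.dump(tree, f, indent=2)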
Approach 2. (Warning: I have not tested this in practice, but it should work.)
import scrapy
from inline_requests import inline_requests

class CoursesSpider(scrapy.Spider):
    name = "courses"
    start_urls = [
        'http://example.com'
    ]

    # this decorator is important
    @inline_requests
    def parse(self, response):
        for subject in response.css('subject'):
            subject_name = subject.css('subject::text').extract_first().strip()
            subject_link = subject.css('subject::attr(href)').extract_first().strip()
            subject_id = subject.css('subject::attr(id)').extract_first().strip()

            # this list will collect information on courses for this subject
            subject_data = []

            if subject_link is not None:
                try:
                    # you ask Scrapy to fetch the page
                    # but you do not set a callback
                    subject_response = yield scrapy.Request(subject_link)
                    # and you get a Response to work on when it's fetched,
                    # without going through a callback
                    subject_id = subject_response.css('::attr(id)').extract_first().strip()
                    for course in subject_response.css('course'):
                        course_name = course.css('course::text').extract_first().strip()
                        course_link = course.css('course::attr(href)').extract_first().strip()
                        course_id = course.css('course::attr(id)').extract_first().strip()

                        # this list will collect information on sections for this course
                        course_data = []

                        if course_link is not None:
                            try:
                                # same thing here, you ask Scrapy to fetch a Response
                                course_response = yield scrapy.Request(course_link)
                                course_id = course_response.css('::attr(id)').extract_first().strip()
                                for section in course_response.css('section'):
                                    section_name = section.css('section::text').extract_first().strip()
                                    section_link = section.css('section::attr(href)').extract_first().strip()
                                    # add each section item
                                    course_data.append({
                                        'section_name': section_name,
                                        'section_link': section_link,
                                        'course_id': course_id,
                                    })
                            except:
                                raise

                        # add each course item
                        subject_data.append({
                            'course_name': course_name,
                            'course_link': course_link,
                            'course_id': subject_id + " " + course_id,
                            'course_data': course_data,
                        })
                except:
                    raise

            yield {
                'subject_name': subject_name,
                'subject_link': subject_link,
                'subject_id': subject_id,
                'subject_data': subject_data,
            }
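Either way, the spider is run exactly as before; for Approach 2 the only extra step is installing the package (assuming the usual PyPI name, scrapy-inline-requests):

pip install scrapy-inline-requests
scrapy crawl courses -o courses.json

With this approach each exported item is a complete subject with its nested courses and sections, which is the tree structure you asked for.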