Scrapy: How to efficiently follow nested links with similar css selectors?
I have something similar to the code below. I know that in this example it is possible to navigate directly to the "yourself" tag page, but in my application I need to go to page 1 to get the links to page 2, I need the links on page 2 to get to page 3, and so on (i.e. the URLs don't follow a specific pattern).
import scrapy


class SampleSpider(scrapy.Spider):
    name = "sample"
    start_urls = [
        "https://quotes.toscrape.com/",
    ]

    def parse(self, response):
        links = response.css(
            'a[class="tag"][href*=inspirational]::attr(href)'
        ).extract()
        for link in links:
            yield response.follow(link, self.parse_inspirational)

    def parse_inspirational(self, response):
        links = response.css('a[class="tag"][href*=life]::attr(href)').extract()
        for link in links:
            yield response.follow(link, self.parse_life)

    def parse_life(self, response):
        links = response.css('a[class="tag"][href*=yourself]::attr(href)').extract()
        for link in links:
            yield response.follow(link, self.parse_yourself)

    def parse_yourself(self, response):
        for resp in response.css('span[itemprop="text"]::text').extract():
            print(resp)
Since the same pattern of following a link and looking for a new css pattern is repeated three times, I'd like to write a function that iterates over a list of css strings and yields the responses recursively. This is what I came up with, but it doesn't work. I expect it to print the same output as the original/long-version code:
    def parse_recurse(self, response, css_str=None):
        links = response.css(css_str.pop(0)).extract()
        for link in links:
            yield response.follow(link, callback=self.parse_recurse, cb_kwargs={"css_str": css_str})

    def parse(self, response):
        css = ['a[class="tag"][href*=inspirational]::attr(href)',
               'a[class="tag"][href*=life]::attr(href)',
               'a[class="tag"][href*=yourself]::attr(href)']
        response = self.parse_recurse(response, css_str=css)
        for resp in response.css('span[itemprop="text"]::text').extract():
            print(resp)
You can't do response = self.parse_recurse(...), because parse_recurse only yields a request, not a response.

Normally the function yields a request, Scrapy catches it and passes the request to the engine, the engine later sends the request to the server, gets the response back from the server, and executes the callback with that response.

See the details in the documentation: Architecture overview
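A minimal sketch of what goes wrong (my own illustration, not from the original answer), assuming the question's parse_recurse and css list, and that the first selector matches at least one link:

# inside the question's parse(), after building the css list:
gen = self.parse_recurse(response, css_str=css)  # just a generator object, nothing is downloaded
first = next(gen)                                # a scrapy.Request, not a Response
first.css('span[itemprop="text"]::text')         # AttributeError: Request objects have no .css()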
You have to use start_requests to run parse_recurse with the list css, and parse_recurse should check whether css is empty. If css is not empty, it yields requests with the callback parse_recurse and a smaller css (so it runs recursively). If css is empty, it yields requests with the callback parse, which gets the text.
import scrapy


class SampleSpider(scrapy.Spider):
    name = "sample"
    start_urls = ["https://quotes.toscrape.com/"]

    road = [
        'a[class="tag"][href*=inspirational]::attr(href)',
        'a[class="tag"][href*=life]::attr(href)',
        'a[class="tag"][href*=yourself]::attr(href)',
    ]

    def start_requests(self):
        """Run the starting URLs with the full road."""
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse_recurse, cb_kwargs={"road": self.road})

    def parse_recurse(self, response, road):
        """If road is not empty then send to parse_recurse with a smaller road.
        If road is empty then send to parse."""
        first = road[0]
        rest = road[1:]
        links = response.css(first).extract()
        if rest:
            # repeat recursion
            for link in links:
                yield response.follow(link, callback=self.parse_recurse, cb_kwargs={"road": rest})
        else:
            # exit recursion
            for link in links:
                yield response.follow(link, callback=self.parse)

    def parse(self, response):
        for resp in response.css('span[itemprop="text"]::text').extract():
            print(resp)


# --- run without a project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save to a CSV, JSON or XML file
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(SampleSpider)
c.start()
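One detail worth noting: the FEEDS export only writes items that the spider yields, and parse above only prints, so output.csv would stay empty. A small tweak (my suggestion, not part of the original answer) is to yield dicts instead:

    def parse(self, response):
        # yield items instead of printing, so the FEEDS setting above
        # actually writes each quote text as a row in output.csv
        for text in response.css('span[itemprop="text"]::text').extract():
            yield {"text": text}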