Scrapy:如何在单独的函数中加载多个项目?
Scrapy: How to load multiple items in separate function?
我想在与 parse_product
分开的函数中处理和加载项目,在我的例子中是 prepare_item_download()
函数。但是,当我运行爬虫时,收到错误消息:回调需要返回 Request、BaseItem、字典或 None,而不是生成器。当我把这段逻辑直接保留在 parse_product
函数中时,它可以正常工作。
这是我的代码:
from scrapy import Request, Spider
from scrapy.loader import ItemLoader
from firmware.items import FirmwareItem
import re
class AvmSpider(Spider):
    """Spider that crawls AVM download directory listings and yields firmware items.

    Recursively follows sub-directory links; once it reaches a ``fritz.os``
    directory it extracts every ``.image`` firmware file together with its
    release date.
    """
    name = 'avm'
    start_urls = [
        'http://download.avm.de/fritzbox/',
        'http://download.avm.de/fritzwlan/',
        'http://download.avm.de/fritzpowerline/'
    ]

    # parse top-level pages
    def parse(self, response):
        """Follow each product link found on a top-level vendor page."""
        for product_url in self.link_extractor(response=response, prefix=('beta', 'tools', 'license', '..')):
            yield Request(url=product_url, callback=self.parse_product)

    # parse each product; call self as long as not in fritz.os directory
    def parse_product(self, response):
        """Descend into product directories until a fritz.os directory is reached."""
        path = response.request.url.split('/')[:-1]
        if path[-1] == 'fritz.os':
            # BUG FIX: prepare_item_download is a generator function, so it
            # must be delegated with ``yield from``. A bare ``yield`` hands
            # Scrapy the generator object itself, triggering
            # "must return Request, BaseItem, dict or None, got generator".
            yield from self.prepare_item_download(response, path)
        else:
            for sub in self.link_extractor(response=response, prefix=('recover', '..')):
                yield Request(url=response.urljoin(sub), callback=self.parse_product)

    # get release dates, populate and load item
    def prepare_item_download(self, response, path: list):
        """Yield one loaded FirmwareItem per ``.image`` link on the page.

        ``path`` is the request URL split on '/' (a list of components) --
        the original ``path: str`` annotation was incorrect.
        """
        release_dates = self.date_extractor(response)
        for index, file_url in enumerate(self.link_extractor(response=response, prefix='..')):
            if file_url.endswith('.image'):
                loader = ItemLoader(item=FirmwareItem(), selector=file_url)
                loader.add_value('file_urls', file_url)
                loader.add_value('vendor', 'avm')
                loader.add_value('device_name', path[-3])
                loader.add_value('device_class', path[-4])
                loader.add_value('release_date', release_dates[index])
                yield loader.load_item()
        # BUG FIX: the trailing ``yield None`` was removed -- it injected a
        # spurious None into the item pipeline on every fritz.os page.

    # return all links which do not start with a certain prefix
    @staticmethod
    def link_extractor(response, prefix) -> list:
        """Return absolute URLs for all hrefs that do not start with *prefix*."""
        return [response.urljoin(p) for p in response.xpath('//a/@href').extract() if not p.startswith(prefix)]

    # return release dates of all images listed on current page
    @staticmethod
    def date_extractor(response) -> list:
        """Return every 'DD-MMM-YYYY HH:MM' timestamp found in <pre> text nodes."""
        release_dates = list()
        for text in response.xpath('//pre/text()').extract():
            match = re.search(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})', text)
            if match:
                release_dates.append(match.group(1))
        return release_dates
以上代码无效。您可以在下面看到工作代码:
def parse_product(self, response):
    """Parse a product directory page.

    Inside a ``fritz.os`` directory, build and yield a FirmwareItem for
    every ``.image`` link; otherwise follow sub-directory links back into
    this same callback.
    """
    path = response.request.url.split('/')[:-1]
    if path[-1] != 'fritz.os':
        # Not at the firmware level yet -- keep descending.
        for sub in self.link_extractor(response=response, prefix=('recover', '..')):
            yield Request(url=response.urljoin(sub), callback=self.parse_product)
        return
    release_dates = self.date_extractor(response)
    for idx, link in enumerate(self.link_extractor(response=response, prefix='..')):
        if not link.endswith('.image'):
            continue
        item_loader = ItemLoader(item=FirmwareItem(), selector=link)
        item_loader.add_value('file_urls', link)
        item_loader.add_value('vendor', 'avm')
        item_loader.add_value('device_name', path[-3])
        item_loader.add_value('device_class', path[-4])
        item_loader.add_value('release_date', release_dates[idx])
        yield item_loader.load_item()
你的 parse_product
函数产出的是一个生成器对象:
def parse_product(self, response):
path = response.request.url.split('/')[:-1]
if path[-1] == 'fritz.os':
yield self.prepare_item_download(response, path)
^^^^^^^^^^^^^
相反,您应该使用 yield from
语句来委托(展开)该生成器:
def parse_product(self, response):
path = response.request.url.split('/')[:-1]
if path[-1] == 'fritz.os':
yield from self.prepare_item_download(response, path)
^^^^
# or for python <3.3
for item in self.prepare_item_download(response, path):
yield item
我想在与 parse_product
分开的函数中处理和加载项目,在我的例子中是 prepare_item_download()
函数。但是,当我运行爬虫时,收到错误消息:回调需要返回 Request、BaseItem、字典或 None,而不是生成器。当我把这段逻辑直接保留在 parse_product
函数中时,它可以正常工作。
这是我的代码:
from scrapy import Request, Spider
from scrapy.loader import ItemLoader
from firmware.items import FirmwareItem
import re
class AvmSpider(Spider):
    """Spider that crawls AVM download directory listings and yields firmware items.

    Recursively follows sub-directory links; once it reaches a ``fritz.os``
    directory it extracts every ``.image`` firmware file together with its
    release date.
    """
    name = 'avm'
    start_urls = [
        'http://download.avm.de/fritzbox/',
        'http://download.avm.de/fritzwlan/',
        'http://download.avm.de/fritzpowerline/'
    ]

    # parse top-level pages
    def parse(self, response):
        """Follow each product link found on a top-level vendor page."""
        for product_url in self.link_extractor(response=response, prefix=('beta', 'tools', 'license', '..')):
            yield Request(url=product_url, callback=self.parse_product)

    # parse each product; call self as long as not in fritz.os directory
    def parse_product(self, response):
        """Descend into product directories until a fritz.os directory is reached."""
        path = response.request.url.split('/')[:-1]
        if path[-1] == 'fritz.os':
            # BUG FIX: prepare_item_download is a generator function, so it
            # must be delegated with ``yield from``. A bare ``yield`` hands
            # Scrapy the generator object itself, triggering
            # "must return Request, BaseItem, dict or None, got generator".
            yield from self.prepare_item_download(response, path)
        else:
            for sub in self.link_extractor(response=response, prefix=('recover', '..')):
                yield Request(url=response.urljoin(sub), callback=self.parse_product)

    # get release dates, populate and load item
    def prepare_item_download(self, response, path: list):
        """Yield one loaded FirmwareItem per ``.image`` link on the page.

        ``path`` is the request URL split on '/' (a list of components) --
        the original ``path: str`` annotation was incorrect.
        """
        release_dates = self.date_extractor(response)
        for index, file_url in enumerate(self.link_extractor(response=response, prefix='..')):
            if file_url.endswith('.image'):
                loader = ItemLoader(item=FirmwareItem(), selector=file_url)
                loader.add_value('file_urls', file_url)
                loader.add_value('vendor', 'avm')
                loader.add_value('device_name', path[-3])
                loader.add_value('device_class', path[-4])
                loader.add_value('release_date', release_dates[index])
                yield loader.load_item()
        # BUG FIX: the trailing ``yield None`` was removed -- it injected a
        # spurious None into the item pipeline on every fritz.os page.

    # return all links which do not start with a certain prefix
    @staticmethod
    def link_extractor(response, prefix) -> list:
        """Return absolute URLs for all hrefs that do not start with *prefix*."""
        return [response.urljoin(p) for p in response.xpath('//a/@href').extract() if not p.startswith(prefix)]

    # return release dates of all images listed on current page
    @staticmethod
    def date_extractor(response) -> list:
        """Return every 'DD-MMM-YYYY HH:MM' timestamp found in <pre> text nodes."""
        release_dates = list()
        for text in response.xpath('//pre/text()').extract():
            match = re.search(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})', text)
            if match:
                release_dates.append(match.group(1))
        return release_dates
以上代码无效。您可以在下面看到工作代码:
def parse_product(self, response):
    """Parse a product directory page.

    Inside a ``fritz.os`` directory, build and yield a FirmwareItem for
    every ``.image`` link; otherwise follow sub-directory links back into
    this same callback.
    """
    path = response.request.url.split('/')[:-1]
    if path[-1] != 'fritz.os':
        # Not at the firmware level yet -- keep descending.
        for sub in self.link_extractor(response=response, prefix=('recover', '..')):
            yield Request(url=response.urljoin(sub), callback=self.parse_product)
        return
    release_dates = self.date_extractor(response)
    for idx, link in enumerate(self.link_extractor(response=response, prefix='..')):
        if not link.endswith('.image'):
            continue
        item_loader = ItemLoader(item=FirmwareItem(), selector=link)
        item_loader.add_value('file_urls', link)
        item_loader.add_value('vendor', 'avm')
        item_loader.add_value('device_name', path[-3])
        item_loader.add_value('device_class', path[-4])
        item_loader.add_value('release_date', release_dates[idx])
        yield item_loader.load_item()
你的 parse_product
函数产出的是一个生成器对象:
def parse_product(self, response):
path = response.request.url.split('/')[:-1]
if path[-1] == 'fritz.os':
yield self.prepare_item_download(response, path)
^^^^^^^^^^^^^
相反,您应该使用 yield from
语句来委托(展开)该生成器:
def parse_product(self, response):
path = response.request.url.split('/')[:-1]
if path[-1] == 'fritz.os':
yield from self.prepare_item_download(response, path)
^^^^
# or for python <3.3
for item in self.prepare_item_download(response, path):
yield item