Scrapy reading urls as char values? (ValueError: Missing scheme in request url: h)
Scrapy reading urls as char values? (ValueError: Missing scheme in request url: h)
我正试图从一个维基网站的表格中下载图像,但每当我通过命令行运行项目时,它都会报错:"ValueError: Missing scheme in request url: h"。
zhimagespider.py
# -*- coding: utf-8 -*-
import scrapy
from zh.pipelines import ZhImagesPipeline
from zh.items import ImageItem
from utils import get_raw_image
class ZhImageSpider(scrapy.Spider):
    """Spider that yields one ImageItem per table row of the start page.

    For every ``<tr>`` after the first two, the text of the first cell is
    stored as ``image_id`` and the first remaining ``src`` value found in the
    second cell, passed through ``get_raw_image``, is stored as
    ``image_urls``.
    """

    name = 'zh'
    # NOTE(review): these entries are full URLs (scheme included). Scrapy's
    # documentation describes allowed_domains as bare domain names -- confirm.
    allowed_domains = ['https://zh.battlegirl.wikia.com',
                       'https://vignette.wikia.nocookie.net/']
    start_urls = ['https://zh.battlegirl.wikia.com/wiki/%E5%8D%A1%E7%89%87%E4%B8%80%E8%A6%BD']

    def parse(self, response):
        """Yield an ImageItem for each ``<tr>`` of the response except the first two."""
        # The [2:] slice skips the first two rows (presumably table headers).
        for row in response.xpath("//tr")[2:]:
            # Create the item that will carry this row's data
            item = ImageItem()
            # Text of the first cell; extract_first() gives None when absent.
            item['image_id'] = row.xpath('td[1]/text()').extract_first()
            # Collect every src attribute found inside the second cell
            icons = row.css('td:nth-child(2)').xpath('.//@src').extract()
            # Drop entries starting with 'd' (e.g. inline 'data:' URIs).
            # NOTE(review): removing from `icons` while iterating over it can
            # skip the element that directly follows a removed one.
            for icon in icons:
                if icon.startswith('d'): # Or 'data'
                    icons.remove(icon)
            # NOTE(review): get_raw_image() returns a single string, so
            # image_urls holds a str here, not a list. Code that iterates this
            # field (e.g. ZhImagesPipeline.get_media_requests) then receives
            # one character at a time, which is what leads to
            # "Missing scheme in request url: h". Wrapping the value in a
            # list -- [get_raw_image(icons[0])] -- yields whole URLs instead.
            # icons[0] also raises IndexError when no src value is left.
            item['image_urls'] = get_raw_image(icons[0])
            yield item
回溯样本
2017-10-28 22:48:37 [scrapy.core.scraper] ERROR: Error processing {'image_id': '1',
'image_urls': 'https://vignette.wikia.nocookie.net/battlegirl/images/8/86/Card_10011_s.png/revision/latest?cb=20160212023217&path-prefix=zh'}
Traceback (most recent call last):
File "C:\Miniconda3\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "C:\Miniconda3\lib\site-packages\scrapy\pipelines\media.py", line 79, in process_item
requests = arg_to_iter(self.get_media_requests(item, info))
File "C:\Miniconda3\lib\site-packages\scrapy\pipelines\images.py", line 152, in get_media_requests
return [Request(x) for x in item.get(self.images_urls_field, [])]
File "C:\Miniconda3\lib\site-packages\scrapy\pipelines\images.py", line 152, in <listcomp>
return [Request(x) for x in item.get(self.images_urls_field, [])]
File "C:\Miniconda3\lib\site-packages\scrapy\http\request\__init__.py", line 25, in __init__
self._set_url(url)
File "C:\Miniconda3\lib\site-packages\scrapy\http\request\__init__.py", line 58, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url: h
这是项目中的其他脚本:
items.py
# -*- coding: utf-8 -*-
import scrapy
class ImageItem(scrapy.Item):
    """Item that carries one table row's image data through the pipeline."""

    # Image URL(s) to download; set by the spider and iterated by
    # ZhImagesPipeline.get_media_requests.
    image_urls = scrapy.Field()
    # Not written by the code in this project; presumably kept for the stock
    # ImagesPipeline result field -- confirm before removing.
    images = scrapy.Field()
    # Text of the first cell of the table row the image came from.
    image_id = scrapy.Field()
    # Written by ZhImagesPipeline.item_completed. It has to be declared here
    # because scrapy.Item raises KeyError on assignment to an undeclared field.
    image_paths = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
class ZhImagesPipeline(ImagesPipeline):
    """Images pipeline that stores the paths of downloaded images on the item."""

    def get_media_requests(self, item, info):
        """Yield one download request per element of ``item['image_urls']``."""
        for url in item['image_urls']:
            yield scrapy.Request(url)

    def item_completed(self, results, item, info):
        """Record the paths of the successful downloads on ``item``.

        ``results`` is consumed as ``(ok, data)`` pairs; for every pair whose
        ``ok`` is truthy, ``data['path']`` is collected. The item class must
        declare an ``image_paths`` field for the assignment to succeed.

        Raises:
            DropItem: if no download succeeded.
        """
        stored_paths = []
        for succeeded, outcome in results:
            if succeeded:
                stored_paths.append(outcome['path'])

        if not stored_paths:
            raise DropItem("Item contains no images")

        item['image_paths'] = stored_paths
        return item
utils.py 只是一个辅助脚本,用来去掉图像 URL 中维基用于缩放图标尺寸的那一段:
def get_raw_image(url):
    """Return ``url`` with the last two path segments removed.

    The URL is split on ``?``. With exactly one ``?``, the last two
    ``/``-separated segments of the part before it are dropped and the query
    string is re-attached. A URL without ``?`` is returned unchanged.

    Raises:
        ValueError: if ``url`` contains more than one ``?``.
    """
    pieces = url.split('?')
    if len(pieces) > 2:
        raise ValueError('Not a resized Vignette image url: %s' % url)
    if len(pieces) == 1:
        # No query string: nothing to strip.
        return url

    path, query = pieces
    trimmed_path = "/".join(path.split("/")[:-2])
    return trimmed_path + "?" + query
看起来脚本把 URL 当成一个个字符来读取了,但我不确定为什么?
根据文档,`item['image_urls']` 必须是一个 list,而在您的代码中它被存成了一个字符串。因此,当管道遍历它时,实际上是在逐个字符地遍历这个以字母 `h` 开头的字符串;创建新的 `Request` 时,`image_url` 里只有单个字符 `h`,于是就出现了这个错误。把该值放进列表即可,例如 `item['image_urls'] = [get_raw_image(icons[0])]`。
我正试图从一个维基网站的表格中下载图像,但每当我通过命令行运行项目时,它都会报错:"ValueError: Missing scheme in request url: h"。
zhimagespider.py
# -*- coding: utf-8 -*-
import scrapy
from zh.pipelines import ZhImagesPipeline
from zh.items import ImageItem
from utils import get_raw_image
class ZhImageSpider(scrapy.Spider):
    """Spider that yields one ImageItem per table row of the start page.

    For every ``<tr>`` after the first two, the text of the first cell is
    stored as ``image_id`` and the first remaining ``src`` value found in the
    second cell, passed through ``get_raw_image``, is stored as
    ``image_urls``.
    """

    name = 'zh'
    # NOTE(review): these entries are full URLs (scheme included). Scrapy's
    # documentation describes allowed_domains as bare domain names -- confirm.
    allowed_domains = ['https://zh.battlegirl.wikia.com',
                       'https://vignette.wikia.nocookie.net/']
    start_urls = ['https://zh.battlegirl.wikia.com/wiki/%E5%8D%A1%E7%89%87%E4%B8%80%E8%A6%BD']

    def parse(self, response):
        """Yield an ImageItem for each ``<tr>`` of the response except the first two."""
        # The [2:] slice skips the first two rows (presumably table headers).
        for row in response.xpath("//tr")[2:]:
            # Create the item that will carry this row's data
            item = ImageItem()
            # Text of the first cell; extract_first() gives None when absent.
            item['image_id'] = row.xpath('td[1]/text()').extract_first()
            # Collect every src attribute found inside the second cell
            icons = row.css('td:nth-child(2)').xpath('.//@src').extract()
            # Drop entries starting with 'd' (e.g. inline 'data:' URIs).
            # NOTE(review): removing from `icons` while iterating over it can
            # skip the element that directly follows a removed one.
            for icon in icons:
                if icon.startswith('d'): # Or 'data'
                    icons.remove(icon)
            # NOTE(review): get_raw_image() returns a single string, so
            # image_urls holds a str here, not a list. Code that iterates this
            # field (e.g. ZhImagesPipeline.get_media_requests) then receives
            # one character at a time, which is what leads to
            # "Missing scheme in request url: h". Wrapping the value in a
            # list -- [get_raw_image(icons[0])] -- yields whole URLs instead.
            # icons[0] also raises IndexError when no src value is left.
            item['image_urls'] = get_raw_image(icons[0])
            yield item
回溯样本
2017-10-28 22:48:37 [scrapy.core.scraper] ERROR: Error processing {'image_id': '1', 'image_urls': 'https://vignette.wikia.nocookie.net/battlegirl/images/8/86/Card_10011_s.png/revision/latest?cb=20160212023217&path-prefix=zh'}
Traceback (most recent call last):
File "C:\Miniconda3\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "C:\Miniconda3\lib\site-packages\scrapy\pipelines\media.py", line 79, in process_item
requests = arg_to_iter(self.get_media_requests(item, info))
File "C:\Miniconda3\lib\site-packages\scrapy\pipelines\images.py", line 152, in get_media_requests
return [Request(x) for x in item.get(self.images_urls_field, [])]
File "C:\Miniconda3\lib\site-packages\scrapy\pipelines\images.py", line 152, in <listcomp>
return [Request(x) for x in item.get(self.images_urls_field, [])]
File "C:\Miniconda3\lib\site-packages\scrapy\http\request\__init__.py", line 25, in __init__
self._set_url(url)
File "C:\Miniconda3\lib\site-packages\scrapy\http\request\__init__.py", line 58, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url: h
这是项目中的其他脚本:
items.py
# -*- coding: utf-8 -*-
import scrapy
class ImageItem(scrapy.Item):
    """Item that carries one table row's image data through the pipeline."""

    # Image URL(s) to download; set by the spider and iterated by
    # ZhImagesPipeline.get_media_requests.
    image_urls = scrapy.Field()
    # Not written by the code in this project; presumably kept for the stock
    # ImagesPipeline result field -- confirm before removing.
    images = scrapy.Field()
    # Text of the first cell of the table row the image came from.
    image_id = scrapy.Field()
    # Written by ZhImagesPipeline.item_completed. It has to be declared here
    # because scrapy.Item raises KeyError on assignment to an undeclared field.
    image_paths = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
class ZhImagesPipeline(ImagesPipeline):
    """Images pipeline that stores the paths of downloaded images on the item."""

    def get_media_requests(self, item, info):
        """Yield one download request per element of ``item['image_urls']``."""
        for url in item['image_urls']:
            yield scrapy.Request(url)

    def item_completed(self, results, item, info):
        """Record the paths of the successful downloads on ``item``.

        ``results`` is consumed as ``(ok, data)`` pairs; for every pair whose
        ``ok`` is truthy, ``data['path']`` is collected. The item class must
        declare an ``image_paths`` field for the assignment to succeed.

        Raises:
            DropItem: if no download succeeded.
        """
        stored_paths = []
        for succeeded, outcome in results:
            if succeeded:
                stored_paths.append(outcome['path'])

        if not stored_paths:
            raise DropItem("Item contains no images")

        item['image_paths'] = stored_paths
        return item
utils.py 只是一个辅助脚本,用来去掉图像 URL 中维基用于缩放图标尺寸的那一段:
def get_raw_image(url):
    """Return ``url`` with the last two path segments removed.

    The URL is split on ``?``. With exactly one ``?``, the last two
    ``/``-separated segments of the part before it are dropped and the query
    string is re-attached. A URL without ``?`` is returned unchanged.

    Raises:
        ValueError: if ``url`` contains more than one ``?``.
    """
    pieces = url.split('?')
    if len(pieces) > 2:
        raise ValueError('Not a resized Vignette image url: %s' % url)
    if len(pieces) == 1:
        # No query string: nothing to strip.
        return url

    path, query = pieces
    trimmed_path = "/".join(path.split("/")[:-2])
    return trimmed_path + "?" + query
看起来脚本把 URL 当成一个个字符来读取了,但我不确定为什么?
`item['image_urls']` 必须是一个 list,而在您的代码中它被存成了一个字符串。因此,当管道遍历它时,实际上是在逐个字符地遍历这个以字母 `h` 开头的字符串;创建新的 `Request` 时,`image_url` 里只有单个字符 `h`,于是就出现了这个错误。把该值放进列表即可,例如 `item['image_urls'] = [get_raw_image(icons[0])]`。