在 Python 中批量下载图片

Download bulk images in python

看了一个讲解如何用 Python 下载图片的视频后,我照着视频输入了代码,代码如下:

import pandas as pd
import urllib.request

def url_to_jpg(i, url, file_path):
    """Download ``url`` and save it as ``image-<i>.jpg`` under ``file_path``.

    Args:
        i: Index used to build the file name ``image-<i>.jpg``.
        url: Value handed to ``urllib.request.urlretrieve``; it has to be a
            URL string.
        file_path: Prefix prepended to the file name as-is, so it needs to
            end with a path separator (e.g. ``'Images/'``).

    Returns:
        None.
    """
    filename = 'image-{}.jpg'.format(i)
    # Plain string concatenation: no separator is inserted between the parts.
    fullpath = '{}{}'.format(file_path, filename)
    print(fullpath)
    # Fetch the resource and write it to fullpath.
    urllib.request.urlretrieve(url, fullpath)
    print('{} saved.'.format(filename))
    return None

# CSV file that is read for the image URLs, and the prefix under which the
# downloaded images are stored.
FILENAME = 'Images URLs.csv'
FILE_PATH = 'Images/'
urls = pd.read_csv(FILENAME)

# NOTE: iterating over urls.values yields one array per CSV row, not a string.
# Passing that array straight through as `url` is what makes urlretrieve()
# raise the TypeError reported in this question; the URL itself is the row's
# first element (url[0]).
for i, url in enumerate(urls.values):
    url_to_jpg(i, url, FILE_PATH)

测试代码时,在 urllib.request.urlretrieve(url, fullpath) 这一行遇到了错误,输出如下:

Images/image-0.jpg
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-36-d92ed57d1d8e> in <module>
     15 
     16 for i, url in enumerate(urls.values):
---> 17     url_to_jpg(i, url, FILE_PATH)

<ipython-input-36-d92ed57d1d8e> in url_to_jpg(i, url, file_path)
      6     fullpath = '{}{}'.format(file_path, filename)
      7     print(fullpath)
----> 8     urllib.request.urlretrieve(url, fullpath)
      9     print('{} saved.'.format(filename))
     10     return None

C:\ProgramData\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
    243     data file as well as the resulting HTTPMessage object.
    244     """
--> 245     url_type, path = _splittype(url)
    246 
    247     with contextlib.closing(urlopen(url, data)) as fp:

C:\ProgramData\Anaconda3\lib\urllib\parse.py in _splittype(url)
   1006         _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)
   1007 
-> 1008     match = _typeprog.match(url)
   1009     if match:
   1010         scheme, data = match.groups()

TypeError: cannot use a string pattern on a bytes-like object

关于这个错误有什么想法吗?

** 我找到了解决方案:把这一行改为 url_to_jpg(i, url[0], FILE_PATH)

但似乎有些链接不允许访问,因为我又遇到了另一个错误:HTTPError: HTTP Error 403: Forbidden。我该如何解决这个问题?

** 我尝试按照建议添加请求头 headers(User-Agent),但不知道怎样做才正确。在这种情况下该如何使用 urlretrieve?

import urllib.request

# Request headers that make the download look like it comes from a browser.
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}

# urlretrieve() takes only a URL string: it can neither carry per-request
# headers nor accept the bytes of a response that was already read. Send the
# Request through urlopen() once and write the body to the file instead.
request = urllib.request.Request("http://www.gunnerkrigg.com//comics/00000001.jpg", headers=hdr)
with urllib.request.urlopen(request) as response:
    print(response)
    # Binary mode: the body is raw image data, not text.
    with open('oo.jpg', 'wb') as image_file:
        image_file.write(response.read())

下面的代码可以帮助您解决 HTTPError: HTTP Error 403: Forbidden(做法是让请求带上类似浏览器的 User-Agent 请求头)。

这是在您的代码基础上添加了请求头(header)的版本。

import pandas as pd
import urllib.request

# Some servers answer "403 Forbidden" unless the request looks like it comes
# from a browser, so every request is sent with this User-agent header.
BROWSER_USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'

opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', BROWSER_USER_AGENT)]

# Installing the opener globally (once) makes later urlopen()/urlretrieve()
# calls go through it.
urllib.request.install_opener(opener)

def url_to_jpg(i, url, file_path):
    """Download ``url`` and save it as ``image-<i>.jpg`` inside ``file_path``.

    Args:
        i: Index used to build the file name ``image-<i>.jpg``.
        url: URL string of the image to download.
        file_path: Destination directory, with or without a trailing path
            separator. It is created if it does not exist yet; an empty
            string means the current working directory.

    Returns:
        None.

    Raises:
        urllib.error.URLError: If the download fails (``HTTPError`` for HTTP
            error statuses such as 403).
        OSError: If the directory or the target file cannot be written.
    """
    import os  # local import: the file's import block does not provide os

    filename = 'image-{}.jpg'.format(i)
    if file_path:
        # urlretrieve() does not create missing directories on its own.
        os.makedirs(file_path, exist_ok=True)
    # join() inserts a separator only when file_path lacks one, so both
    # 'Images' and 'Images/' end up as 'Images/image-<i>.jpg'.
    fullpath = os.path.join(file_path, filename)
    print(fullpath)
    urllib.request.urlretrieve(url, fullpath)
    print('{} saved.'.format(filename))

# Input CSV and the destination prefix handed to url_to_jpg.
FILENAME = 'Images URLs.csv'
FILE_PATH = 'Images/'
urls = pd.read_csv(FILENAME)

# Each item of urls.values is one row array; its first element is passed on
# as the URL and the row position becomes the image index.
for index, row in enumerate(urls.values):
    image_url = row[0]
    url_to_jpg(index, image_url, FILE_PATH)