python 异步图片下载(多个 url)
python asynchronous images download (multiple urls)
我正在学习 Python 4/5 个月,这是我从头开始构建的第三个项目,但我无法独自解决这个问题。
此脚本为每个给定的 url 下载 1 张图像。
我无法找到有关如何在此脚本中实现线程池执行器或异步的解决方案。我不知道如何将带有图像编号的 link url 保存到图像部分。
我为我需要下载的所有 url 构建了一个字典,但我如何实际保存具有正确名称的图像?
还有其他建议吗?
PS：目前出现的 url 都是假的。
同步版本:
import requests
import argparse
import re
import os
import logging
from bs4 import BeautifulSoup
parser = argparse.ArgumentParser()
parser.add_argument("-n", "--num", help="Book number", type=int, required=True)
parser.add_argument("-p", dest=r"path_name", default=r"F:\Users3", help="Save to dir", )
args = parser.parse_args()
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.ERROR)
logger = logging.getLogger(__name__)
def get_parser(url_c):
    """Fetch the gallery's first page and return it parsed as BeautifulSoup.

    Parameters
    ----------
    url_c : int
        Book number used to build the gallery URL (e.g. 241461).

    Returns
    -------
    BeautifulSoup
        Parsed HTML tree of the gallery's first page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    url = f'https://test.net/g/{url_c}/1'
    logger.info(f'Main url: {url_c}')
    # timeout so a stalled connection raises instead of hanging forever
    responce = requests.get(url, timeout=5)
    responce.raise_for_status()
    # Bug fix: the original issued a second, redundant GET just to read the
    # body; reuse the content of the response we already have.
    return BeautifulSoup(responce.content, 'html.parser')
def get_locators(soup):  # take get_parser
    """Extract download metadata from a parsed gallery page.

    Parameters
    ----------
    soup : BeautifulSoup
        Tree returned by get_parser().

    Returns
    -------
    dict
        Keys: 'first_p' (first page number), 'last_p' (exclusive stop for
        range()), 'book_code' (internal gallery id), 'ext' ('png' or 'jpg'),
        'dir' (directory name for the book).
    """
    # Extract first/last page num
    first = int(soup.select_one('span.current').string)
    logger.info(f'First page: {first}')
    # +1 so the value can be passed directly as an exclusive range() stop
    last = int(soup.select_one('span.num-pages').string) + 1
    # Extract img_code and extension from the preview image's src attribute
    link = soup.find('img', {'class': 'fit-horizontal'}).attrs["src"]
    logger.info(f'Locator code: {link}')
    # Fix: raw string avoids invalid-escape DeprecationWarnings ('\/' etc.);
    # the pattern itself is unchanged.
    code = re.search(r'galleries.([0-9]+)\/.\.(\w{3})', link)
    book_code = code.group(1)  # internal code
    extension = code.group(2)  # png or jpg
    # extract Dir book name from the JSON blob embedded in a <script> tag
    pattern = re.compile(r'pretty":"(.*)"')
    found = soup.find('script', text=pattern)
    string = pattern.search(found.text).group(1)
    dir_name = string.split('"')[0]
    logger.info(f'Dir name: {dir_name}')
    logger.info(f'Hidden code: {book_code}')
    print(f'Extension: {extension}')
    print(f'Tot pages: {last}')
    print(f'')
    return {'first_p': first,
            'last_p': last,
            'book_code': book_code,
            'ext': extension,
            'dir': dir_name
            }
def setup_download_dir(path, dir):  # (args.path_name, locator['dir'])
    """Create (if missing) and return the download directory <path>/<dir>.

    Parameters
    ----------
    path : str
        Base directory (args.path_name).
    dir : str
        Book subdirectory name (locator['dir']).

    Returns
    -------
    str
        Full path to the (possibly just created) directory.
    """
    # Bug fix: the original passed ONE pre-joined f-string containing a raw
    # backslash to os.path.join, so no real joining happened and the result
    # was Windows-only. Join the two components portably instead.
    filepath = os.path.join(path, dir)
    if not os.path.exists(filepath):
        try:
            os.makedirs(filepath)
            print(f'Directory created at: {filepath}')
        except OSError as err:
            # Best-effort: report and fall through; callers get the path anyway.
            print(f"Can't create {filepath}: {err}")
    return filepath
def main(locator, filepath):
    """Download every page image of the book into filepath.

    Parameters
    ----------
    locator : dict
        Metadata dict returned by get_locators().
    filepath : str
        Destination directory (from setup_download_dir()).

    Raises
    ------
    requests.HTTPError
        If any image request answers with an error status.
    """
    for image_n in range(locator['first_p'], locator['last_p']):
        url = f"https://i.test.net/galleries/{locator['book_code']}/{image_n}.{locator['ext']}"
        logger.info(f'Url Img: {url}')
        responce = requests.get(url, timeout=3)
        responce.raise_for_status()  # raise on non-2xx
        # Bug fix: the original fetched the same URL a second time for the
        # body, and img_data was only bound inside the 200 branch. Reuse the
        # already-downloaded content instead.
        with open(os.path.join(filepath, f"{image_n}.{locator['ext']}"), 'wb') as handler:
            handler.write(responce.content)  # write image
        print(f'Img {image_n} - DONE')
# Script entry point: parse the gallery page, then download all of its images.
if __name__ == '__main__':
    try:
        locator = get_locators(get_parser(args.num))  # args.num ex. 241461
        main(locator, setup_download_dir(args.path_name, locator['dir']))
    except KeyboardInterrupt:
        # Allow a clean Ctrl-C abort without a traceback.
        print(f'Program aborted...' + '\n')
网址列表:
def img_links(locator):
    """Build the list of all image URLs for the book described by *locator*.

    Parameters
    ----------
    locator : dict
        Metadata dict with keys 'first_p', 'last_p', 'book_code', 'ext'.

    Returns
    -------
    list[str]
        One URL per page, in page order (empty if first_p >= last_p).
    """
    image_url = [
        f"https://i.test.net/galleries/{locator['book_code']}/{num}.{locator['ext']}"
        for num in range(locator['first_p'], locator['last_p'])
    ]
    # Fix: log the finished list once, instead of re-logging the growing
    # list on every iteration (accidental O(n^2) logging).
    logging.getLogger(__name__).info('Url List: %s', image_url)
    return image_url
我在《Fluent Python（流畅的 Python）》一书中找到了解决方案。这里是片段:
def download_many(cc_list, base_url, verbose, concur_req):
    """Download one item per code in cc_list with a bounded thread pool.

    NOTE(review): snippet quoted from "Fluent Python"; `download_one`,
    `tqdm` and `HTTPStatus` (presumably the book's own Enum, not
    http.HTTPStatus — confirm) are defined elsewhere and must be in scope.
    Returns a Counter of per-status result counts.
    """
    counter = collections.Counter()
    with futures.ThreadPoolExecutor(max_workers=concur_req) as executor:
        to_do_map = {}
        for cc in sorted(cc_list):
            # Schedule the download and remember which input each Future
            # belongs to, so errors can be reported per item below.
            future = executor.submit(download_one, cc, base_url, verbose)
            to_do_map[future] = cc
        # Yields futures in completion order, not submission order.
        done_iter = futures.as_completed(to_do_map)
        if not verbose:
            # as_completed() has no len(); pass the total for the progress bar.
            done_iter = tqdm.tqdm(done_iter, total=len(cc_list))
        for future in done_iter:
            try:
                res = future.result()
            except requests.exceptions.HTTPError as exc:
                # Two-step format: the literal is a template filled from the
                # response object attached to the exception.
                error_msg = 'HTTP {res.status_code} - {res.reason}'
                error_msg = error_msg.format(res=exc.response)
            except requests.exceptions.ConnectionError as exc:
                error_msg = 'Connection error'
            else:
                error_msg = ''
                status = res.status
            if error_msg:
                status = HTTPStatus.error
            counter[status] += 1
            if verbose and error_msg:
                cc = to_do_map[future]
                print('*** Error for {}: {}'.format(cc, error_msg))
    return counter
我正在学习 Python 4/5 个月,这是我从头开始构建的第三个项目,但我无法独自解决这个问题。
此脚本为每个给定的 url 下载 1 张图像。 我无法找到有关如何在此脚本中实现线程池执行器或异步的解决方案。我不知道如何将带有图像编号的 link url 保存到图像部分。 我为我需要下载的所有 url 构建了一个字典,但我如何实际保存具有正确名称的图像? 还有其他建议吗?
PS：目前出现的 url 都是假的。
同步版本:
"""Synchronous gallery downloader (reformatted from a collapsed one-line paste).

Fetches a gallery page, extracts the image locator data, then downloads
every page image into <path>/<dir_name>.
"""
import requests
import argparse
import re
import os
import logging
from bs4 import BeautifulSoup

# Command-line interface: required book number (-n) and optional save directory (-p).
parser = argparse.ArgumentParser()
parser.add_argument("-n", "--num", help="Book number", type=int, required=True)
parser.add_argument("-p", dest=r"path_name", default=r"F:\Users3", help="Save to dir", )
args = parser.parse_args()

logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    level=logging.ERROR)
logger = logging.getLogger(__name__)


def get_parser(url_c):
    """Fetch the gallery's first page and return it parsed as BeautifulSoup."""
    url = f'https://test.net/g/{url_c}/1'
    logger.info(f'Main url: {url_c}')
    responce = requests.get(url, timeout=5)  # timeout so a stall raises
    responce.raise_for_status()
    # Bug fix: reuse the fetched body instead of issuing a second GET.
    return BeautifulSoup(responce.content, 'html.parser')


def get_locators(soup):
    """Extract first/last page, internal book code, extension and dir name."""
    first = int(soup.select_one('span.current').string)
    logger.info(f'First page: {first}')
    last = int(soup.select_one('span.num-pages').string) + 1  # exclusive stop
    link = soup.find('img', {'class': 'fit-horizontal'}).attrs["src"]
    logger.info(f'Locator code: {link}')
    # raw string: avoids invalid-escape warnings; pattern unchanged
    code = re.search(r'galleries.([0-9]+)\/.\.(\w{3})', link)
    book_code = code.group(1)  # internal code
    extension = code.group(2)  # png or jpg
    pattern = re.compile(r'pretty":"(.*)"')
    found = soup.find('script', text=pattern)
    string = pattern.search(found.text).group(1)
    dir_name = string.split('"')[0]
    logger.info(f'Dir name: {dir_name}')
    logger.info(f'Hidden code: {book_code}')
    print(f'Extension: {extension}')
    print(f'Tot pages: {last}')
    print(f'')
    return {'first_p': first,
            'last_p': last,
            'book_code': book_code,
            'ext': extension,
            'dir': dir_name
            }


def setup_download_dir(path, dir):  # (args.path_name, locator['dir'])
    """Create (if missing) and return the download directory <path>/<dir>."""
    # Bug fix: join the components portably instead of a backslash f-string.
    filepath = os.path.join(path, dir)
    if not os.path.exists(filepath):
        try:
            os.makedirs(filepath)
            print(f'Directory created at: {filepath}')
        except OSError as err:
            print(f"Can't create {filepath}: {err}")
    return filepath


def main(locator, filepath):
    """Download every page image into filepath, named <page>.<ext>."""
    for image_n in range(locator['first_p'], locator['last_p']):
        url = f"https://i.test.net/galleries/{locator['book_code']}/{image_n}.{locator['ext']}"
        logger.info(f'Url Img: {url}')
        responce = requests.get(url, timeout=3)
        responce.raise_for_status()
        # Bug fix: write the body already downloaded; no second GET.
        with open(os.path.join(filepath, f"{image_n}.{locator['ext']}"), 'wb') as handler:
            handler.write(responce.content)
        print(f'Img {image_n} - DONE')


if __name__ == '__main__':
    try:
        locator = get_locators(get_parser(args.num))  # args.num ex. 241461
        main(locator, setup_download_dir(args.path_name, locator['dir']))
    except KeyboardInterrupt:
        print(f'Program aborted...' + '\n')
网址列表:
def img_links(locator):
    """Build the list of all image URLs for the book described by *locator*.

    Parameters
    ----------
    locator : dict
        Metadata dict with keys 'first_p', 'last_p', 'book_code', 'ext'.

    Returns
    -------
    list[str]
        One URL per page, in page order (empty if first_p >= last_p).
    """
    image_url = [
        f"https://i.test.net/galleries/{locator['book_code']}/{num}.{locator['ext']}"
        for num in range(locator['first_p'], locator['last_p'])
    ]
    # Fix: log the finished list once, not on every loop iteration.
    logging.getLogger(__name__).info('Url List: %s', image_url)
    return image_url
我在《Fluent Python（流畅的 Python）》一书中找到了解决方案。这里是片段:
def download_many(cc_list, base_url, verbose, concur_req):
    """Download one item per code in cc_list with a bounded thread pool.

    NOTE(review): snippet quoted from "Fluent Python"; `download_one`,
    `tqdm` and `HTTPStatus` (presumably the book's own Enum, not
    http.HTTPStatus — confirm) are defined elsewhere and must be in scope.
    Returns a Counter of per-status result counts.
    """
    counter = collections.Counter()
    with futures.ThreadPoolExecutor(max_workers=concur_req) as executor:
        to_do_map = {}
        for cc in sorted(cc_list):
            # Map each Future back to its input for per-item error reporting.
            future = executor.submit(download_one, cc, base_url, verbose)
            to_do_map[future] = cc
        # Yields futures in completion order, not submission order.
        done_iter = futures.as_completed(to_do_map)
        if not verbose:
            # as_completed() has no len(); pass the total for the progress bar.
            done_iter = tqdm.tqdm(done_iter, total=len(cc_list))
        for future in done_iter:
            try:
                res = future.result()
            except requests.exceptions.HTTPError as exc:
                # Two-step format: template filled from the exception's response.
                error_msg = 'HTTP {res.status_code} - {res.reason}'
                error_msg = error_msg.format(res=exc.response)
            except requests.exceptions.ConnectionError as exc:
                error_msg = 'Connection error'
            else:
                error_msg = ''
                status = res.status
            if error_msg:
                status = HTTPStatus.error
            counter[status] += 1
            if verbose and error_msg:
                cc = to_do_map[future]
                print('*** Error for {}: {}'.format(cc, error_msg))
    return counter