How can I use multiprocessing to speed up bs4 scraping and image downloading
So I have this code:
from bs4 import *
import requests
import os
import pandas

df = pandas.read_csv(r'C:\Users\fani\Desktop\History.csv')

folder_name = "downloadedpics"
os.mkdir(folder_name)
z = 1

for j in df['url']:
    # DOWNLOAD ALL IMAGES FROM THAT URL
    def download_images(images, folder_name):
        # initial count is zero
        count = 0
        # print total images found in URL
        print(f"Total {len(images)} Image Found!")
        # checking if images is not zero
        if len(images) != 0:
            for i, image in enumerate(images):
                # From the img tag, fetch the image source URL, trying in order:
                #   1. data-srcset
                #   2. data-src
                #   3. data-fallback-src
                #   4. src
                # Here we use exception handling:
                # first we search for "data-srcset" in the img tag
                try:
                    image_link = image["data-srcset"]
                # then we search for "data-src" in the img tag, and so on
                except:
                    try:
                        image_link = image["data-src"]
                    except:
                        try:
                            image_link = image["data-fallback-src"]
                        except:
                            try:
                                image_link = image["src"]
                            # if no source URL found
                            except:
                                pass
                # After getting the image source URL,
                # try to get the content of the image
                try:
                    r = requests.get(image_link).content
                    with open(f"{folder_name}/{z}images{i + 1}.jpg", "wb+") as f:
                        f.write(r)
                    # counting the number of images downloaded
                    count += 1
                except:
                    pass
        # It is possible that not all images were downloaded
        if count == len(images):
            print("All Images Downloaded!")
        else:
            print(f"Total {count} Images Downloaded Out of {len(images)}")

    # MAIN FUNCTION START
    def main(url):
        # content of URL
        r = requests.get(url)
        # parse HTML code
        soup = BeautifulSoup(r.text, 'html.parser')
        # find all images in URL
        images = soup.findAll('img', class_='pannable-image')
        download_images(images, folder_name)

    # take url
    url = j
    # CALL MAIN FUNCTION
    main(url)
    print(z)
    z = z + 1
It scrapes a bunch of URLs (listed in History.csv) and downloads some images from them.
The only problem is that it is really slow for such a simple task.
What is the correct way to implement multiprocessing to speed it up?
I'm a newbie and I don't know how multiprocessing works.
Edit:
Here is the csv file:
mega link
The code is supposed to download about 12000 images from 1648 web pages (the gallery sections of that e-commerce site's pages), which amounts to roughly 1 GB of data.
Since you are already using the requests package, the obvious way to proceed is with multithreading rather than with asyncio, which would require you to abandon requests and learn aiohttp.
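(For comparison, here is a minimal, untested sketch of what that asyncio/aiohttp route would look like; the 'pannable-image' class is carried over from your code, and the URL list is a placeholder:)

import asyncio
import aiohttp
from bs4 import BeautifulSoup

async def fetch(session, url):
    # fetch one page's HTML
    async with session.get(url) as resp:
        return await resp.text()

async def scrape(urls):
    # fetch all pages concurrently, then parse each one
    async with aiohttp.ClientSession() as session:
        pages = await asyncio.gather(*(fetch(session, u) for u in urls))
    for page in pages:
        soup = BeautifulSoup(page, 'html.parser')
        print(len(soup.find_all('img', class_='pannable-image')))

# asyncio.run(scrape(urls))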
I have restructured the code considerably, but since I could not access your CSV file I was unable to test it, so I strongly suggest you review what I have done and try to understand it as best you can by reading the Python documentation on the various classes and methods that are unfamiliar to you. What I don't understand is why, when you retrieve an image file, you attempt to decode it. I suppose you expect that to raise an error, but it seems like a waste of time.
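(If the point of that decode was to skip responses that are actually HTML error pages rather than image bytes, a cheaper check would be the response's Content-Type header; a sketch, where save_if_image is a hypothetical helper:)

def save_if_image(session, image_link, path):
    # skip responses that are not images (e.g. an HTML error page)
    # by inspecting the Content-Type header instead of decoding bytes
    r = session.get(image_link)
    if r.headers.get('Content-Type', '').startswith('image/'):
        with open(path, 'wb') as f:
            f.write(r.content)
        return 1  # downloaded
    return 0  # skipped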
I have arbitrarily set the multithreading pool size to 100 (multithreading can easily handle a pool size several times larger, although asyncio can handle many thousands of concurrent tasks). Set N_THREADS to the number of URLs multiplied by the average number of images you need to download per URL, but no larger than 500.
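(To make that rule of thumb concrete with the numbers from your question, 1648 URLs and roughly 12000 images, i.e. about 7 images per URL; suggested_pool_size is just a hypothetical helper to show the arithmetic:)

def suggested_pool_size(n_urls, n_images, cap=500):
    # number of URLs times the average images per URL,
    # which is simply the total image count, capped at 500
    avg_images_per_url = n_images / n_urls  # ~7.3 here
    return min(round(n_urls * avg_images_per_url), cap)

print(suggested_pool_size(1648, 12000))  # -> 500 (the cap applies)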
from bs4 import BeautifulSoup
import requests
import os
import pandas
from multiprocessing.pool import ThreadPool
from functools import partial
from threading import Lock


class FileIndex:
    """
    Increment and return the next index to use for creating a file,
    in a thread-safe way.
    """
    def __init__(self):
        self._lock = Lock()
        self._file_index = 0

    @property
    def next_file_index(self):
        with self._lock:
            self._file_index += 1
            return self._file_index


# DOWNLOAD AN IMAGE FROM THAT URL
def download_image(image, session, file_index, folder_number, folder_name):
    # From the img tag, fetch the image source URL, trying in order:
    #   1. data-srcset
    #   2. data-src
    #   3. data-fallback-src
    #   4. src
    # Here we use exception handling:
    # first we search for "data-srcset" in the img tag
    try:
        image_link = image["data-srcset"]
    # then we search for "data-src" in the img tag, and so on
    except:
        try:
            image_link = image["data-src"]
        except:
            try:
                image_link = image["data-fallback-src"]
            except:
                try:
                    image_link = image["src"]
                # if no source URL found
                except:
                    return 0  # no image loaded
    # After getting the image source URL,
    # try to get the content of the image
    try:
        r = session.get(image_link).content
        # Why are you trying to decode an image?
        try:
            # possibility of decode
            r = str(r, 'utf-8')
            return 0  # no error, return 0 ?????
        except UnicodeDecodeError:
            # After the check above, the image download starts
            with open(f"{folder_name}/{folder_number}images{file_index.next_file_index}.jpg", "wb+") as f:
                f.write(r)
            return 1  # 1 downloaded
    except:
        return 0  # 0 downloaded


# download_url FUNCTION START
def download_url(folder_number, url, session, folder_name, thread_pool):
    # content of URL
    r = session.get(url)
    # parse HTML code
    soup = BeautifulSoup(r.text, 'html.parser')
    # find all images in URL
    images = soup.findAll('img', class_='pannable-image')
    # download each image on this page using the image thread pool
    worker = partial(download_image,
                     session=session,
                     file_index=FileIndex(),
                     folder_number=folder_number,
                     folder_name=folder_name)
    counts = thread_pool.map(worker, images)
    total_counts = sum(counts)
    if total_counts == len(images):
        print(f"All Images Downloaded for URL {url}!")
    else:
        print(f"Total {total_counts} Images Downloaded Out of {len(images)} for URL {url}")


# The real main function:
def main():
    df = pandas.read_csv(r'C:\Users\fani\Desktop\History.csv')
    folder_name = "downloadedpics"
    os.mkdir(folder_name)
    N_THREADS_URLS = 50  # or some suitable size for retrieving URLs
    N_THREADS_IMAGES = 500  # or some suitable size for retrieving images
    # use a session for efficiency:
    with requests.Session() as session, \
            ThreadPool(N_THREADS_URLS) as thread_pool_urls, \
            ThreadPool(N_THREADS_IMAGES) as thread_pool_images:
        worker = partial(download_url,
                         session=session,
                         folder_name=folder_name,
                         thread_pool=thread_pool_images)
        # iterate over the URL column (enumerating df itself would yield column names)
        results = thread_pool_urls.starmap(worker, enumerate(df['url']))


if __name__ == '__main__':
    main()
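A note on the design: ThreadPool is the thread-based counterpart of multiprocessing.Pool, and threads rather than processes are the right fit here because the work is I/O-bound. If you prefer the more commonly documented concurrent.futures API, the outer pool could equivalently be written as follows (a sketch reusing download_url, session, folder_name, thread_pool_images and df from the code above):

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=50) as executor:
    # one task per (folder_number, url) pair, mirroring starmap above
    futures = [executor.submit(download_url, n, url, session,
                               folder_name, thread_pool_images)
               for n, url in enumerate(df['url'])]
    for fut in futures:
        fut.result()  # propagate any exception raised in a worker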