使用集成 Python 的多处理中的 Pool.map 时,程序运行速度越来越慢

When using Pool.map from integrated Python's multiprocessing, program works slower and slower

这里有一个类似的问题

使用池的代码示例:

from multiprocessing import Pool
Pool(processes=6).map(some_func, array)

经过几次迭代后,程序变慢了,最终变得比没有多处理时还要慢。也许问题出在与 Selenium 相关的功能上?这是完整的代码:

# libraries
import os
from time import sleep
from bs4 import BeautifulSoup
from selenium import webdriver
from multiprocessing import Pool

# Required variables: base site URL and paths to the bundled ChromeDriver.
url = "https://eldorado.ua/"
directory = os.path.dirname(os.path.realpath(__file__))
# os.path.join avoids the invalid "\c" escape sequences produced by
# literals such as directory + "\chromedriver" on Python 3.
env_path = os.path.join(directory, "chromedriver")
chromedriver_path = os.path.join(env_path, "chromedriver.exe")

# Top-level catalogue categories mapped to their eldorado.ua listing URLs.
_category_pairs = [
    ("Смартфоны и телефоны", "https://eldorado.ua/node/c1038944/"),
    ("Телевизоры и аудиотехника", "https://eldorado.ua/node/c1038957/"),
    ("Ноутбуки, ПК и Планшеты", "https://eldorado.ua/node/c1038958/"),
    ("Техника для кухни", "https://eldorado.ua/node/c1088594/"),
    ("Техника для дома", "https://eldorado.ua/node/c1088603/"),
    ("Игровая зона", "https://eldorado.ua/node/c1285101/"),
    ("Гаджеты и аксесуары", "https://eldorado.ua/node/c1215257/"),
    ("Посуда", "https://eldorado.ua/node/c1039055/"),
    ("Фото и видео", "https://eldorado.ua/node/c1038960/"),
    ("Красота и здоровье", "https://eldorado.ua/node/c1178596/"),
    ("Авто и инструменты", "https://eldorado.ua/node/c1284654/"),
    ("Спорт и туризм", "https://eldorado.ua/node/c1218544/"),
    ("Товары для дома и сада", "https://eldorado.ua/node/c1285161/"),
    ("Товары для детей", "https://eldorado.ua/node/c1085100/"),
]
dict1 = dict(_category_pairs)


def openChrome_headless(url1, name):
    """Fetch *url1* in a fresh headless Chrome and save the page source
    to ``name + ".html"``.

    Fix: the original called ``driver.get`` outside the try/finally, so a
    navigation failure leaked the Chrome process.  Everything after driver
    creation now runs under try/finally so the browser is always quit.
    """
    options = webdriver.ChromeOptions()
    options.headless = True
    options.add_experimental_option("excludeSwitches", ['enable-automation'])
    options.add_argument(
        '--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36"')
    driver = webdriver.Chrome(executable_path=chromedriver_path, options=options)
    try:
        driver.get(url=url1)
        sleep(1)  # crude wait for the page to render
        with open(name + ".html", "w", encoding="utf-8") as file:
            file.write(driver.page_source)
    except Exception as ex:
        # Best-effort scrape: report and move on to the next page.
        print(ex)
    finally:
        # quit() ends the session and closes every window; the original's
        # extra close() before quit() was redundant.
        driver.quit()


def processing_goods_pages(name):
    """Parse each saved brand page of category *name* and download every
    product page it links to via openChrome_headless.

    Fix: the original built paths with literals like
    f"{directory}\\brand_pages\\{name}" where "\\b" is the BACKSPACE escape
    character, corrupting every path.  os.path.join is used instead (it is
    also portable across separators).
    """
    brand_dir = os.path.join(directory, "brand_pages", name)
    for n in os.listdir(brand_dir):
        with open(os.path.join(brand_dir, n), encoding="utf-8") as file:
            soup = BeautifulSoup(file.read(), "lxml")

        # n[:-5] strips the ".html" suffix of the saved brand page.
        page_dir = os.path.join(directory, "goods_pages", name, n[:-5])
        # makedirs creates both directory levels in one call and tolerates
        # directories that already exist.
        os.makedirs(page_dir, exist_ok=True)

        links = soup.find_all("header", class_="good-description")
        for li in links:
            ref = url + li.find('a').get('href')
            print(li.text)
            openChrome_headless(ref, os.path.join(page_dir, li.text))


if __name__ == "__main__":
    # Fan the 14 category names out over 6 worker processes.  The
    # with-block closes and joins the pool instead of leaking workers
    # (the original never called close()/join()).  Iterating the dict
    # yields its keys, so no manual key-collecting loop is needed.
    with Pool(processes=6) as pool:
        pool.map(processing_goods_pages, list(dict1))

您正在创建 6 个进程来处理 14 个 URL——到目前为止一切顺利。但随后,池中的每个进程在处理 URL 时,都会为从文件中读取到的每一个 link 各启动一次无头 Chrome 浏览器。我不知道平均每个 URL 要处理多少个 link,也无法断定如此频繁地打开和关闭 Chrome 就是最终变慢的原因。但在我看来,如果您想要 6 路的并行度,那么您根本不需要启动超过 6 个 Chrome 会话。然而,要实现这一点,需要进行一些代码重构。

我要指出的第一点是,这项工作或许也可以用多线程代替多处理来完成。BeautifulSoup 的 lxml 解析器确实要做一些 CPU 密集型工作,但我怀疑与启动 6 个 Chrome 实例并抓取 URL 页面的开销相比,它微不足道——尤其是每次抓取 URL 之后还有 1 秒的硬编码等待(稍后会详细说明)。

想法是在线程本地存储中为多线程池中的每个线程保存其当前打开的 Chrome 驱动程序,并且在程序结束之前永远不对驱动程序调用 quit。函数 openChrome_headless 中的逻辑现在需要移动到一个新的专用函数 create_driver 中,该函数可以被 processing_goods_pages 调用以获取当前线程的 Chrome 驱动程序(如果当前没有,则创建一个)。但这意味着 openChrome_headless 中的 URL-specific 代码现在需要移动到 processing_goods_pages。

最后,删除线程本地存储并运行垃圾收集器,以确保 Driver 类所有实例的析构函数都被调用,从而让所有 Chrome 驱动程序实例都"退出"。

由于我无权访问您的文件,这显然无法彻底测试,因此可能存在一两处拼写错误。祝您好运。

进一步说明:与其在 driver.get(ref) 调用之后调用 sleep(1),不如考虑改为调用 driver.implicitly_wait(1),然后让驱动程序去定位某个元素——如果可行的话,该元素的存在即可保证页面上需要写出的所有内容都已加载完毕。这样,您只需等待 link 出现所需的最短时间。当然,如果初始页面加载后 DOM 不会再被 AJAX 调用修改,那就根本不需要等待。

import os
from time import sleep
from bs4 import BeautifulSoup
from selenium import webdriver
# Use multithreading instead of multiprocessing
from multiprocessing.pool import ThreadPool
import threading

# Required variables: base site URL and paths to the bundled ChromeDriver.
url = "https://eldorado.ua/"
directory = os.path.dirname(os.path.realpath(__file__))
# os.path.join avoids the invalid "\c" escape sequences produced by
# literals such as directory + "\chromedriver" on Python 3.
env_path = os.path.join(directory, "chromedriver")
chromedriver_path = os.path.join(env_path, "chromedriver.exe")

class Driver:
    """Owns exactly one headless Chrome WebDriver instance.

    One Driver is stored per pool thread (in thread-local storage, see
    create_driver), so the browser is reused across all URLs that thread
    handles instead of being relaunched per page.
    """

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.headless = True
        # presumably hides the "controlled by automated software" infobar
        # — verify against Chrome's switch documentation
        options.add_experimental_option("excludeSwitches", ['enable-automation'])
        options.add_argument(
            '--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36"')
        self.driver = webdriver.Chrome(executable_path=chromedriver_path, options=options)

    def __del__(self):
        # NOTE(review): relies on the garbage collector invoking __del__;
        # finalizers are not guaranteed to run at interpreter shutdown —
        # confirm the Chrome processes actually exit in practice.
        self.driver.quit() # clean up driver when we are cleaned up
        #print('The driver has been "quitted".')

# One Driver wrapper per pool thread lives in this thread-local storage.
threadLocal = threading.local()

def create_driver():
    """Return the calling thread's Chrome driver, creating it on first use."""
    try:
        holder = threadLocal.the_driver
    except AttributeError:
        # First call on this thread: launch a browser and remember it.
        holder = Driver()
        threadLocal.the_driver = holder
    return holder.driver


# Catalogue categories and the listing URL each one is scraped from.
_pairs = (
    ("Смартфоны и телефоны", "https://eldorado.ua/node/c1038944/"),
    ("Телевизоры и аудиотехника", "https://eldorado.ua/node/c1038957/"),
    ("Ноутбуки, ПК и Планшеты", "https://eldorado.ua/node/c1038958/"),
    ("Техника для кухни", "https://eldorado.ua/node/c1088594/"),
    ("Техника для дома", "https://eldorado.ua/node/c1088603/"),
    ("Игровая зона", "https://eldorado.ua/node/c1285101/"),
    ("Гаджеты и аксесуары", "https://eldorado.ua/node/c1215257/"),
    ("Посуда", "https://eldorado.ua/node/c1039055/"),
    ("Фото и видео", "https://eldorado.ua/node/c1038960/"),
    ("Красота и здоровье", "https://eldorado.ua/node/c1178596/"),
    ("Авто и инструменты", "https://eldorado.ua/node/c1284654/"),
    ("Спорт и туризм", "https://eldorado.ua/node/c1218544/"),
    ("Товары для дома и сада", "https://eldorado.ua/node/c1285161/"),
    ("Товары для детей", "https://eldorado.ua/node/c1085100/"),
)
dict1 = dict(_pairs)

def processing_goods_pages(name):
    """Download every product page linked from the saved brand pages of
    category *name*, reusing this thread's single Chrome driver.

    Fixes two bugs in the original:
    * paths were built with literals like "...\\brand_pages\\..." where
      "\\b" is the BACKSPACE escape character, corrupting the path —
      os.path.join is used instead;
    * the loop reassigned ``name`` (the category argument) to the output
      file path, so every later iteration built paths from a stale full
      path — a separate variable holds the output path now.
    """
    brand_dir = os.path.join(directory, "brand_pages", name)
    for n in os.listdir(brand_dir):
        with open(os.path.join(brand_dir, n), encoding="utf-8") as file:
            soup = BeautifulSoup(file.read(), "lxml")

        # n[:-5] strips the ".html" suffix of the saved brand page.
        page_dir = os.path.join(directory, "goods_pages", name, n[:-5])
        os.makedirs(page_dir, exist_ok=True)

        links = soup.find_all("header", class_="good-description")
        driver = create_driver()  # one persistent browser per thread
        for li in links:
            ref = url + li.find('a').get('href')
            print(li.text)
            driver.get(ref)
            sleep(1)  # TODO: prefer implicitly_wait + locating a sentinel element
            out_path = os.path.join(page_dir, li.text)
            try:
                with open(out_path + ".html", "w", encoding="utf-8") as file:
                    file.write(driver.page_source)
            except Exception as ex:
                # Best effort: log the failed save and continue.
                print(ex)

if __name__ == "__main__":
    # The with-block terminates the pool, so all worker threads are done
    # before the drivers are cleaned up below (the original never closed
    # the pool).
    with ThreadPool(processes=6) as pool:
        pool.map(processing_goods_pages, dict1.keys())
    # Quit all the Selenium drivers: dropping the only reference to the
    # thread-local storage lets Driver.__del__ run for every instance.
    del threadLocal
    import gc
    gc.collect()  # a little extra insurance