为 selenium.webdriver 做 asyncio.to_thread 的正确方法

Proper way of doing asyncio.to_thread for selenium.webdriver

性能几乎没有任何提升

我想我在这里遗漏了一些东西..

def take_page_scr(dict_item, driver) -> None:
    """Load a page in ``driver``, save a JPEG screenshot, then quit the driver.

    Args:
        dict_item: Mapping with an ``'id'`` (printed, and used as the output
            file stem under ``./dst/``) and a ``'url'`` to load.
        driver: WebDriver-like object providing ``get``, ``set_window_size``,
            ``get_screenshot_as_png`` and ``quit``. This function always
            quits it, whether or not the screenshot succeeds.

    Raises:
        Any exception raised while loading, capturing or saving propagates to
        the caller, after the driver has been quit.
    """
    try:
        print(dict_item['id'])
        driver.get(dict_item['url'])
        driver.set_window_size(500, 900)
        png_bytes = io.BytesIO(driver.get_screenshot_as_png())
        # JPEG cannot store an alpha channel, so convert to RGB before saving.
        image = Image.open(png_bytes).convert("RGB")
        image.save(f"./dst/{dict_item['id']}.jpg", quality=85)
    finally:
        # Quit even on failure so a failed page does not leave a browser
        # process running.
        driver.quit()


def main_async(data):
    """Screenshot every item in ``data`` concurrently using worker threads.

    Each item gets its own Chrome driver. The driver is constructed inside
    the worker thread: building it in the argument list of
    ``asyncio.to_thread`` would run ``webdriver.Chrome(...)`` on the
    event-loop thread, starting the browsers one after another before any
    worker begins.

    Args:
        data: Iterable of items accepted by ``take_page_scr``.
    """
    async def main(data):
        options = get_options()

        def shoot(item):
            # Executed in a worker thread, so browser start-up overlaps with
            # the other items' work instead of blocking the event loop.
            take_page_scr(item, webdriver.Chrome(options=options))

        await asyncio.gather(
            *(asyncio.to_thread(shoot, i) for i in data)
        )
        print()
        print('#DONE')

    asyncio.run(main(data))
    # Timings recorded by the author for the earlier version, which built
    # every driver on the event-loop thread (presumably seconds):
    # 13.397236824035645
    # 13.26906943321228

这是基本设置

def main_sync(data):
    """Screenshot every item in ``data`` sequentially with one shared driver.

    Processing is best-effort: a failure on one item is reported on stderr
    and the loop moves on to the next item. The driver is always quit, even
    if the loop is interrupted.

    Args:
        data: Iterable of mappings with an ``'id'`` (printed, and used as the
            output file stem under ``./dst/``) and a ``'url'`` to load.
    """
    import sys

    options = get_options()
    driver = webdriver.Chrome(options=options)
    try:
        for i in data:
            print(i['id'])
            try:
                driver.get(i['url'])
                driver.set_window_size(500, 900)
                image_bytes = io.BytesIO(driver.get_screenshot_as_png())
                img = Image.open(image_bytes).convert("RGB")
                img.save(f"./dst/{i['id']}.jpg", quality=85)
            except Exception as exc:
                # Keep going, but do not hide the failure.
                print(
                    f"screenshot failed for {i['id']!r}: {exc!r}",
                    file=sys.stderr,
                )
    finally:
        driver.quit()
    print()
    print('#DONE')
    # 16.04508686065674
    # 16.138192653656006

我猜问题出在 webdriver.Chrome(options=options) 上

def main_sync_bad():
    """Slow baseline: start a new Chrome driver for every item, one by one.

    Reads the module-level ``data`` and passes each entry, together with a
    newly constructed driver, to ``take_page_scr``.
    """
    chrome_options = get_options()
    for entry in data:
        fresh_driver = webdriver.Chrome(options=chrome_options)
        take_page_scr(entry, fresh_driver)
    print()
    print('#DONE')
    # 76.43093585968018
    # 78.09915900230408

但我不知道如何把它分摊到多个线程上

答案是——不要使用像 lxml 或 selenium 这样设计糟糕的东西,使用文档齐全、设计良好且受支持的库,即使它们来自 micro$oft ...

async def coro(dict_item: TestData, browser: BrowserContext) -> None:
    """Open a page in ``browser``, save a JPEG screenshot, then close the page.

    Args:
        dict_item: Mapping with a ``"hash"`` (printed, and used as the output
            file stem under ``./res/``) and a ``"url"`` to navigate to.
        browser: Context-like object whose ``new_page()`` coroutine returns a
            page providing ``goto``, ``screenshot`` and ``close``.

    Raises:
        Any exception from navigation or capture propagates to the caller,
        after the page has been closed.
    """
    print(dict_item["hash"])
    page = await browser.new_page()
    try:
        await page.goto(dict_item["url"])
        await page.screenshot(path=f'./res/{dict_item["hash"]}.jpg', type="jpeg")
    finally:
        # Close the page on success and on failure, so pages do not pile up
        # in the shared context.
        await page.close()


async def main() -> None:
    """Screenshot every item from ``data()`` concurrently in one Chromium context.

    Launches a persistent Chromium context with a 500x900 viewport, runs
    ``coro`` for each item via ``asyncio.gather``, and closes the context
    afterwards, including when one of the tasks raises.
    """
    async with async_playwright() as p:
        browser: BrowserContext = await p.chromium.launch_persistent_context(
            user_data_dir=f"{Path.home()}/.config/chromium",
            executable_path="/usr/bin/chromium",
            viewport={
                "width": 500,
                "height": 900
            },
        )
        try:
            await asyncio.gather(*(coro(i, browser) for i in data()))
        finally:
            # Close explicitly even on failure, so the context is not left
            # open if a task raises.
            await browser.close()
    # 7.495748519897461
    # 2.4119105339050293
    # 2.3972327709198

速度提高 2-3 倍以上!和它一起工作只是纯粹的乐趣..!