为 selenium.webdriver 做 asyncio.to_thread 的正确方法
Proper way of doing asyncio.to_thread for selenium.webdriver
改用线程后,性能几乎没有提升。
我想我在这里遗漏了什么……
def take_page_scr(dict_item, driver) -> None:
    """Save a JPEG screenshot of ``dict_item['url']`` and quit ``driver``.

    Args:
        dict_item: Mapping with an ``'id'`` (printed as progress and used as
            the output file name) and a ``'url'`` to load.
        driver: WebDriver-like object. This function takes ownership of it
            and always calls ``driver.quit()`` before returning.

    Raises:
        Any exception raised while loading the page, capturing the
        screenshot or saving the file; it propagates after the driver has
        been shut down.
    """
    try:
        print(dict_item['id'])
        driver.get(dict_item['url'])
        driver.set_window_size(500, 900)
        png_bytes = driver.get_screenshot_as_png()
        # The screenshot arrives as PNG; convert to RGB before writing .jpg.
        image = Image.open(io.BytesIO(png_bytes)).convert("RGB")
        image.save(f"./dst/{dict_item['id']}.jpg", quality=85)
    finally:
        # Quit even when a step above fails, so a broken page does not
        # leave a browser process behind.
        driver.quit()
def main_async(data):
    """Screenshot every item in ``data`` concurrently using worker threads.

    Each item is processed by ``take_page_scr`` in a thread started through
    ``asyncio.to_thread``; the call blocks until all items are done and then
    prints a blank line followed by ``#DONE``.

    Args:
        data: Iterable of items accepted by ``take_page_scr``.
    """
    async def main(data):
        options = get_options()

        def capture(item):
            # Build the driver inside the worker thread. Writing
            # ``webdriver.Chrome(...)`` as a ``to_thread`` argument evaluates
            # it eagerly, so every browser would be started one after
            # another on the event-loop thread before any worker runs.
            take_page_scr(item, webdriver.Chrome(options=options))

        await asyncio.gather(
            *(asyncio.to_thread(capture, i) for i in data)
        )
        print()
        print('#DONE')

    asyncio.run(main(data))
# 13.397236824035645
# 13.26906943321228
这是作为对照基准的同步版本:
def main_sync(data):
    """Screenshot every item in ``data`` sequentially with one shared driver.

    A failure on one item is reported on stderr and the loop moves on to the
    next item. The driver is always quit at the end, after which a blank
    line and ``#DONE`` are printed.

    Args:
        data: Iterable of mappings with an ``'id'`` (printed and used as the
            output file name) and a ``'url'`` to load.
    """
    import sys

    options = get_options()
    driver = webdriver.Chrome(options=options)
    try:
        for i in data:
            print(i['id'])
            try:
                driver.get(i['url'])
                driver.set_window_size(500, 900)
                image_bytes = io.BytesIO(driver.get_screenshot_as_png())
                img = Image.open(image_bytes).convert("RGB")
                img.save(f"./dst/{i['id']}.jpg", quality=85)
            except Exception as exc:
                # Best effort: keep going, but do not hide the failure.
                print(f"failed to capture {i['id']}: {exc!r}", file=sys.stderr)
    finally:
        # Shut the browser down even if iterating ``data`` itself raises.
        driver.quit()
    print()
    print('#DONE')
# 16.04508686065674
# 16.138192653656006
我猜问题出在 webdriver.Chrome(options=options) 上(下面的版本为每个条目都新建一个浏览器实例):
def main_sync_bad():
    """Slow benchmark variant: start a new Chrome driver for every item.

    Reads the module-level ``data`` and hands each item, together with a
    freshly created driver, to ``take_page_scr``. Prints a blank line and
    ``#DONE`` when finished.
    """
    options = get_options()
    for item in data:
        fresh_driver = webdriver.Chrome(options=options)
        take_page_scr(item, fresh_driver)
    print()
    print('#DONE')
# 76.43093585968018
# 78.09915900230408
但我不知道如何把它分发到多个线程中去。
答案是:不要使用 lxml 或 selenium 这类设计糟糕的库,改用文档齐全、设计良好且有人维护的库(例如下面用到的 Playwright),即使它们来自 micro$oft……
async def coro(dict_item: TestData, browser: BrowserContext) -> None:
    """Open ``dict_item["url"]`` in a new page and save a JPEG screenshot.

    Args:
        dict_item: Mapping with a ``"hash"`` (printed as progress and used as
            the output file name under ``./res/``) and a ``"url"`` to visit.
        browser: Browser context in which the page is opened.

    Raises:
        Any exception raised while navigating or taking the screenshot; it
        propagates after the page has been closed.
    """
    print(dict_item["hash"])
    page = await browser.new_page()
    try:
        await page.goto(dict_item["url"])
        await page.screenshot(path=f'./res/{dict_item["hash"]}.jpg', type="jpeg")
    finally:
        # Close the page as soon as its screenshot is done (or has failed)
        # rather than keeping every tab open until the context closes.
        await page.close()
async def main() -> None:
    """Screenshot every item returned by ``data()`` concurrently.

    Launches a persistent Chromium context with a 500x900 viewport, runs one
    ``coro`` task per item and closes the context afterwards.
    """
    async with async_playwright() as p:
        browser: BrowserContext = await p.chromium.launch_persistent_context(
            user_data_dir=f"{Path.home()}/.config/chromium",
            executable_path="/usr/bin/chromium",
            viewport={
                "width": 500,
                "height": 900
            },
        )
        try:
            await asyncio.gather(*(coro(i, browser) for i in data()))
        finally:
            # Close the context even if one of the tasks raised.
            await browser.close()
# 7.495748519897461
# 2.4119105339050293
# 2.3972327709198
速度提升了 2-3 倍以上!而且用起来非常舒服!
改用线程后,性能几乎没有提升。
我想我在这里遗漏了什么……
def take_page_scr(dict_item, driver) -> None:
    """Save a JPEG screenshot of ``dict_item['url']`` and quit ``driver``.

    Args:
        dict_item: Mapping with an ``'id'`` (printed as progress and used as
            the output file name) and a ``'url'`` to load.
        driver: WebDriver-like object. This function takes ownership of it
            and always calls ``driver.quit()`` before returning.

    Raises:
        Any exception raised while loading the page, capturing the
        screenshot or saving the file; it propagates after the driver has
        been shut down.
    """
    try:
        print(dict_item['id'])
        driver.get(dict_item['url'])
        driver.set_window_size(500, 900)
        png_bytes = driver.get_screenshot_as_png()
        # The screenshot arrives as PNG; convert to RGB before writing .jpg.
        image = Image.open(io.BytesIO(png_bytes)).convert("RGB")
        image.save(f"./dst/{dict_item['id']}.jpg", quality=85)
    finally:
        # Quit even when a step above fails, so a broken page does not
        # leave a browser process behind.
        driver.quit()
def main_async(data):
    """Screenshot every item in ``data`` concurrently using worker threads.

    Each item is processed by ``take_page_scr`` in a thread started through
    ``asyncio.to_thread``; the call blocks until all items are done and then
    prints a blank line followed by ``#DONE``.

    Args:
        data: Iterable of items accepted by ``take_page_scr``.
    """
    async def main(data):
        options = get_options()

        def capture(item):
            # Build the driver inside the worker thread. Writing
            # ``webdriver.Chrome(...)`` as a ``to_thread`` argument evaluates
            # it eagerly, so every browser would be started one after
            # another on the event-loop thread before any worker runs.
            take_page_scr(item, webdriver.Chrome(options=options))

        await asyncio.gather(
            *(asyncio.to_thread(capture, i) for i in data)
        )
        print()
        print('#DONE')

    asyncio.run(main(data))
# 13.397236824035645
# 13.26906943321228
这是作为对照基准的同步版本:
def main_sync(data):
    """Screenshot every item in ``data`` sequentially with one shared driver.

    A failure on one item is reported on stderr and the loop moves on to the
    next item. The driver is always quit at the end, after which a blank
    line and ``#DONE`` are printed.

    Args:
        data: Iterable of mappings with an ``'id'`` (printed and used as the
            output file name) and a ``'url'`` to load.
    """
    import sys

    options = get_options()
    driver = webdriver.Chrome(options=options)
    try:
        for i in data:
            print(i['id'])
            try:
                driver.get(i['url'])
                driver.set_window_size(500, 900)
                image_bytes = io.BytesIO(driver.get_screenshot_as_png())
                img = Image.open(image_bytes).convert("RGB")
                img.save(f"./dst/{i['id']}.jpg", quality=85)
            except Exception as exc:
                # Best effort: keep going, but do not hide the failure.
                print(f"failed to capture {i['id']}: {exc!r}", file=sys.stderr)
    finally:
        # Shut the browser down even if iterating ``data`` itself raises.
        driver.quit()
    print()
    print('#DONE')
# 16.04508686065674
# 16.138192653656006
我猜问题出在 webdriver.Chrome(options=options) 上(下面的版本为每个条目都新建一个浏览器实例):
def main_sync_bad():
    """Slow benchmark variant: start a new Chrome driver for every item.

    Reads the module-level ``data`` and hands each item, together with a
    freshly created driver, to ``take_page_scr``. Prints a blank line and
    ``#DONE`` when finished.
    """
    options = get_options()
    for item in data:
        fresh_driver = webdriver.Chrome(options=options)
        take_page_scr(item, fresh_driver)
    print()
    print('#DONE')
# 76.43093585968018
# 78.09915900230408
但我不知道如何把它分发到多个线程中去。
答案是:不要使用 lxml 或 selenium 这类设计糟糕的库,改用文档齐全、设计良好且有人维护的库(例如下面用到的 Playwright),即使它们来自 micro$oft……
async def coro(dict_item: TestData, browser: BrowserContext) -> None:
    """Open ``dict_item["url"]`` in a new page and save a JPEG screenshot.

    Args:
        dict_item: Mapping with a ``"hash"`` (printed as progress and used as
            the output file name under ``./res/``) and a ``"url"`` to visit.
        browser: Browser context in which the page is opened.

    Raises:
        Any exception raised while navigating or taking the screenshot; it
        propagates after the page has been closed.
    """
    print(dict_item["hash"])
    page = await browser.new_page()
    try:
        await page.goto(dict_item["url"])
        await page.screenshot(path=f'./res/{dict_item["hash"]}.jpg', type="jpeg")
    finally:
        # Close the page as soon as its screenshot is done (or has failed)
        # rather than keeping every tab open until the context closes.
        await page.close()
async def main() -> None:
    """Screenshot every item returned by ``data()`` concurrently.

    Launches a persistent Chromium context with a 500x900 viewport, runs one
    ``coro`` task per item and closes the context afterwards.
    """
    async with async_playwright() as p:
        browser: BrowserContext = await p.chromium.launch_persistent_context(
            user_data_dir=f"{Path.home()}/.config/chromium",
            executable_path="/usr/bin/chromium",
            viewport={
                "width": 500,
                "height": 900
            },
        )
        try:
            await asyncio.gather(*(coro(i, browser) for i in data()))
        finally:
            # Close the context even if one of the tasks raised.
            await browser.close()
# 7.495748519897461
# 2.4119105339050293
# 2.3972327709198
速度提升了 2-3 倍以上!而且用起来非常舒服!