Using pyppeteer in a continuous scraping mode
Every example and use case I can find uses pyppeteer with the browser opened and then immediately closed. For example:

import asyncio
from pyppeteer import launch

async def main():
    browser = await launch()
    page = await browser.newPage()
    await page.goto('http://someurl')
    content = await page.content()
    cookieslist = await page.cookies()
    cookiejar = createCookieJar(cookieslist)  # createCookieJar: helper defined elsewhere
    await browser.close()

asyncio.get_event_loop().run_until_complete(main())
What if you want to keep the browser open and scrape data continuously? That is easy to do with selenium, but with pyppeteer it does not seem to work without asyncio. Another way to make it work would be to save the session and reopen the browser and scrape on a schedule, but that feels like a very inefficient approach. Has anyone tried this?
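For reference, the "save the session and reopen" workaround would look roughly like this (a rough, untested sketch: cookies.json is a hypothetical file name, and it assumes cookies alone are enough to restore the session):

import asyncio
import json

from pyppeteer import launch


async def scrape_once(url):
    browser = await launch()
    page = await browser.newPage()
    try:
        with open("cookies.json") as f:       # restore the previous session, if any
            await page.setCookie(*json.load(f))
    except FileNotFoundError:
        pass                                  # first run: nothing saved yet
    await page.goto(url)
    html = await page.content()
    with open("cookies.json", "w") as f:      # persist the session for the next run
        json.dump(await page.cookies(), f)
    await browser.close()
    return html

# Run on a schedule, paying the browser start-up cost every time:
# asyncio.get_event_loop().run_until_complete(scrape_once('http://someurl'))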
You can use an asyncio.Queue and keep pumping data into it:
import asyncio
import traceback
from contextlib import suppress

from pyppeteer import launch

WORKERS = 10
URLS = [
    "http://airbnb.com",
    "http://amazon.co.uk",
    "http://amazon.com",
    "http://bing.com",
    "http://djangoproject.com",
    "http://envato.com",
    "http://facebook.com",
    "http://github.com",
    "http://google.co.uk",
    "http://google.com",
    "http://google.es",
    "http://google.fr",
    "http://heroku.com",
    "http://instagram.com",
    "http://linkedin.com",
    "http://live.com",
    "http://netflix.com",
    "http://rubyonrails.org",
    "http://shopify.com",
    "http://whosebug.com",
    "http://trello.com",
    "http://wordpress.com",
    "http://yahoo.com",
    "http://yandex.ru",
    "http://yiiframework.com",
    "http://youtube.com",
]


async def worker(q, browser):
    # One tab per worker
    page = await browser.newPage()
    with suppress(asyncio.CancelledError):
        while True:
            url = await q.get()
            try:
                await page.goto(url, {"timeout": 10000})  # 10 s per-page timeout
                html = await page.content()
            except Exception:
                traceback.print_exc()
            else:
                print(f"{url}: {len(html)}")
            finally:
                q.task_done()
    await page.close()


async def main():
    q = asyncio.Queue()
    browser = await launch(headless=True, args=["--no-sandbox"])

    # Start the workers; they all share the one browser instance.
    tasks = []
    for _ in range(WORKERS):
        tasks.append(asyncio.create_task(worker(q, browser)))

    # Feed the queue, wait for it to drain, then shut the workers down.
    for url in URLS:
        await q.put(url)
    await q.join()

    for task in tasks:
        task.cancel()
    await asyncio.gather(*tasks, return_exceptions=True)

    await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
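To address the original question of keeping the browser open and scraping continuously, the same worker can be reused: instead of draining the queue once and tearing everything down, keep the browser alive and keep feeding the queue. A rough sketch built on the code above (producer, main_forever and SCRAPE_INTERVAL are illustrative names, not part of the answer):

SCRAPE_INTERVAL = 60  # seconds between scraping rounds (arbitrary choice)


async def producer(q):
    # Re-enqueue the same URLs forever; a real scraper might read them
    # from a database or an API instead.
    while True:
        for url in URLS:
            await q.put(url)
        await q.join()                     # wait until this round is fully processed
        await asyncio.sleep(SCRAPE_INTERVAL)


async def main_forever():
    q = asyncio.Queue()
    browser = await launch(headless=True, args=["--no-sandbox"])
    workers = [asyncio.create_task(worker(q, browser)) for _ in range(WORKERS)]
    try:
        await producer(q)                  # runs until cancelled (e.g. Ctrl+C)
    finally:
        for task in workers:
            task.cancel()
        await asyncio.gather(*workers, return_exceptions=True)
        await browser.close()

# asyncio.run(main_forever())

The browser is launched once and each worker keeps its tab open across rounds, which avoids the start-up cost that makes the "reopen on a schedule" approach feel inefficient.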