How can I speed up my aiohttp + bs4 parser?
The task is to get data from a site. I have 800 URLs to request, but it takes a very long time. I'm using aiohttp. At this stage I get the links, and by following each of those links I get more links. I applied aiohttp, but the code is still slow: 390.9560036659241 seconds. Sorry if this is a simple question, but I have little experience with asyncio, so any help would be appreciated. Thanks.
```python
import json
import time
import requests
from bs4 import BeautifulSoup
import datetime
import csv
import asyncio
import aiohttp

iso_data = []
iso_list = []
iso_catalogue = []
iso_links = ''

start_time = time.time()


async def get_page_data(session, url):  # get the 256 links from the main page
    url = "https://www.iso.org/standards-catalogue/browse-by-tc.html"
    async with session.get(url=url) as response:
        response_text = await response.text()
        soup = BeautifulSoup(response_text, "lxml")
        iso_link = soup.find("tbody")
        for iso in iso_link.find_all("tr"):
            iso_url = iso.find('a').attrs['href']
            d = iso.find('a').text
            m = iso.find('td', {'data-title': 'Title'}).text
            try:
                level_2 = (f'{d}{m}').strip()
            except:
                level_2 = "nothing"
            iso_links = f'https://www.iso.org{iso_url}'
            iso_list.append(iso_links)
            iso_data.append({'level_1': 'tc', 'level_2': level_2})
    return iso_list


async def collect_data():  # get the 800 links
    async with aiohttp.ClientSession() as session:
        for i in iso_list:
            response = await session.get(url=i)
            soup = BeautifulSoup(await response.text(), "lxml")
            row = soup.find_all('td', attrs={'data-title': 'Subcommittee'})
            if row:
                for el in row:
                    a = el.find('a').attrs['href']
                    iso_catalogue.append(f'https://www.iso.org{a}')
            else:
                iso_catalogue.append(iso_links)
    return iso_catalogue


async def gather_data():
    url = "https://www.iso.org/standards-catalogue/browse-by-tc.html"
    async with aiohttp.ClientSession() as session:
        response = await session.get(url=url)
        soup = BeautifulSoup(await response.text(), "lxml")
        tasks = []
        task = asyncio.create_task(get_page_data(session, url))
        tasks.append(task)
        await asyncio.gather(*tasks)


async def worker_iso(q):
    for urls in out:
        while True:
            response = await q.get(urls)
            soup = BeautifulSoup(await response.text(), "lxml")
            for i in soup.find_all('tr', {'ng-show': 'pChecked || pChecked == null'}):
                a1 = i.find('a').attrs['href']
                iso_standarts = f'https://www.iso.org{a1}'
                iso_standart.append(iso_standarts)
            q.task_done()


def main():
    asyncio.run(gather_data())
    asyncio.run(collect_data())
    cur_time = datetime.datetime.now().strftime("%d_%m_%Y_%H_%M")
    finish_time = time.time() - start_time
    print(f"Spend time: {finish_time}")


if __name__ == "__main__":
    main()
```
I modified your example slightly, based on the question. Right now you open the 256 links from the main page serially, which is why it takes so long.
In my example I create 16 workers (coroutines) that share one queue. The workers then wait for new values I put into the queue and process the requests.
My computer opened and processed the 256 pages in ~19 seconds:
```python
import tqdm  # <-- I use this for nice progress bar/timing
import asyncio
import aiohttp
from bs4 import BeautifulSoup

out = []


async def get_soup(session, url):
    async with session.get(url=url) as resp:
        return BeautifulSoup(await resp.text(), "lxml")


async def worker(session, q):
    while True:
        url, link_name, title = await q.get()
        soup = await get_soup(session, url)
        links = soup.select('[data-title="Subcommittee"] a')
        if links:
            for a in links:
                out.append("https://www.iso.org" + a["href"])
        else:
            out.append(url)
        q.task_done()


async def main():
    url = "https://www.iso.org/standards-catalogue/browse-by-tc.html"

    async with aiohttp.ClientSession() as session:
        soup = await get_soup(session, url)

        titles = soup.select('td[data-title="Title"]')
        links = soup.select('td[data-title="Committee"] a')

        committees = []
        for a, t in zip(links, titles):
            committees.append(
                [
                    "https://www.iso.org" + a["href"],
                    a.get_text(strip=True),
                    t.get_text(strip=True),
                ]
            )

        queue = asyncio.Queue(maxsize=16)
        tasks = []

        # create 16 workers that will process data in parallel
        for i in range(16):
            task = asyncio.create_task(worker(session, queue))
            tasks.append(task)

        # put some data to worker queue
        for c in tqdm.tqdm(committees):
            await queue.put(c)

        # wait for all data to be processed
        await queue.join()

        # cancel all worker tasks
        for task in tasks:
            task.cancel()

        # Wait until all worker tasks are cancelled.
        await asyncio.gather(*tasks, return_exceptions=True)

        print(len(out))


if __name__ == "__main__":
    asyncio.run(main())
```
Prints:
100%|██████████████████████████████████████████████████████████████████| 256/256 [00:19<00:00, 13.18it/s]
653
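A note on the design: the fixed pool of 16 workers pulling from one queue is what bounds the number of requests in flight, so the site never sees all 256 requests at once. If you prefer not to manage worker tasks yourself, the same bound can be expressed with `asyncio.Semaphore` plus `asyncio.gather`. Below is a minimal sketch of that alternative, not the code from the answer above; the helper names (`fetch_subcommittees`, `collect`) and the `committee_urls` variable in the usage comment are made up for illustration, while the CSS selector and the limit of 16 are carried over from the answer.

```python
import asyncio

import aiohttp
from bs4 import BeautifulSoup

BASE = "https://www.iso.org"


async def fetch_subcommittees(session, sem, url):
    # the semaphore bounds how many requests are in flight at any one time
    async with sem:
        async with session.get(url) as resp:
            soup = BeautifulSoup(await resp.text(), "lxml")
    links = soup.select('[data-title="Subcommittee"] a')
    # fall back to the page URL itself when there are no subcommittee links,
    # mirroring the else-branch of the worker above
    return [BASE + a["href"] for a in links] or [url]


async def collect(urls, limit=16):
    sem = asyncio.Semaphore(limit)
    async with aiohttp.ClientSession() as session:
        pages = await asyncio.gather(
            *(fetch_subcommittees(session, sem, u) for u in urls)
        )
    # flatten the per-page lists into one flat list of links
    return [link for page in pages for link in page]


# usage (hypothetical): out = asyncio.run(collect(committee_urls))
```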