Using asyncio to recursively gather links from sub-directories in a webpage
I am trying to write a program that gets all of the links on a webpage, including the links inside its sub-directories. I have this working with the requests package, but it is slow when links have to be collected from many sub-directories. Here is my working code, which takes about 4 minutes to gather all of the links from https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/.
import requests
import re
from bs4 import BeautifulSoup

def get_html(base_url):
    req = requests.get(base_url)
    return req.text if (req.status_code == 200) else ''

def get_links(html_page):
    soup = BeautifulSoup(html_page, "html.parser")
    regex = r'(.nc$)|(/$)'
    links = [f"{base_url}{link.get('href')}" for link in soup.findAll('a', attrs={'href': re.compile(regex)})]
    return links

def get_sub_dirs(links):
    sub_dirs = [link for link in links if re.search(r'/$', link)]
    return sub_dirs

def get_files(links):
    file_links = [link for link in links if re.search(r'.nc$', link)]
    return file_links

def main(base_url):
    files = []
    html_page = get_html(base_url)
    links = get_links(html_page)
    sub_dirs = get_sub_dirs(links)
    base_files = get_files(links)
    files.append(base_files)
    for sub in sub_dirs:
        sub_files = main(sub)
        files.append(sub_files)
    return files

# Run program
base_url = 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/'
files = main(base_url)
I think the bottleneck in the code is the get_html() function, which takes a few seconds to fetch the HTML. I believe this code could be sped up with async functions, but I am struggling to get it working. Here is my attempt at an async version of the code:
import aiohttp
import asyncio
import re
from bs4 import BeautifulSoup

async def get_html_async(base_url):
    async with aiohttp.ClientSession() as client:
        async with client.get(base_url) as resp:
            return await resp.text() if (resp.status == 200) else ''

def get_links(html_page):
    soup = BeautifulSoup(html_page, "html.parser")
    regex = r'(.nc$)|(/$)'
    links = [f"{base_url}{link.get('href')}" for link in soup.findAll('a', attrs={'href': re.compile(regex)})]
    return links

def get_sub_dirs(links):
    sub_dirs = [link for link in links if re.search(r'/$', link)]
    return sub_dirs

def get_files(links):
    file_links = [link for link in links if re.search(r'.nc$', link)]
    return file_links

async def get_tasks(session):
    async with aiohttp.ClientSession() as client:
        async with client.get(url) as resp:
            return await resp.text() if (resp.status == 200) else ''

async def main(base_url):
    files = []
    html_page = await asyncio.gather(get_html_async(base_url))
    links = get_links(html_page[0])
    sub_dirs = get_sub_dirs(links)
    base_files = get_files(links)
    files.append(base_files)
    for sub in sub_dirs:
        sub_files = await asyncio.gather(main(sub))
        files.append(sub_files)
    return files

# Run program
base_url = 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/'
files = asyncio.gather(main(base_url))
Any help would be greatly appreciated. Thanks!
By calling asyncio.gather() the way you do, you still run your requests sequentially, just like before. asyncio.gather() takes multiple awaitables as arguments in order to run them concurrently. Calling asyncio.gather() with only a single awaitable is pointless, because then you could simply await it directly. By creating all the coros in main() without awaiting them and then passing them all to asyncio.gather() at once, you get a significant speedup.
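A minimal sketch of the difference (an illustrative addition, not part of the original answer; the fetch() coroutine is a hypothetical stand-in for the real HTTP request):

import asyncio

async def fetch(url):
    await asyncio.sleep(1)  # stand-in for network latency
    return url

async def sequential(urls):
    # awaiting inside the loop runs one request at a time:
    # total time is roughly the sum of the individual request times
    return [await fetch(url) for url in urls]

async def concurrent(urls):
    # create all coroutines first, then let gather() run them concurrently:
    # total time is roughly that of the slowest single request
    return await asyncio.gather(*(fetch(url) for url in urls))

# asyncio.run(sequential(["a/", "b/", "c/"])) takes about 3 s,
# while asyncio.run(concurrent(["a/", "b/", "c/"])) takes about 1 s.

The corrected crawler below applies the same pattern: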
# some minor fixes added
import asyncio
import re
from itertools import chain

import aiohttp
from bs4 import BeautifulSoup

async def get_html_async(base_url):
    async with aiohttp.ClientSession(
        connector=aiohttp.TCPConnector(ssl=False)  # I got ssl errors on my machine
    ) as client:
        async with client.get(base_url) as resp:
            return await resp.text() if (resp.status == 200) else ""

def get_links(html_page):
    soup = BeautifulSoup(html_page, "lxml")  # removed "html.parser"
    regex = r"(.nc$)|(/$)"
    links = [
        f"{base_url}{link.get('href')}"
        for link in soup.findAll("a", attrs={"href": re.compile(regex)})
    ]
    return links

def get_sub_dirs(links):
    sub_dirs = [link for link in links if re.search(r"/$", link)]
    return sub_dirs

def get_files(links):
    file_links = [link for link in links if re.search(r".nc$", link)]
    return file_links

async def main(base_url):
    files = []
    html_page = await get_html_async(base_url)
    links = get_links(html_page)  # removed indexing 'html_page[0]'
    sub_dirs = get_sub_dirs(links)
    base_files = get_files(links)
    files.extend(base_files)  # extend list to get "cleaner" output, keep using 'append' if your downstream code requires it
    coros = [main(sub) for sub in sub_dirs]  # create all requests
    new_files = await asyncio.gather(*coros)  # run all requests concurrently
    files.extend(chain(*new_files))  # again, add to list as needed
    return files

# Run program
base_url = "https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/"
files = asyncio.run(main(base_url)) # or simply 'await main(base_url)' in IPython
print(files)
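A further refinement to consider (a sketch of my own under stated assumptions, not part of the answer above): reuse one ClientSession for the whole crawl instead of opening a new session per request, and cap the number of in-flight requests with an asyncio.Semaphore so that deeply nested directories do not flood the server. The helper names (fetch_html, crawl, crawl_all) and the limit of 10 concurrent requests are illustrative choices; note also that relative hrefs are joined to the directory currently being crawled rather than to the global base_url.

import asyncio
import re
from itertools import chain

import aiohttp
from bs4 import BeautifulSoup

async def fetch_html(client, sem, url):
    # the semaphore caps how many requests are in flight at any one time
    async with sem:
        async with client.get(url) as resp:
            return await resp.text() if resp.status == 200 else ""

async def crawl(client, sem, url):
    html_page = await fetch_html(client, sem, url)
    soup = BeautifulSoup(html_page, "lxml")
    hrefs = [
        a.get("href")
        for a in soup.findAll("a", attrs={"href": re.compile(r"(\.nc$)|(/$)")})
    ]
    files = [f"{url}{href}" for href in hrefs if href.endswith(".nc")]
    sub_dirs = [f"{url}{href}" for href in hrefs if href.endswith("/")]
    nested = await asyncio.gather(*(crawl(client, sem, sub) for sub in sub_dirs))
    files.extend(chain(*nested))
    return files

async def crawl_all(base_url, max_in_flight=10):
    sem = asyncio.Semaphore(max_in_flight)  # illustrative limit
    async with aiohttp.ClientSession() as client:  # one session reused for every request
        return await crawl(client, sem, base_url)

# files = asyncio.run(crawl_all(base_url))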