python looping fast through links
import requests
import json
from tqdm import tqdm
List of links to loop through:
links =['https://www.google.com/','https://www.google.com/','https://www.google.com/']
For loop through the links using requests:
data = []
for link in tqdm(range(len(links))):
    response = requests.get(links[link])
    response = response.json()
    data.append(response)
The for loop above iterates over the whole list of links, but when I try to loop through about a thousand links it becomes very time-consuming. Any help?
As suggested in the comments, you can use asyncio and aiohttp.
import asyncio
import aiohttp

urls = ["your", "links", "here"]
# create aio connector
conn = aiohttp.TCPConnector(limit_per_host=100, limit=0, ttl_dns_cache=300)
# set number of parallel requests - if you are requesting different domains you are likely to be able to set this higher, otherwise you may be rate limited
PARALLEL_REQUESTS = 10
# Create results list to collect results
results = []

async def gather_with_concurrency(n):
    # Create semaphore to cap concurrent requests
    semaphore = asyncio.Semaphore(n)
    # create an aiohttp session using the previous connector
    session = aiohttp.ClientSession(connector=conn)

    # await logic for get request
    async def get(url):
        async with semaphore:
            async with session.get(url, ssl=False) as response:
                obj = await response.read()
                # once the body is read, append it to the results list
                results.append(obj)

    # wait for all requests to finish and then close the session
    await asyncio.gather(*(get(url) for url in urls))
    await session.close()

# get async event loop
loop = asyncio.get_event_loop()
# run using number of parallel requests
loop.run_until_complete(gather_with_concurrency(PARALLEL_REQUESTS))
# Close connection
conn.close()

# loop through results and do something to them
for res in results:
    do_something(res)
I've commented the code as best I can. I've parsed the requests this way with BS4 (in the do_something logic), but it really depends on your use case.
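For reference, a minimal sketch of what such a BS4-based do_something could look like; the title extraction is only a placeholder, and the names here are not from the original answer:

from bs4 import BeautifulSoup

def do_something(raw_body):
    # raw_body is one of the bytes objects appended to results above
    soup = BeautifulSoup(raw_body, "html.parser")
    # extract whatever your use case needs; printing the page title is just an example
    print(soup.title.string if soup.title else None)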
The easiest way is to make it multithreaded. The best way is probably asynchronous.
Multithreaded solution:
import requests
from tqdm.contrib.concurrent import thread_map
links =['https://www.google.com/','https://www.google.com/','https://www.google.com/']
def get_data(url):
    response = requests.get(url)
    response = response.json()  # Do note this might fail at times
    return response
data = thread_map(get_data, links)
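thread_map also accepts a max_workers argument, which it passes through to the underlying ThreadPoolExecutor, so you can tune how many requests run at once (32 below is just an illustrative value):

# cap the number of worker threads; tune this to the target server's rate limits
data = thread_map(get_data, links, max_workers=32)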
Or without tqdm.contrib.concurrent.thread_map:
import requests
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
links =['https://www.google.com/','https://www.google.com/','https://www.google.com/']
def get_data(url):
    response = requests.get(url)
    response = response.json()  # Do note this might fail at times
    return response
executor = ThreadPoolExecutor()
data = list(tqdm(executor.map(get_data, links), total=len(links)))
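Since response.json() can fail on non-JSON or error responses (as the comment above notes), a hedged variant of get_data that catches those failures could look like this; returning None for a failed link is just one possible choice:

import requests

def get_data(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError):
        # the request failed or the body was not valid JSON
        return None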