在 Python Asyncio 中限制异步函数
Throttling Async Functions in Python Asyncio
我想将一个由 awaitable 组成的 list 传递给 asyncio.AbstractEventLoop,但我需要限制对第三方 API 的请求速率。
我想避免采用“先等待、再把 future 传递给循环”的做法,因为在等待期间事件循环会被我阻塞。我有哪些选择?Semaphore 和 ThreadPool 只能限制同时运行的数量,但这不是我的问题。我需要将请求速率限制为 100 次/秒,至于完成一个请求需要多长时间并不重要。
这是一个使用标准库的非常简洁的(非)工作示例,它演示了问题。这应该以 100/秒的速度节流,但以 116.651/秒的速度节流。 在 asyncio 中限制异步请求调度的最佳方法是什么?
工作代码:
import asyncio
from threading import Lock
class PTBNL:
    """Demo client that pairs request ids with futures and throttles a batch.

    Every request gets a sequential id and a future stored in ``_futures``;
    ``end_req`` resolves that future.  ``req_data_batch_async`` asks the token
    bucket how long each request has to wait and schedules it with
    ``loop.call_later``.
    """

    def __init__(self):
        self._req_id_seq = 0   # next request id to hand out
        self._futures = {}     # request id -> pending future
        self._results = {}     # request id -> accumulated result
        self.token_bucket = TokenBucket()
        self.token_bucket.set_rate(100)

    def run(self, *awaitables):
        """Drive the event loop.

        With no arguments the loop runs forever; with one awaitable its
        result is returned; with several, the list of their results (via
        ``asyncio.gather``) is returned.
        """
        loop = asyncio.get_event_loop()
        if not awaitables:
            loop.run_forever()
        elif len(awaitables) == 1:
            return loop.run_until_complete(*awaitables)
        else:
            future = asyncio.gather(*awaitables)
            return loop.run_until_complete(future)

    def sleep(self, secs) -> bool:
        """Run the event loop for ``secs`` seconds, then return ``True``."""
        self.run(asyncio.sleep(secs))
        return True

    def get_req_id(self) -> int:
        """Return the next sequential request id (starting at 0)."""
        new_id = self._req_id_seq
        self._req_id_seq += 1
        return new_id

    def start_req(self, key):
        """Create, register and return the future for request ``key``."""
        loop = asyncio.get_event_loop()
        future = loop.create_future()
        self._futures[key] = future
        return future

    def end_req(self, key, result=None):
        """Resolve the future registered for ``key``, if there is one.

        When ``result`` is ``None`` the value stored in ``_results`` for the
        key is used, falling back to an empty list.
        """
        future = self._futures.pop(key, None)
        if future:
            if result is None:
                result = self._results.pop(key, [])
            if not future.done():
                future.set_result(result)

    def req_data(self, req_id, obj):
        """Perform the request (placeholder) and mark it as finished."""
        # Do Some Work Here
        self.req_data_end(req_id)

    def req_data_end(self, req_id):
        """Report that ``req_id`` finished and resolve its future."""
        print(req_id, " has ended")
        self.end_req(req_id)

    async def req_data_async(self, obj):
        """Issue a single request and return its result."""
        req_id = self.get_req_id()
        future = self.start_req(req_id)
        self.req_data(req_id, obj)
        await future
        return future.result()

    async def req_data_batch_async(self, contracts):
        """Schedule one throttled request per item and wait for all of them.

        Returns:
            A ``(futures, rate)`` tuple: the completed futures in input order
            and the observed throughput in requests per second (``0.0`` for
            an empty batch).
        """
        loop = asyncio.get_event_loop()
        futures = []
        # Bind the start time before the loop so that an empty batch no
        # longer fails with UnboundLocalError.
        start = loop.time()
        for contract in contracts:
            req_id = self.get_req_id()
            futures.append(self.start_req(req_id))
            # consume() does not block; it returns the delay to wait.
            nap = self.token_bucket.consume(1)
            loop.call_later(nap, self.req_data, req_id, contract)
        await asyncio.gather(*futures)
        elapsed = loop.time() - start
        if not futures:
            rate = 0.0
        elif elapsed <= 0:
            # Coarse clocks can report no elapsed time for tiny batches.
            rate = float("inf")
        else:
            rate = len(futures) / elapsed
        return futures, rate
class TokenBucket:
    """Token bucket that reports how long a caller must wait for its tokens.

    The bucket refills at ``rate`` tokens per second and holds at most
    ``rate`` tokens.  ``consume`` never blocks: it lets the balance go
    negative and returns the number of seconds after which that debt is
    repaid, so the caller can delay the work itself (for example with
    ``loop.call_later``).
    """

    def __init__(self, clock=None):
        """Create a bucket with rate 0, which disables throttling.

        Args:
            clock: Optional zero-argument callable returning the current time
                in seconds.  Defaults to ``time.monotonic`` so that building
                a bucket does not require (or implicitly create) an event
                loop.
        """
        if clock is None:
            # Imported here because the surrounding file only imports
            # asyncio and threading.Lock.
            import time
            clock = time.monotonic
        self._clock = clock
        self.tokens = 0
        self.rate = 0
        self.last = clock()
        self.lock = Lock()

    def set_rate(self, rate):
        """Set the refill rate (tokens per second) and fill the bucket."""
        with self.lock:
            self.rate = rate
            self.tokens = self.rate

    def consume(self, tokens):
        """Take ``tokens`` from the bucket.

        Returns:
            ``0`` when the tokens were available (or the rate is 0),
            otherwise the delay in seconds until the deficit is refilled.
        """
        with self.lock:
            if not self.rate:
                return 0
            now = self._clock()
            lapse = now - self.last
            self.last = now
            # Refill for the elapsed time, capped at the bucket capacity.
            self.tokens = min(self.tokens + lapse * self.rate, self.rate)
            self.tokens -= tokens
            if self.tokens >= 0:
                return 0
            return -self.tokens / self.rate
if __name__ == '__main__':
    # Turn on asyncio debug mode before anything is scheduled.
    event_loop = asyncio.get_event_loop()
    event_loop.set_debug(True)
    client = PTBNL()
    batch = list(range(500))
    # The batch coroutine returns the futures and the observed rate.
    finished, per_second = client.run(client.req_data_batch_async(batch))
    print(finished)
    print(per_second)
编辑:我在此处添加了一个使用信号量的简单示例 ThrottleTestApp,但它仍然无法限制执行速率:
import asyncio
import time
class ThrottleTestApp:
    """Demo app that throttles requests with a periodically refilled semaphore.

    Each request takes one semaphore permit and never returns it;
    ``allow_requests`` tops the semaphore back up to 100 permits once per
    second.
    """

    def __init__(self):
        self._req_id_seq = 0   # next request id to hand out
        self._futures = {}     # request id -> pending future
        self._results = {}     # request id -> accumulated result
        # Starts with a single permit; allow_requests() refills it.
        self.sem = asyncio.Semaphore()

    async def allow_requests(self, sem):
        """Permit 100 requests per second.

        Runs until cancelled.  asyncio.Semaphore offers no public way to read
        its value, so this reads ``sem._value``; a wrapper exposing the
        current value would make this cleaner.
        """
        while True:
            while sem._value < 100:
                sem.release()
            # Or spread more evenly with a shorter sleep and increasing the
            # value less.
            await asyncio.sleep(1)

    async def do_request(self, req_id, obj):
        """Wait for a permit, then perform the request."""
        await self.sem.acquire()
        # this is the work for the request
        self.req_data(req_id, obj)

    def run(self, *awaitables):
        """Drive the event loop.

        With no arguments the loop runs forever; with one awaitable its
        result is returned; with several, the list of their results (via
        ``asyncio.gather``) is returned.
        """
        loop = asyncio.get_event_loop()
        if not awaitables:
            loop.run_forever()
        elif len(awaitables) == 1:
            return loop.run_until_complete(*awaitables)
        else:
            future = asyncio.gather(*awaitables)
            return loop.run_until_complete(future)

    def sleep(self, secs: float = 0.02) -> bool:
        """Run the event loop for ``secs`` seconds, then return ``True``."""
        self.run(asyncio.sleep(secs))
        return True

    def get_req_id(self) -> int:
        """Return the next sequential request id (starting at 0)."""
        new_id = self._req_id_seq
        self._req_id_seq += 1
        return new_id

    def start_req(self, key):
        """Create, register and return the future for request ``key``."""
        loop = asyncio.get_event_loop()
        future = loop.create_future()
        self._futures[key] = future
        return future

    def end_req(self, key, result=None):
        """Resolve the future registered for ``key``, if there is one.

        When ``result`` is ``None`` the value stored in ``_results`` for the
        key is used, falling back to an empty list.
        """
        future = self._futures.pop(key, None)
        if future:
            if result is None:
                result = self._results.pop(key, [])
            if not future.done():
                future.set_result(result)

    def req_data(self, req_id, obj):
        """Perform the request (placeholder) and mark it as finished."""
        # This is the method that "does" something
        self.req_data_end(req_id)

    def req_data_end(self, req_id):
        """Report that ``req_id`` finished and resolve its future."""
        print(req_id, " has ended")
        self.end_req(req_id)

    async def req_data_batch_async(self, objs):
        """Run one throttled request per item and wait for all of them.

        Starts the permit-refill routine for the duration of the batch,
        schedules every ``do_request`` coroutine as a task, prints the
        observed throughput and returns the list of completed futures.
        """
        futures = []
        tasks = []
        start = time.time()
        # Without the refill task the semaphore only ever grants its single
        # initial permit and the batch would never finish.
        refill = asyncio.ensure_future(self.allow_requests(self.sem))
        try:
            for obj in objs:
                req_id = self.get_req_id()
                futures.append(self.start_req(req_id))
                # Calling do_request() only builds a coroutine object; it
                # has to be scheduled to actually run.
                tasks.append(
                    asyncio.ensure_future(self.do_request(req_id, obj))
                )
            await asyncio.gather(*tasks)
            await asyncio.gather(*futures)
        finally:
            refill.cancel()
            # Let the cancellation be delivered before returning.
            await asyncio.gather(refill, return_exceptions=True)
        elapsed = time.time() - start
        if not futures:
            rate = 0.0
        elif elapsed <= 0:
            # Coarse clocks can report no elapsed time for tiny batches.
            rate = float("inf")
        else:
            rate = len(futures) / elapsed
        print("Roughly %s per second" % rate)
        return futures
if __name__ == '__main__':
    # Turn on asyncio debug mode before anything is scheduled.
    event_loop = asyncio.get_event_loop()
    event_loop.set_debug(True)
    tester = ThrottleTestApp()
    batch = list(range(10000))
    tester.run(tester.req_data_batch_async(batch))
您可以通过实现漏桶算法(leaky bucket algorithm)来做到这一点:
import asyncio
import contextlib
import collections
import time
from types import TracebackType
from typing import Dict, Optional, Type
try:  # Python 3.7
    base = contextlib.AbstractAsyncContextManager
    _current_task = asyncio.current_task
except AttributeError:
    # Older Pythons lack both names; fall back to the pre-3.7 spellings.
    base = object  # type: ignore
    _current_task = asyncio.Task.current_task  # type: ignore


class AsyncLeakyBucket(base):
    """A leaky bucket rate limiter.

    Allows a burst of up to max_rate acquisitions; capacity then drains at
    max_rate / time_period per second.

    time_period is measured in seconds; the default is 60.
    """

    def __init__(
        self,
        max_rate: float,
        time_period: float = 60,
        loop: Optional[asyncio.AbstractEventLoop] = None
    ) -> None:
        self._loop = loop
        self._max_level = max_rate
        self._rate_per_sec = max_rate / time_period
        self._level = 0.0
        self._last_check = 0.0
        # queue of waiting futures to signal capacity to
        self._waiters: Dict[asyncio.Task, asyncio.Future] = collections.OrderedDict()

    def _leak(self) -> None:
        """Drip out capacity from the bucket."""
        # A monotonic clock keeps wall-clock adjustments from producing a
        # negative or inflated elapsed time.
        now = time.monotonic()
        if self._level:
            # drip out enough level for the elapsed time since
            # we last checked
            elapsed = now - self._last_check
            decrement = elapsed * self._rate_per_sec
            self._level = max(self._level - decrement, 0)
        self._last_check = now

    def has_capacity(self, amount: float = 1) -> bool:
        """Check if there is enough space remaining in the bucket"""
        self._leak()
        requested = self._level + amount
        # if there are tasks waiting for capacity, signal to the first
        # that there may be some now (they won't wake up until this task
        # yields with an await)
        if requested < self._max_level:
            for fut in self._waiters.values():
                if not fut.done():
                    fut.set_result(True)
                    break
        return requested <= self._max_level

    async def acquire(self, amount: float = 1) -> None:
        """Acquire space in the bucket.

        If the bucket is full, block until there is space.

        Raises:
            ValueError: if amount exceeds the bucket capacity.
        """
        if amount > self._max_level:
            raise ValueError("Can't acquire more than the bucket capacity")

        loop = self._loop or asyncio.get_event_loop()
        task = _current_task(loop)
        assert task is not None
        while not self.has_capacity(amount):
            # wait for the next drip to have left the bucket
            # add a future to the _waiters map to be notified
            # 'early' if capacity has come up
            fut = loop.create_future()
            self._waiters[task] = fut
            try:
                # No loop= argument: asyncio.wait_for no longer accepts it
                # (removed in Python 3.10).
                await asyncio.wait_for(
                    asyncio.shield(fut),
                    1 / self._rate_per_sec * amount
                )
            except asyncio.TimeoutError:
                pass
            fut.cancel()
        self._waiters.pop(task, None)

        self._level += amount

        return None

    async def __aenter__(self) -> None:
        await self.acquire()
        return None

    async def __aexit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc: Optional[BaseException],
        tb: Optional[TracebackType]
    ) -> None:
        # Nothing to release: capacity drains with time only.
        return None
请注意,我们从存储桶中机会性地泄漏容量,不需要 运行 单独的异步任务只是为了降低级别;相反,当测试剩余容量是否足够时,容量会泄漏。
请注意,等待容量的任务保存在有序字典中,当可能再次有空闲容量时,第一个仍在等待的任务会提前唤醒。
您可以将其用作上下文管理器;当桶已满时,尝试获取会阻塞,直到再次释放出足够的容量:
bucket = AsyncLeakyBucket(100)
# ...
async with bucket:
# only reached once the bucket is no longer full
或者您可以直接调用 acquire():
await bucket.acquire() # blocks until there is space in the bucket
或者你可以简单地先测试是否有space:
if bucket.has_capacity():
# reject a request due to rate limiting
请注意,您可以通过增加或减少 'drip' 放入存储桶中的数量来将某些请求计为 'heavier' 或 'lighter':
await bucket.acquire(10)
if bucket.has_capacity(0.5):
不过请务必小心:混用大滴和小滴时,在达到或接近最大速率的情况下,小滴往往会先于大滴得到运行,因为在腾出足以容纳大滴的空间之前,更有可能先出现足以容纳小滴的空闲容量。
演示:
>>> import asyncio, time
>>> bucket = AsyncLeakyBucket(5, 10)
>>> async def task(id):
... await asyncio.sleep(id * 0.01)
... async with bucket:
... print(f'{id:>2d}: Drip! {time.time() - ref:>5.2f}')
...
>>> ref = time.time()
>>> tasks = [task(i) for i in range(15)]
>>> result = asyncio.run(asyncio.wait(tasks))
0: Drip! 0.00
1: Drip! 0.02
2: Drip! 0.02
3: Drip! 0.03
4: Drip! 0.04
5: Drip! 2.05
6: Drip! 4.06
7: Drip! 6.06
8: Drip! 8.06
9: Drip! 10.07
10: Drip! 12.07
11: Drip! 14.08
12: Drip! 16.08
13: Drip! 18.08
14: Drip! 20.09
开始时桶很快被填满,导致其余任务分散得更均匀;每 2 秒就会释放足够的容量来处理另一项任务。
最大突发大小等于最大速率值,在上面的演示中设置为 5。如果您不想允许突发,请将最大速率设置为 1,并将时间段设置为两次滴水之间的最小间隔时间:
>>> bucket = AsyncLeakyBucket(1, 1.5) # no bursts, drip every 1.5 seconds
>>> async def task():
... async with bucket:
... print(f'Drip! {time.time() - ref:>5.2f}')
...
>>> ref = time.time()
>>> tasks = [task() for _ in range(5)]
>>> result = asyncio.run(asyncio.wait(tasks))
Drip! 0.00
Drip! 1.50
Drip! 3.01
Drip! 4.51
Drip! 6.02
我已经开始将其打包为一个 Python 项目:https://github.com/mjpieters/aiolimiter
另一种解决方案 - 使用有界信号量 - 由同事、导师和朋友提出,如下:
import asyncio
class AsyncLeakyBucket(object):
    """Rate limiter built on a bounded semaphore refilled by a background task.

    Entering the context takes one permit; permits are never returned on
    exit.  A background task releases one permit every
    ``time_period / max_tasks`` seconds, up to the bound of ``max_tasks``.
    """

    def __init__(self, max_tasks: float, time_period: float = 60,
                 loop: "asyncio.AbstractEventLoop | None" = None):
        """Create the bucket and start its refill task on ``loop``.

        Raises:
            ValueError: if ``max_tasks`` or ``time_period`` is not positive.
        """
        if max_tasks <= 0 or time_period <= 0:
            raise ValueError("max_tasks and time_period must be positive")
        self._delay_time = time_period / max_tasks
        self._sem = asyncio.BoundedSemaphore(max_tasks)
        self._loop = loop or asyncio.get_event_loop()
        # Keep a reference so the task cannot be garbage-collected while it
        # is still running, and so it can be cancelled by close().
        self._leak_task = self._loop.create_task(self._leak_sem())

    def close(self) -> None:
        """Stop the background refill task."""
        self._leak_task.cancel()

    async def _leak_sem(self):
        """
        Background task that leaks semaphore releases based on the desired rate of tasks per time_period
        """
        while True:
            await asyncio.sleep(self._delay_time)
            try:
                self._sem.release()
            except ValueError:
                # The bounded semaphore is already full; nothing to refill.
                pass

    async def __aenter__(self) -> None:
        await self._sem.acquire()

    async def __aexit__(self, exc_type, exc, tb) -> None:
        # Permits come back through _leak_sem, not on exit.
        pass
仍然可以使用与 @Martijn 的回答中相同的 async with bucket 代码。
我想将一个由 awaitable 组成的 list 传递给 asyncio.AbstractEventLoop,但我需要限制对第三方 API 的请求速率。
我想避免采用“先等待、再把 future 传递给循环”的做法,因为在等待期间事件循环会被我阻塞。我有哪些选择?Semaphore 和 ThreadPool 只能限制同时运行的数量,但这不是我的问题。我需要将请求速率限制为 100 次/秒,至于完成一个请求需要多长时间并不重要。
这是一个使用标准库的非常简洁的(非)工作示例,它演示了问题。这应该以 100/秒的速度节流,但以 116.651/秒的速度节流。 在 asyncio 中限制异步请求调度的最佳方法是什么?
工作代码:
import asyncio
from threading import Lock
class PTBNL:
    """Demo client that pairs request ids with futures and throttles a batch.

    Every request gets a sequential id and a future stored in ``_futures``;
    ``end_req`` resolves that future.  ``req_data_batch_async`` asks the token
    bucket how long each request has to wait and schedules it with
    ``loop.call_later``.
    """

    def __init__(self):
        self._req_id_seq = 0   # next request id to hand out
        self._futures = {}     # request id -> pending future
        self._results = {}     # request id -> accumulated result
        self.token_bucket = TokenBucket()
        self.token_bucket.set_rate(100)

    def run(self, *awaitables):
        """Drive the event loop.

        With no arguments the loop runs forever; with one awaitable its
        result is returned; with several, the list of their results (via
        ``asyncio.gather``) is returned.
        """
        loop = asyncio.get_event_loop()
        if not awaitables:
            loop.run_forever()
        elif len(awaitables) == 1:
            return loop.run_until_complete(*awaitables)
        else:
            future = asyncio.gather(*awaitables)
            return loop.run_until_complete(future)

    def sleep(self, secs) -> bool:
        """Run the event loop for ``secs`` seconds, then return ``True``."""
        self.run(asyncio.sleep(secs))
        return True

    def get_req_id(self) -> int:
        """Return the next sequential request id (starting at 0)."""
        new_id = self._req_id_seq
        self._req_id_seq += 1
        return new_id

    def start_req(self, key):
        """Create, register and return the future for request ``key``."""
        loop = asyncio.get_event_loop()
        future = loop.create_future()
        self._futures[key] = future
        return future

    def end_req(self, key, result=None):
        """Resolve the future registered for ``key``, if there is one.

        When ``result`` is ``None`` the value stored in ``_results`` for the
        key is used, falling back to an empty list.
        """
        future = self._futures.pop(key, None)
        if future:
            if result is None:
                result = self._results.pop(key, [])
            if not future.done():
                future.set_result(result)

    def req_data(self, req_id, obj):
        """Perform the request (placeholder) and mark it as finished."""
        # Do Some Work Here
        self.req_data_end(req_id)

    def req_data_end(self, req_id):
        """Report that ``req_id`` finished and resolve its future."""
        print(req_id, " has ended")
        self.end_req(req_id)

    async def req_data_async(self, obj):
        """Issue a single request and return its result."""
        req_id = self.get_req_id()
        future = self.start_req(req_id)
        self.req_data(req_id, obj)
        await future
        return future.result()

    async def req_data_batch_async(self, contracts):
        """Schedule one throttled request per item and wait for all of them.

        Returns:
            A ``(futures, rate)`` tuple: the completed futures in input order
            and the observed throughput in requests per second (``0.0`` for
            an empty batch).
        """
        loop = asyncio.get_event_loop()
        futures = []
        # Bind the start time before the loop so that an empty batch no
        # longer fails with UnboundLocalError.
        start = loop.time()
        for contract in contracts:
            req_id = self.get_req_id()
            futures.append(self.start_req(req_id))
            # consume() does not block; it returns the delay to wait.
            nap = self.token_bucket.consume(1)
            loop.call_later(nap, self.req_data, req_id, contract)
        await asyncio.gather(*futures)
        elapsed = loop.time() - start
        if not futures:
            rate = 0.0
        elif elapsed <= 0:
            # Coarse clocks can report no elapsed time for tiny batches.
            rate = float("inf")
        else:
            rate = len(futures) / elapsed
        return futures, rate
class TokenBucket:
    """Token bucket that reports how long a caller must wait for its tokens.

    The bucket refills at ``rate`` tokens per second and holds at most
    ``rate`` tokens.  ``consume`` never blocks: it lets the balance go
    negative and returns the number of seconds after which that debt is
    repaid, so the caller can delay the work itself (for example with
    ``loop.call_later``).
    """

    def __init__(self, clock=None):
        """Create a bucket with rate 0, which disables throttling.

        Args:
            clock: Optional zero-argument callable returning the current time
                in seconds.  Defaults to ``time.monotonic`` so that building
                a bucket does not require (or implicitly create) an event
                loop.
        """
        if clock is None:
            # Imported here because the surrounding file only imports
            # asyncio and threading.Lock.
            import time
            clock = time.monotonic
        self._clock = clock
        self.tokens = 0
        self.rate = 0
        self.last = clock()
        self.lock = Lock()

    def set_rate(self, rate):
        """Set the refill rate (tokens per second) and fill the bucket."""
        with self.lock:
            self.rate = rate
            self.tokens = self.rate

    def consume(self, tokens):
        """Take ``tokens`` from the bucket.

        Returns:
            ``0`` when the tokens were available (or the rate is 0),
            otherwise the delay in seconds until the deficit is refilled.
        """
        with self.lock:
            if not self.rate:
                return 0
            now = self._clock()
            lapse = now - self.last
            self.last = now
            # Refill for the elapsed time, capped at the bucket capacity.
            self.tokens = min(self.tokens + lapse * self.rate, self.rate)
            self.tokens -= tokens
            if self.tokens >= 0:
                return 0
            return -self.tokens / self.rate
if __name__ == '__main__':
    # Turn on asyncio debug mode before anything is scheduled.
    event_loop = asyncio.get_event_loop()
    event_loop.set_debug(True)
    client = PTBNL()
    batch = list(range(500))
    # The batch coroutine returns the futures and the observed rate.
    finished, per_second = client.run(client.req_data_batch_async(batch))
    print(finished)
    print(per_second)
编辑:我在此处添加了一个使用信号量的简单示例 ThrottleTestApp,但它仍然无法限制执行速率:
import asyncio
import time
class ThrottleTestApp:
    """Demo app that throttles requests with a periodically refilled semaphore.

    Each request takes one semaphore permit and never returns it;
    ``allow_requests`` tops the semaphore back up to 100 permits once per
    second.
    """

    def __init__(self):
        self._req_id_seq = 0   # next request id to hand out
        self._futures = {}     # request id -> pending future
        self._results = {}     # request id -> accumulated result
        # Starts with a single permit; allow_requests() refills it.
        self.sem = asyncio.Semaphore()

    async def allow_requests(self, sem):
        """Permit 100 requests per second.

        Runs until cancelled.  asyncio.Semaphore offers no public way to read
        its value, so this reads ``sem._value``; a wrapper exposing the
        current value would make this cleaner.
        """
        while True:
            while sem._value < 100:
                sem.release()
            # Or spread more evenly with a shorter sleep and increasing the
            # value less.
            await asyncio.sleep(1)

    async def do_request(self, req_id, obj):
        """Wait for a permit, then perform the request."""
        await self.sem.acquire()
        # this is the work for the request
        self.req_data(req_id, obj)

    def run(self, *awaitables):
        """Drive the event loop.

        With no arguments the loop runs forever; with one awaitable its
        result is returned; with several, the list of their results (via
        ``asyncio.gather``) is returned.
        """
        loop = asyncio.get_event_loop()
        if not awaitables:
            loop.run_forever()
        elif len(awaitables) == 1:
            return loop.run_until_complete(*awaitables)
        else:
            future = asyncio.gather(*awaitables)
            return loop.run_until_complete(future)

    def sleep(self, secs: float = 0.02) -> bool:
        """Run the event loop for ``secs`` seconds, then return ``True``."""
        self.run(asyncio.sleep(secs))
        return True

    def get_req_id(self) -> int:
        """Return the next sequential request id (starting at 0)."""
        new_id = self._req_id_seq
        self._req_id_seq += 1
        return new_id

    def start_req(self, key):
        """Create, register and return the future for request ``key``."""
        loop = asyncio.get_event_loop()
        future = loop.create_future()
        self._futures[key] = future
        return future

    def end_req(self, key, result=None):
        """Resolve the future registered for ``key``, if there is one.

        When ``result`` is ``None`` the value stored in ``_results`` for the
        key is used, falling back to an empty list.
        """
        future = self._futures.pop(key, None)
        if future:
            if result is None:
                result = self._results.pop(key, [])
            if not future.done():
                future.set_result(result)

    def req_data(self, req_id, obj):
        """Perform the request (placeholder) and mark it as finished."""
        # This is the method that "does" something
        self.req_data_end(req_id)

    def req_data_end(self, req_id):
        """Report that ``req_id`` finished and resolve its future."""
        print(req_id, " has ended")
        self.end_req(req_id)

    async def req_data_batch_async(self, objs):
        """Run one throttled request per item and wait for all of them.

        Starts the permit-refill routine for the duration of the batch,
        schedules every ``do_request`` coroutine as a task, prints the
        observed throughput and returns the list of completed futures.
        """
        futures = []
        tasks = []
        start = time.time()
        # Without the refill task the semaphore only ever grants its single
        # initial permit and the batch would never finish.
        refill = asyncio.ensure_future(self.allow_requests(self.sem))
        try:
            for obj in objs:
                req_id = self.get_req_id()
                futures.append(self.start_req(req_id))
                # Calling do_request() only builds a coroutine object; it
                # has to be scheduled to actually run.
                tasks.append(
                    asyncio.ensure_future(self.do_request(req_id, obj))
                )
            await asyncio.gather(*tasks)
            await asyncio.gather(*futures)
        finally:
            refill.cancel()
            # Let the cancellation be delivered before returning.
            await asyncio.gather(refill, return_exceptions=True)
        elapsed = time.time() - start
        if not futures:
            rate = 0.0
        elif elapsed <= 0:
            # Coarse clocks can report no elapsed time for tiny batches.
            rate = float("inf")
        else:
            rate = len(futures) / elapsed
        print("Roughly %s per second" % rate)
        return futures
if __name__ == '__main__':
    # Turn on asyncio debug mode before anything is scheduled.
    event_loop = asyncio.get_event_loop()
    event_loop.set_debug(True)
    tester = ThrottleTestApp()
    batch = list(range(10000))
    tester.run(tester.req_data_batch_async(batch))
您可以通过实现漏桶算法(leaky bucket algorithm)来做到这一点:
import asyncio
import contextlib
import collections
import time
from types import TracebackType
from typing import Dict, Optional, Type
try:  # Python 3.7
    base = contextlib.AbstractAsyncContextManager
    _current_task = asyncio.current_task
except AttributeError:
    # Older Pythons lack both names; fall back to the pre-3.7 spellings.
    base = object  # type: ignore
    _current_task = asyncio.Task.current_task  # type: ignore


class AsyncLeakyBucket(base):
    """A leaky bucket rate limiter.

    Allows a burst of up to max_rate acquisitions; capacity then drains at
    max_rate / time_period per second.

    time_period is measured in seconds; the default is 60.
    """

    def __init__(
        self,
        max_rate: float,
        time_period: float = 60,
        loop: Optional[asyncio.AbstractEventLoop] = None
    ) -> None:
        self._loop = loop
        self._max_level = max_rate
        self._rate_per_sec = max_rate / time_period
        self._level = 0.0
        self._last_check = 0.0
        # queue of waiting futures to signal capacity to
        self._waiters: Dict[asyncio.Task, asyncio.Future] = collections.OrderedDict()

    def _leak(self) -> None:
        """Drip out capacity from the bucket."""
        # A monotonic clock keeps wall-clock adjustments from producing a
        # negative or inflated elapsed time.
        now = time.monotonic()
        if self._level:
            # drip out enough level for the elapsed time since
            # we last checked
            elapsed = now - self._last_check
            decrement = elapsed * self._rate_per_sec
            self._level = max(self._level - decrement, 0)
        self._last_check = now

    def has_capacity(self, amount: float = 1) -> bool:
        """Check if there is enough space remaining in the bucket"""
        self._leak()
        requested = self._level + amount
        # if there are tasks waiting for capacity, signal to the first
        # that there may be some now (they won't wake up until this task
        # yields with an await)
        if requested < self._max_level:
            for fut in self._waiters.values():
                if not fut.done():
                    fut.set_result(True)
                    break
        return requested <= self._max_level

    async def acquire(self, amount: float = 1) -> None:
        """Acquire space in the bucket.

        If the bucket is full, block until there is space.

        Raises:
            ValueError: if amount exceeds the bucket capacity.
        """
        if amount > self._max_level:
            raise ValueError("Can't acquire more than the bucket capacity")

        loop = self._loop or asyncio.get_event_loop()
        task = _current_task(loop)
        assert task is not None
        while not self.has_capacity(amount):
            # wait for the next drip to have left the bucket
            # add a future to the _waiters map to be notified
            # 'early' if capacity has come up
            fut = loop.create_future()
            self._waiters[task] = fut
            try:
                # No loop= argument: asyncio.wait_for no longer accepts it
                # (removed in Python 3.10).
                await asyncio.wait_for(
                    asyncio.shield(fut),
                    1 / self._rate_per_sec * amount
                )
            except asyncio.TimeoutError:
                pass
            fut.cancel()
        self._waiters.pop(task, None)

        self._level += amount

        return None

    async def __aenter__(self) -> None:
        await self.acquire()
        return None

    async def __aexit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc: Optional[BaseException],
        tb: Optional[TracebackType]
    ) -> None:
        # Nothing to release: capacity drains with time only.
        return None
请注意,我们从存储桶中机会性地泄漏容量,不需要 运行 单独的异步任务只是为了降低级别;相反,当测试剩余容量是否足够时,容量会泄漏。
请注意,等待容量的任务保存在有序字典中,当可能再次有空闲容量时,第一个仍在等待的任务会提前唤醒。
您可以将其用作上下文管理器;当桶已满时,尝试获取会阻塞,直到再次释放出足够的容量:
bucket = AsyncLeakyBucket(100)
# ...
async with bucket:
# only reached once the bucket is no longer full
或者您可以直接调用 acquire():
await bucket.acquire() # blocks until there is space in the bucket
或者你可以简单地先测试是否有space:
if bucket.has_capacity():
# reject a request due to rate limiting
请注意,您可以通过增加或减少 'drip' 放入存储桶中的数量来将某些请求计为 'heavier' 或 'lighter':
await bucket.acquire(10)
if bucket.has_capacity(0.5):
不过请务必小心:混用大滴和小滴时,在达到或接近最大速率的情况下,小滴往往会先于大滴得到运行,因为在腾出足以容纳大滴的空间之前,更有可能先出现足以容纳小滴的空闲容量。
演示:
>>> import asyncio, time
>>> bucket = AsyncLeakyBucket(5, 10)
>>> async def task(id):
... await asyncio.sleep(id * 0.01)
... async with bucket:
... print(f'{id:>2d}: Drip! {time.time() - ref:>5.2f}')
...
>>> ref = time.time()
>>> tasks = [task(i) for i in range(15)]
>>> result = asyncio.run(asyncio.wait(tasks))
0: Drip! 0.00
1: Drip! 0.02
2: Drip! 0.02
3: Drip! 0.03
4: Drip! 0.04
5: Drip! 2.05
6: Drip! 4.06
7: Drip! 6.06
8: Drip! 8.06
9: Drip! 10.07
10: Drip! 12.07
11: Drip! 14.08
12: Drip! 16.08
13: Drip! 18.08
14: Drip! 20.09
开始时桶很快被填满,导致其余任务分散得更均匀;每 2 秒就会释放足够的容量来处理另一项任务。
最大突发大小等于最大速率值,在上面的演示中设置为 5。如果您不想允许突发,请将最大速率设置为 1,并将时间段设置为两次滴水之间的最小间隔时间:
>>> bucket = AsyncLeakyBucket(1, 1.5) # no bursts, drip every 1.5 seconds
>>> async def task():
... async with bucket:
... print(f'Drip! {time.time() - ref:>5.2f}')
...
>>> ref = time.time()
>>> tasks = [task() for _ in range(5)]
>>> result = asyncio.run(asyncio.wait(tasks))
Drip! 0.00
Drip! 1.50
Drip! 3.01
Drip! 4.51
Drip! 6.02
我已经开始将其打包为一个 Python 项目:https://github.com/mjpieters/aiolimiter
另一种解决方案 - 使用有界信号量 - 由同事、导师和朋友提出,如下:
import asyncio
class AsyncLeakyBucket(object):
    """Rate limiter built on a bounded semaphore refilled by a background task.

    Entering the context takes one permit; permits are never returned on
    exit.  A background task releases one permit every
    ``time_period / max_tasks`` seconds, up to the bound of ``max_tasks``.
    """

    def __init__(self, max_tasks: float, time_period: float = 60,
                 loop: "asyncio.AbstractEventLoop | None" = None):
        """Create the bucket and start its refill task on ``loop``.

        Raises:
            ValueError: if ``max_tasks`` or ``time_period`` is not positive.
        """
        if max_tasks <= 0 or time_period <= 0:
            raise ValueError("max_tasks and time_period must be positive")
        self._delay_time = time_period / max_tasks
        self._sem = asyncio.BoundedSemaphore(max_tasks)
        self._loop = loop or asyncio.get_event_loop()
        # Keep a reference so the task cannot be garbage-collected while it
        # is still running, and so it can be cancelled by close().
        self._leak_task = self._loop.create_task(self._leak_sem())

    def close(self) -> None:
        """Stop the background refill task."""
        self._leak_task.cancel()

    async def _leak_sem(self):
        """
        Background task that leaks semaphore releases based on the desired rate of tasks per time_period
        """
        while True:
            await asyncio.sleep(self._delay_time)
            try:
                self._sem.release()
            except ValueError:
                # The bounded semaphore is already full; nothing to refill.
                pass

    async def __aenter__(self) -> None:
        await self._sem.acquire()

    async def __aexit__(self, exc_type, exc, tb) -> None:
        # Permits come back through _leak_sem, not on exit.
        pass
仍然可以使用与 @Martijn 的回答中相同的 async with bucket 代码。