Intermittent requests.exceptions.ConnectionError when posting to tornado HTTP server?
This program creates 500 client processes and one Tornado HTTP server. Each client performs a single POST to the server. Occasionally I get a requests.exceptions.ConnectionError exception. I added retry code to compensate; staggering the client start times also helps.
I thought the requests and tornado modules would handle this without extra code on my part. I'm not sure which side is causing it.
#! /usr/bin/env python3

import time
import multiprocessing
import random

import requests
import tornado.ioloop
import tornado.web


class Client(multiprocessing.Process):

    def __init__(self, client_id):
        multiprocessing.Process.__init__(self)
        self.client_id = client_id
        self.host = 'http://localhost:8888/log'
        self.sample_record = 'x'*300
        self.start()

    def run(self):
        # Wait until top of current 5 sec interval so all clients start together.
        time.sleep(5 - time.time()%5)
        #time.sleep(random.random()) # This seems to eliminate the problem.

        # requests.Session will do keep-alive by default.
        session = requests.Session()
        payload = { 'record': 'x'*300 }
        attempt_count = 0
        while True:
            try:
                response = session.post(self.host, data=payload, timeout=10)
                break
            except requests.exceptions.ConnectionError:
                print('Retry: id=', self.client_id)
                attempt_count += 1
                assert attempt_count < 10
                continue
        assert response.status_code == 200 and response.text == 'Success'


post_received_count = 0

class LogHandler(tornado.web.RequestHandler):

    def post(self):
        global post_received_count
        post_received_count += 1
        if post_received_count%100 == 0:
            print('post count=', post_received_count)
        self.write('Success')


def make_app():
    return tornado.web.Application(
        [
            (r"/log", LogHandler),  # http://localhost:8888/log
        ],
        cookie_secret="__TODO:_GENERATE_YOUR_OWN_RANDOM_VALUE_HERE__",
    )


if __name__ == "__main__":
    time.sleep(6 - time.time()%5)  # Wait until one sec into next 5 sec interval.
    print('Start clients...')
    client_list = [Client(i) for i in range(500)]
    print('Done.')

    app = make_app()
    app.listen(8888)
    try:
        tornado.ioloop.IOLoop.current().start()
    except KeyboardInterrupt:
        pass
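As an aside, the manual retry loop above can also be delegated to requests/urllib3 by mounting an HTTPAdapter with a Retry policy. This is a minimal sketch, assuming urllib3 >= 1.26 (older releases name the allowed_methods parameter method_whitelist); keep in mind that retrying a POST whose body was already sent can lead to duplicate processing on the server.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

retry_policy = Retry(
    total=10,                             # overall retry budget
    connect=10,                           # retries for connection-setup failures
    read=10,                              # retries for resets after the request was sent
    backoff_factor=0.2,                   # exponential backoff between attempts
    allowed_methods=frozenset({"POST"}),  # POST is not retried by default
)

session = requests.Session()
session.mount("http://", HTTPAdapter(max_retries=retry_policy))
# Equivalent of the manual loop: transient ConnectionErrors are retried internally.
response = session.post("http://localhost:8888/log", data={"record": "x" * 300}, timeout=10)
assert response.status_code == 200 and response.text == "Success"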
Here is the output of two runs on Linux. The first run is fine. The program prints the number of POSTs performed; it stops after 500 POSTs, and I then kill the server with Ctrl-\. The second run has several ConnectionError exceptions that required a retry.
If I uncomment the time.sleep(random.random()) statement, it always runs with no exceptions. That is a random sleep of between 0 and 1 seconds. Apparently the problem only occurs when too many clients submit a request at the same time.
$ ulimit -n 10000
$ test_log_server.py
Start clients...
Done.
post count= 100
post count= 200
post count= 300
post count= 400
post count= 500
^\Quit (core dumped)
$ test_log_server.py
Start clients...
Done.
post count= 100
Retry: id= 223
Retry: id= 340
Retry: id= 116
Retry: id= 164
Retry: id= 258
Retry: id= 150
Retry: id= 290
Retry: id= 16
Retry: id= 40
Retry: id= 5
post count= 200
post count= 300
post count= 400
post count= 500
^\Quit (core dumped)
$
If I remove the try block, I get the following exception. This one is from one of the 500 clients; the other failing clients report the same exception.
Process Client-102:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 601, in urlopen
chunked=chunked)
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 387, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 383, in _make_request
httplib_response = conn.getresponse()
File "/usr/lib/python3.6/http/client.py", line 1373, in getresponse
response.begin()
File "/usr/lib/python3.6/http/client.py", line 311, in begin
version, status, reason = self._read_status()
File "/usr/lib/python3.6/http/client.py", line 272, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "/usr/lib/python3.6/socket.py", line 586, in readinto
return self._sock.recv_into(b)
ConnectionResetError: [Errno 104] Connection reset by peer
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/requests/adapters.py", line 440, in send
timeout=timeout
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 639, in urlopen
_stacktrace=sys.exc_info()[2])
File "/usr/lib/python3/dist-packages/urllib3/util/retry.py", line 367, in increment
raise six.reraise(type(error), error, _stacktrace)
File "/usr/lib/python3/dist-packages/six.py", line 692, in reraise
raise value.with_traceback(tb)
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 601, in urlopen
chunked=chunked)
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 387, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 383, in _make_request
httplib_response = conn.getresponse()
File "/usr/lib/python3.6/http/client.py", line 1373, in getresponse
response.begin()
File "/usr/lib/python3.6/http/client.py", line 311, in begin
version, status, reason = self._read_status()
File "/usr/lib/python3.6/http/client.py", line 272, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "/usr/lib/python3.6/socket.py", line 586, in readinto
return self._sock.recv_into(b)
urllib3.exceptions.ProtocolError: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "./test_log_server.py", line 31, in run
response = session.post(self.host, data=payload, timeout=10)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 567, in post
return self.request('POST', url, data=data, json=json, **kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 520, in request
resp = self.send(prep, **send_kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 630, in send
r = adapter.send(request, **kwargs)
File "/usr/lib/python3/dist-packages/requests/adapters.py", line 490, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
500 client processes is a lot. One possibility is the socket listen backlog, which is set to 128 when you use the app.listen interface. On older kernel versions, 128 was the maximum allowed here, but more recently it can be raised (the effective value is also capped by the kernel's net.core.somaxconn setting, so it depends on your kernel version and other configuration). Try replacing app.listen(8888) with:
server = tornado.httpserver.HTTPServer(app)
server.bind(8888, backlog=4096)
server.start()
I'd also suggest that if you really need to support this many connections starting up at the same time, you probably want to run multiple server processes rather than just increasing a single process's socket backlog.
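For illustration, a minimal sketch of that multi-process variant, reusing make_app() from the script above. It assumes Tornado's pre-fork model, so the fork must happen before the IOLoop is created or started, and note that the global post_received_count would then be tracked separately in each child process.

import tornado.httpserver
import tornado.ioloop

if __name__ == "__main__":
    app = make_app()
    server = tornado.httpserver.HTTPServer(app)
    server.bind(8888, backlog=4096)   # larger accept queue, still capped by the kernel
    server.start(0)                   # 0 = fork one server process per CPU core
    tornado.ioloop.IOLoop.current().start()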