使用 python 从链接下载
Download from links using python
我想从这个网站下载数据。
我查看了网页源代码,发现它使用以下链接格式下载数据。
# URL template for the CSV download. The four {} slots are filled, in order,
# with start date, start time, end date and end time; the template itself
# appends ":00" after each time and encodes the date/time separator as %20.
url = 'http://current.hydro.gov.hk/en/download_csv.php?start_dt={}%20{}:00&end_dt={}%20{}:00&mode=Surface'
# Fill the template for 2018-01-02, from 00:00 to 23:45.
url_filled = url.format("2018-01-02", "00:00", "2018-01-02", "23:45")
然后我尝试使用 requests 库下载 CSV 数据。
import requests
# Plain GET with no extra arguments (no timeout, no streaming).
r = requests.get(url_filled)
但后来我收到错误消息。
TimeoutError Traceback (most recent call last)
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\connection.py in _new_conn(self)
140 conn = connection.create_connection(
--> 141 (self.host, self.port), self.timeout, **extra_kw)
142
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\util\connection.py in create_connection(address, timeout, source_address, socket_options)
82 if err is not None:
---> 83 raise err
84
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\util\connection.py in create_connection(address, timeout, source_address, socket_options)
72 sock.bind(source_address)
---> 73 sock.connect(sa)
74 return sock
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
During handling of the above exception, another exception occurred:
NewConnectionError Traceback (most recent call last)
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
599 body=body, headers=headers,
--> 600 chunked=chunked)
601
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
355 else:
--> 356 conn.request(method, url, **httplib_request_kw)
357
~\AppData\Local\Continuum\Anaconda3\lib\http\client.py in request(self, method, url, body, headers)
1106 """Send a complete request to the server."""
-> 1107 self._send_request(method, url, body, headers)
1108
~\AppData\Local\Continuum\Anaconda3\lib\http\client.py in _send_request(self, method, url, body, headers)
1151 body = _encode(body, 'body')
-> 1152 self.endheaders(body)
1153
~\AppData\Local\Continuum\Anaconda3\lib\http\client.py in endheaders(self, message_body)
1102 raise CannotSendHeader()
-> 1103 self._send_output(message_body)
1104
~\AppData\Local\Continuum\Anaconda3\lib\http\client.py in _send_output(self, message_body)
933
--> 934 self.send(msg)
935 if message_body is not None:
~\AppData\Local\Continuum\Anaconda3\lib\http\client.py in send(self, data)
876 if self.auto_open:
--> 877 self.connect()
878 else:
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\connection.py in connect(self)
165 def connect(self):
--> 166 conn = self._new_conn()
167 self._prepare_conn(conn)
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\connection.py in _new_conn(self)
149 raise NewConnectionError(
--> 150 self, "Failed to establish a new connection: %s" % e)
151
NewConnectionError: <requests.packages.urllib3.connection.HTTPConnection object at 0x00000000081F5D30>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
During handling of the above exception, another exception occurred:
MaxRetryError Traceback (most recent call last)
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
437 retries=self.max_retries,
--> 438 timeout=timeout
439 )
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
648 retries = retries.increment(method, url, error=e, _pool=self,
--> 649 _stacktrace=sys.exc_info()[2])
650 retries.sleep()
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\util\retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
387 if new_retry.is_exhausted():
--> 388 raise MaxRetryError(_pool, url, error or ResponseError(cause))
389
MaxRetryError: HTTPConnectionPool(host='current.hydro.gov.hk', port=80): Max retries exceeded with url: /en/download_csv.php?start_dt=2018-01-02%2000:00:00&end_dt=2018-01-02%2023:45:00&mode=Surface (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x00000000081F5D30>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond',))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
<ipython-input-39-ac5f4cccaa6a> in <module>()
----> 1 r = requests.get(url_filled)
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\api.py in get(url, params, **kwargs)
70
71 kwargs.setdefault('allow_redirects', True)
---> 72 return request('get', url, params=params, **kwargs)
73
74
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\api.py in request(method, url, **kwargs)
56 # cases, and look like a memory leak in others.
57 with sessions.Session() as session:
---> 58 return session.request(method=method, url=url, **kwargs)
59
60
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
516 }
517 send_kwargs.update(settings)
--> 518 resp = self.send(prep, **send_kwargs)
519
520 return resp
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\sessions.py in send(self, request, **kwargs)
637
638 # Send the request
--> 639 r = adapter.send(request, **kwargs)
640
641 # Total elapsed time of the request (approximately)
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
500 raise ProxyError(e, request=request)
501
--> 502 raise ConnectionError(e, request=request)
503
504 except ClosedPoolError as e:
ConnectionError: HTTPConnectionPool(host='current.hydro.gov.hk', port=80): Max retries exceeded with url: /en/download_csv.php?start_dt=2018-01-02%2000:00:00&end_dt=2018-01-02%2023:45:00&mode=Surface (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x00000000081F5D30>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond',))
当我在 Google Chrome 中打开该链接时,它可以正常工作:会出现一个对话框,询问我下载位置。
有人可以帮忙吗?谢谢
The server hosting the page sends the file's packets slowly, and we need to take that into account. We can do so by using the streaming (chunked download) mechanism of requests.get:
import requests

# URL of the CSV file; the query string carries the start/end timestamps and
# the mode.
csv_url = 'http://current.hydro.gov.hk/en/download_csv.php?start_dt=' + \
    '2018-01-05%2000:00:00&end_dt=2018-01-06%2000:00:00&mode=Surface'
# Sample output filename.
csv_filename = 'test.csv'

# Use stream mode so the body is fetched chunk by chunk instead of all at once.
# timeout=(connect, read) in seconds: requests applies no timeout by default,
# so without it a stalled server would block this script indefinitely.
csv_body = requests.get(csv_url, stream=True, timeout=(10, 60))
try:
    # Fail loudly on a 4xx/5xx response instead of saving an error page as CSV.
    csv_body.raise_for_status()
    with open(csv_filename, 'wb') as fd:
        for chunk in csv_body.iter_content(chunk_size=1024):
            fd.write(chunk)
finally:
    # A streamed response keeps its connection open until it is closed.
    csv_body.close()
您是通过代理服务器上网的吗?如果是这样,您可以查看 http://docs.python-requests.org/en/master/user/advanced/ 中的代理(Proxies)部分。我试过你的代码,它可以正常工作。
我想从这个网站下载数据。
我查看了网页源代码,发现它使用以下链接格式下载数据。
# URL template for the CSV download. The four {} slots are filled, in order,
# with start date, start time, end date and end time; the template itself
# appends ":00" after each time and encodes the date/time separator as %20.
url = 'http://current.hydro.gov.hk/en/download_csv.php?start_dt={}%20{}:00&end_dt={}%20{}:00&mode=Surface'
# Fill the template for 2018-01-02, from 00:00 to 23:45.
url_filled = url.format("2018-01-02", "00:00", "2018-01-02", "23:45")
然后我尝试使用 requests 库下载 CSV 数据。
import requests
# Plain GET with no extra arguments (no timeout, no streaming).
r = requests.get(url_filled)
但后来我收到错误消息。
TimeoutError Traceback (most recent call last)
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\connection.py in _new_conn(self)
140 conn = connection.create_connection(
--> 141 (self.host, self.port), self.timeout, **extra_kw)
142
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\util\connection.py in create_connection(address, timeout, source_address, socket_options)
82 if err is not None:
---> 83 raise err
84
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\util\connection.py in create_connection(address, timeout, source_address, socket_options)
72 sock.bind(source_address)
---> 73 sock.connect(sa)
74 return sock
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
During handling of the above exception, another exception occurred:
NewConnectionError Traceback (most recent call last)
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
599 body=body, headers=headers,
--> 600 chunked=chunked)
601
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
355 else:
--> 356 conn.request(method, url, **httplib_request_kw)
357
~\AppData\Local\Continuum\Anaconda3\lib\http\client.py in request(self, method, url, body, headers)
1106 """Send a complete request to the server."""
-> 1107 self._send_request(method, url, body, headers)
1108
~\AppData\Local\Continuum\Anaconda3\lib\http\client.py in _send_request(self, method, url, body, headers)
1151 body = _encode(body, 'body')
-> 1152 self.endheaders(body)
1153
~\AppData\Local\Continuum\Anaconda3\lib\http\client.py in endheaders(self, message_body)
1102 raise CannotSendHeader()
-> 1103 self._send_output(message_body)
1104
~\AppData\Local\Continuum\Anaconda3\lib\http\client.py in _send_output(self, message_body)
933
--> 934 self.send(msg)
935 if message_body is not None:
~\AppData\Local\Continuum\Anaconda3\lib\http\client.py in send(self, data)
876 if self.auto_open:
--> 877 self.connect()
878 else:
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\connection.py in connect(self)
165 def connect(self):
--> 166 conn = self._new_conn()
167 self._prepare_conn(conn)
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\connection.py in _new_conn(self)
149 raise NewConnectionError(
--> 150 self, "Failed to establish a new connection: %s" % e)
151
NewConnectionError: <requests.packages.urllib3.connection.HTTPConnection object at 0x00000000081F5D30>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
During handling of the above exception, another exception occurred:
MaxRetryError Traceback (most recent call last)
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
437 retries=self.max_retries,
--> 438 timeout=timeout
439 )
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
648 retries = retries.increment(method, url, error=e, _pool=self,
--> 649 _stacktrace=sys.exc_info()[2])
650 retries.sleep()
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\util\retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
387 if new_retry.is_exhausted():
--> 388 raise MaxRetryError(_pool, url, error or ResponseError(cause))
389
MaxRetryError: HTTPConnectionPool(host='current.hydro.gov.hk', port=80): Max retries exceeded with url: /en/download_csv.php?start_dt=2018-01-02%2000:00:00&end_dt=2018-01-02%2023:45:00&mode=Surface (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x00000000081F5D30>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond',))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
<ipython-input-39-ac5f4cccaa6a> in <module>()
----> 1 r = requests.get(url_filled)
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\api.py in get(url, params, **kwargs)
70
71 kwargs.setdefault('allow_redirects', True)
---> 72 return request('get', url, params=params, **kwargs)
73
74
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\api.py in request(method, url, **kwargs)
56 # cases, and look like a memory leak in others.
57 with sessions.Session() as session:
---> 58 return session.request(method=method, url=url, **kwargs)
59
60
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
516 }
517 send_kwargs.update(settings)
--> 518 resp = self.send(prep, **send_kwargs)
519
520 return resp
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\sessions.py in send(self, request, **kwargs)
637
638 # Send the request
--> 639 r = adapter.send(request, **kwargs)
640
641 # Total elapsed time of the request (approximately)
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
500 raise ProxyError(e, request=request)
501
--> 502 raise ConnectionError(e, request=request)
503
504 except ClosedPoolError as e:
ConnectionError: HTTPConnectionPool(host='current.hydro.gov.hk', port=80): Max retries exceeded with url: /en/download_csv.php?start_dt=2018-01-02%2000:00:00&end_dt=2018-01-02%2023:45:00&mode=Surface (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x00000000081F5D30>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond',))
当我在 Google Chrome 中打开该链接时,它可以正常工作:会出现一个对话框,询问我下载位置。
有人可以帮忙吗?谢谢
The server hosting the page sends the file's packets slowly, and we need to take that into account. We can do so by using the streaming (chunked download) mechanism of requests.get:
import requests

# URL of the CSV file; the query string carries the start/end timestamps and
# the mode.
csv_url = 'http://current.hydro.gov.hk/en/download_csv.php?start_dt=' + \
    '2018-01-05%2000:00:00&end_dt=2018-01-06%2000:00:00&mode=Surface'
# Sample output filename.
csv_filename = 'test.csv'

# Use stream mode so the body is fetched chunk by chunk instead of all at once.
# timeout=(connect, read) in seconds: requests applies no timeout by default,
# so without it a stalled server would block this script indefinitely.
csv_body = requests.get(csv_url, stream=True, timeout=(10, 60))
try:
    # Fail loudly on a 4xx/5xx response instead of saving an error page as CSV.
    csv_body.raise_for_status()
    with open(csv_filename, 'wb') as fd:
        for chunk in csv_body.iter_content(chunk_size=1024):
            fd.write(chunk)
finally:
    # A streamed response keeps its connection open until it is closed.
    csv_body.close()
您是通过代理服务器上网的吗?如果是这样,您可以查看 http://docs.python-requests.org/en/master/user/advanced/ 中的代理(Proxies)部分。我试过你的代码,它可以正常工作。