BeautifulSoup error while trying to get information from a Google News page: Enthought Canopy
I have the following code from the page. It works fine and prints the contents of the page. However, when I change r to the Google News page (commented out below), I get an error (IOError Traceback (most recent call last)). Why, and how can I use BeautifulSoup with a Google News page?
Code that works fine:
from bs4 import BeautifulSoup
import urllib
r = urllib.urlopen('http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts').read()
#r = urllib.urlopen('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d').read()
soup = BeautifulSoup(r)
print type(soup)
print soup.prettify()
Code that produces the error:
from bs4 import BeautifulSoup
import urllib
r = urllib.urlopen('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d').read()
soup = BeautifulSoup(r)
print type(soup)
print soup.prettify()
---------------------------------------------------------------------------
IOError Traceback (most recent call last)
c:\users\abc\appdata\local\temp\tmpvxie2e.py in <module>()
2 import urllib
3 r = urllib.urlopen('http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts').read()
----> 4 r = urllib.urlopen('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d').read()
5 soup = BeautifulSoup(r)
6 print type(soup)
C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\urllib.pyc in urlopen(url, data, proxies, context)
85 opener = _urlopener
86 if data is None:
---> 87 return opener.open(url)
88 else:
89 return opener.open(url, data)
C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\urllib.pyc in open(self, fullurl, data)
211 try:
212 if data is None:
--> 213 return getattr(self, name)(url)
214 else:
215 return getattr(self, name)(url, data)
C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\urllib.pyc in open_https(self, url, data)
441 if realhost: h.putheader('Host', realhost)
442 for args in self.addheaders: h.putheader(*args)
--> 443 h.endheaders(data)
444 errcode, errmsg, headers = h.getreply()
445 fp = h.getfile()
C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\httplib.pyc in endheaders(self, message_body)
1047 else:
1048 raise CannotSendHeader()
-> 1049 self._send_output(message_body)
1050
1051 def request(self, method, url, body=None, headers={}):
C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\httplib.pyc in _send_output(self, message_body)
891 msg += message_body
892 message_body = None
--> 893 self.send(msg)
894 if message_body is not None:
895 #message_body was not a string (i.e. it is a file) and
C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\httplib.pyc in send(self, data)
853 if self.sock is None:
854 if self.auto_open:
--> 855 self.connect()
856 else:
857 raise NotConnected()
C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\httplib.pyc in connect(self)
1272
1273 self.sock = self._context.wrap_socket(self.sock,
-> 1274 server_hostname=server_hostname)
1275
1276 __all__.append("HTTPSConnection")
C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\ssl.pyc in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
350 suppress_ragged_eofs=suppress_ragged_eofs,
351 server_hostname=server_hostname,
--> 352 _context=self)
353
354 def set_npn_protocols(self, npn_protocols):
C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\ssl.pyc in __init__(self, sock, keyfile, certfile, server_side, cert_reqs, ssl_version, ca_certs, do_handshake_on_connect, family, type, proto, fileno, suppress_ragged_eofs, npn_protocols, ciphers, server_hostname, _context)
577 # non-blocking
578 raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
--> 579 self.do_handshake()
580
581 except (OSError, ValueError):
C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\ssl.pyc in do_handshake(self, block)
806 if timeout == 0.0 and block:
807 self.settimeout(None)
--> 808 self._sslobj.do_handshake()
809 finally:
810 self.settimeout(timeout)
IOError: [Errno socket error] EOF occurred in violation of protocol (_ssl.c:590)
Update 1.
As suggested in the comments, I tried the code below, but I still face the same issue :(
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager
import ssl

class MyAdapter(HTTPAdapter):
    def init_poolmanager(self, connections, maxsize, block=False):
        self.poolmanager = PoolManager(num_pools=connections,
                                       maxsize=maxsize,
                                       block=block,
                                       ssl_version=ssl.PROTOCOL_TLSv1)

import requests

s = requests.Session()
s.mount('https://', MyAdapter())

from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager

class SSLAdapter(HTTPAdapter):
    '''An HTTPS Transport Adapter that uses an arbitrary SSL version.'''

    def __init__(self, ssl_version=None, **kwargs):
        self.ssl_version = ssl_version
        super(SSLAdapter, self).__init__(**kwargs)

    def init_poolmanager(self, connections, maxsize, block=False):
        self.poolmanager = PoolManager(num_pools=connections,
                                       maxsize=maxsize,
                                       block=block,
                                       ssl_version=self.ssl_version)
from bs4 import BeautifulSoup
import urllib
r = urllib.urlopen('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d').read()
soup = BeautifulSoup(r)
print type(soup)
print soup.prettify()
So the problem arises from using the Enthought Canopy distribution of Python. In most Python builds, urllib does not check or verify SSL certificates; the Canopy build apparently wants to verify the certificate, and I have not found documentation on how it implements that.
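For what it's worth, the urlopen signature in your traceback (urlopen(url, data, proxies, context)) shows that this 2.7.9+ build accepts an SSL context, so one quick diagnostic is to pass an unverified context and see whether the handshake succeeds. This is a sketch for testing only, since it disables certificate checks entirely:
import ssl
import urllib
# Diagnostic only: an unverified context skips certificate verification.
ctx = ssl._create_unverified_context()
r = urllib.urlopen('https://www.google.com/search?q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d', context=ctx).read()
If that works, certificate verification is the likely culprit. One more observation about your update: the session s you mounted the adapter on is never actually used, so the final urllib.urlopen call bypasses both adapters; fetching with r = s.get(url) instead would at least exercise them.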
Also, you will see in the code below that I added the html.parser argument to the BeautifulSoup call. It would have worked the way you had it, but the defaults changed in BeautifulSoup 4, and the best practice now is to pass in which parser you want to use.
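For reference, this is all the change amounts to. Without the second argument, recent BeautifulSoup 4 releases emit a warning and fall back to the best parser installed, which can vary from machine to machine:
from bs4 import BeautifulSoup
html = '<p>hello</p>'
soup = BeautifulSoup(html)                  # works, but warns and is machine-dependent
soup = BeautifulSoup(html, 'html.parser')   # explicit: always the stdlib parser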
Here is a working version of your code that can fetch the HTTPS Google News page you are after. Note that it requests https://www.google.com/search?q=... instead of your original https://www.google.com/#q=... form: everything after the # is a fragment handled client-side and never sent to the server, so the original URL would not have returned search results in any case:
from bs4 import BeautifulSoup
import requests
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
#r = requests.get('http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts', headers=headers)
r = requests.get('https://www.google.com/search?q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d', headers=headers)
soup = BeautifulSoup(r.text, "html.parser")
print type(soup)
print soup.prettify()
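From there, if you want the result links rather than the whole prettified page, a minimal sketch follows; the /url?q= filter is illustrative only, since Google's result markup changes frequently:
# Illustrative only: Google has historically wrapped outbound result links
# in /url?q=... redirects in its non-JavaScript HTML.
for a in soup.find_all('a', href=True):
    if '/url?q=' in a['href']:
        print a['href']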