beautifulsoup 尝试从 google 新闻页面获取信息时出错：enthought canopy

Question

我有以下代码（来自该页面）。它运行良好，可以打印页面内容。但是，当我把 r 改为 Google 新闻页面（即下面被注释掉的那一行）时，我收到了错误消息（IOError Traceback (most recent call last)）。为什么？如何将 BeautifulSoup 与 Google 新闻页面一起使用？

运行良好的代码：

from bs4 import BeautifulSoup
import urllib
# Fetch the page over plain HTTP and keep the raw response body.
r = urllib.urlopen('http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts').read()
#r = urllib.urlopen('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d').read()
# Parse the body; no parser is named here, so BeautifulSoup chooses one itself.
soup = BeautifulSoup(r)
print type(soup)

# Print the parsed document.
print soup.prettify()

产生错误的代码：

from bs4 import BeautifulSoup
import urllib
# Same fetch as the working snippet, but against an HTTPS Google URL. The
# traceback that follows shows this urlopen call failing inside the SSL
# handshake ("EOF occurred in violation of protocol").
r = urllib.urlopen('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d').read()
soup = BeautifulSoup(r)
print type(soup)

print soup.prettify()


---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
c:\users\abc\appdata\local\temp\tmpvxie2e.py in <module>()
      2 import urllib
      3 r = urllib.urlopen('http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts').read()
----> 4 r = urllib.urlopen('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d').read()
      5 soup = BeautifulSoup(r)
      6 print type(soup)

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\urllib.pyc in urlopen(url, data, proxies, context)
     85         opener = _urlopener
     86     if data is None:
---> 87         return opener.open(url)
     88     else:
     89         return opener.open(url, data)

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\urllib.pyc in open(self, fullurl, data)
    211         try:
    212             if data is None:
--> 213                 return getattr(self, name)(url)
    214             else:
    215                 return getattr(self, name)(url, data)

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\urllib.pyc in open_https(self, url, data)
    441             if realhost: h.putheader('Host', realhost)
    442             for args in self.addheaders: h.putheader(*args)
--> 443             h.endheaders(data)
    444             errcode, errmsg, headers = h.getreply()
    445             fp = h.getfile()

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\httplib.pyc in endheaders(self, message_body)
   1047         else:
   1048             raise CannotSendHeader()
-> 1049         self._send_output(message_body)
   1050 
   1051     def request(self, method, url, body=None, headers={}):

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\httplib.pyc in _send_output(self, message_body)
    891             msg += message_body
    892             message_body = None
--> 893         self.send(msg)
    894         if message_body is not None:
    895             #message_body was not a string (i.e. it is a file) and

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\httplib.pyc in send(self, data)
    853         if self.sock is None:
    854             if self.auto_open:
--> 855                 self.connect()
    856             else:
    857                 raise NotConnected()

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\httplib.pyc in connect(self)
   1272 
   1273             self.sock = self._context.wrap_socket(self.sock,
-> 1274                                                   server_hostname=server_hostname)
   1275 
   1276     __all__.append("HTTPSConnection")

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\ssl.pyc in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
    350                          suppress_ragged_eofs=suppress_ragged_eofs,
    351                          server_hostname=server_hostname,
--> 352                          _context=self)
    353 
    354     def set_npn_protocols(self, npn_protocols):

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\ssl.pyc in __init__(self, sock, keyfile, certfile, server_side, cert_reqs, ssl_version, ca_certs, do_handshake_on_connect, family, type, proto, fileno, suppress_ragged_eofs, npn_protocols, ciphers, server_hostname, _context)
    577                         # non-blocking
    578                         raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
--> 579                     self.do_handshake()
    580 
    581             except (OSError, ValueError):

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\ssl.pyc in do_handshake(self, block)
    806             if timeout == 0.0 and block:
    807                 self.settimeout(None)
--> 808             self._sslobj.do_handshake()
    809         finally:
    810             self.settimeout(timeout)

IOError: [Errno socket error] EOF occurred in violation of protocol (_ssl.c:590)

更新1.

按照评论中的建议，我尝试了下面的代码，但仍然面临同样的问题:(

from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager
import ssl

class MyAdapter(HTTPAdapter):
    """HTTPAdapter whose pool manager is created with TLSv1 as the SSL version."""

    def init_poolmanager(self, connections, maxsize, block=False):
        # Build the PoolManager with ssl_version pinned to ssl.PROTOCOL_TLSv1;
        # the other arguments are passed through unchanged.
        self.poolmanager = PoolManager(num_pools=connections,
                                       maxsize=maxsize,
                                       block=block,
                                       ssl_version=ssl.PROTOCOL_TLSv1)


import requests
# Create a session and register MyAdapter for URLs beginning with 'https://'.
# NOTE(review): `s` is not used again in this snippet -- the request further
# down is still made with urllib.urlopen, not with this session.
s = requests.Session()
s.mount('https://', MyAdapter())

from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager

class SSLAdapter(HTTPAdapter):
    '''An HTTPS Transport Adapter that uses an arbitrary SSL version.

    The ``ssl_version`` given to the constructor is stored on the instance
    and handed to the ``PoolManager`` built in ``init_poolmanager``.
    '''
    def __init__(self, ssl_version=None, **kwargs):
        # Stored before calling the base constructor.
        # NOTE(review): presumably the base __init__ calls init_poolmanager,
        # which reads self.ssl_version -- confirm against requests' source.
        self.ssl_version = ssl_version

        super(SSLAdapter, self).__init__(**kwargs)

    def init_poolmanager(self, connections, maxsize, block=False):
        # Same construction as the base arguments, plus the stored ssl_version.
        self.poolmanager = PoolManager(num_pools=connections,
                                       maxsize=maxsize,
                                       block=block,
                                       ssl_version=self.ssl_version)

from bs4 import BeautifulSoup
import urllib
# NOTE(review): this is the same urllib.urlopen call as in the failing
# snippet. Neither the session `s` nor the adapter classes defined above are
# used here, so the ssl_version configured on them does not affect this request.
r = urllib.urlopen('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d').read()
soup = BeautifulSoup(r)
print type(soup)

print soup.prettify()

Answer 1

问题的出现是由于使用了 Enthought Canopy 发行版的 Python。在大多数 Python 版本中，urllib 不会检查或验证 SSL 证书；而在 Canopy 版本中，它似乎会去检查 SSL 证书。我没有找到说明这一行为是如何实现的文档。

此外，您会在下面的代码中看到，我在 BeautifulSoup 调用中添加了 html.parser 参数。按照您原来的写法也能工作，但 BeautifulSoup4 的行为已有变化，最佳做法是显式传入您想要使用的解析器。

以下是您的代码的可用版本，它能够通过 HTTPS（SSL）获取您想要的 Google 新闻页面：

from bs4 import BeautifulSoup
import requests

# Send a browser-like User-Agent header with the request.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}

#r = requests.get('http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts', headers=headers)
# Uses requests instead of urllib, and the URL path is '/search?q=...' where
# the question's code used '/#q=...'.
r = requests.get('https://www.google.com/search?q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d', headers=headers)

# Name the parser explicitly ("html.parser") rather than letting BeautifulSoup pick one.
soup = BeautifulSoup(r.text, "html.parser")
print type(soup)

print soup.prettify()

beautifulsoup 尝试从 google 新闻页面获取信息时出错：enthought canopy

beautifulsoup error while trying to get information from google news page: enthought canopy

python

beautifulsoup

enthought

web-scraping

更新1.