Python 3 使用 FancyURLopener 抓取 urllib 抛出找不到文件
Python 3 urllib scraping with FancyURLopener throws cannot find file
我正在尝试在 python 抓取工具中实施代理。
但是,我似乎无法按照我看到的教程中的建议使用 urlopen() 中的参数代理(可能是版本问题?!)
proxy = {'http' : 'http://example:8080' }
req = urllib.request.Request(Site,headers=hdr, proxies=proxy)
resp = urllib.request.urlopen(req).read()
所以我尝试从 documentation 请求中变得聪明,建议创建一个开场白。但是,这没有 headers 的参数。并提出这样的建议 opener.addheaders = []
我尝试过的方法都没有用。(代理 IP 的测试打印正在运行)
以下星座在我看来是最佳实践,但抛出 "cannot find file error"。不确定为什么。
如果你能告诉我如何将代理与完整的 header 集一起使用,那就太好了。
代码:
import bs4 as bs
import urllib.request
import ssl
import re
from pprint import pprint ## for printing out a readable dict. can be deleted afterwards
#########################################################
## Parsing with beautiful soup
#########################################################
ssl._create_default_https_context = ssl._create_unverified_context
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'none',
'Accept-Language': 'en-US,en;q=0.8',
'Connection': 'keep-alive'}
Site = 'https://example.com'
proxy = {'http' : 'http://example:8080' }
def openPage(Site, hdr):
## IP check
print('Actual IP', urllib.request.urlopen('http://httpbin.org/ip').read())
req = urllib.request.Request(Site,headers=hdr)
opener = urllib.request.FancyURLopener(proxy)
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
## IP check
print('Fake IP', opener.open('http://httpbin.org/ip').read())
resp = opener.open(req).read()
## soup = bs.BeautifulSoup(resp,'lxml')
## return(soup)
soup = openPage(Site,hdr)....
错误:
Traceback (most recent call last): File "C:\Program Files\Python36\lib\urllib\request.py", line 1990, in open_local_file
stats = os.stat(localname) FileNotFoundError: [WinError 2] The system cannot find the file specified: 'urllib.request.Request object at 0x000001D94816A908'
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "C:/Projects/Python/Programms/WebScraper/scraper.py", line 72, in <module>
mainNav() File "C:/Projects/Python/Programms/WebScraper/scraper.py", line 40, in mainNav
soup = openPage(Site,hdr,ean) File "C:/Projects/Python/Programms/WebScraper/scraper.py", line 32, in openPage
resp = opener.open(req).read() File "C:\Program Files\Python36\lib\urllib\request.py", line 1762, in open
return getattr(self, name)(url) File "C:\Program Files\Python36\lib\urllib\request.py", line 1981, in open_file
return self.open_local_file(url) File "C:\Program Files\Python36\lib\urllib\request.py", line 1992, in open_local_file
raise URLError(e.strerror, e.filename) urllib.error.URLError: <urlopen error The system cannot find the file specified>
以下代码已成功。我已经从 fancyURLopener 更改为使用之前定义的代理函数代理安装我自己的开启器。 header 是后来加的
def openPage(site, hdr, proxy):
## Create opener
proxy_support = urllib.request.ProxyHandler(proxy)
opener = urllib.request.build_opener(proxy_support)##proxy_support
urllib.request.install_opener(opener)
opener.addheaders = hdr
我正在尝试在 python 抓取工具中实施代理。
但是,我似乎无法按照我看到的教程中的建议使用 urlopen() 中的参数代理(可能是版本问题?!)
proxy = {'http' : 'http://example:8080' }
req = urllib.request.Request(Site,headers=hdr, proxies=proxy)
resp = urllib.request.urlopen(req).read()
所以我尝试从 documentation 请求中变得聪明,建议创建一个开场白。但是,这没有 headers 的参数。并提出这样的建议 opener.addheaders = []
我尝试过的方法都没有用。(代理 IP 的测试打印正在运行)
以下星座在我看来是最佳实践,但抛出 "cannot find file error"。不确定为什么。
如果你能告诉我如何将代理与完整的 header 集一起使用,那就太好了。
代码:
import bs4 as bs
import urllib.request
import ssl
import re
from pprint import pprint ## for printing out a readable dict. can be deleted afterwards
#########################################################
## Parsing with beautiful soup
#########################################################
ssl._create_default_https_context = ssl._create_unverified_context
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'none',
'Accept-Language': 'en-US,en;q=0.8',
'Connection': 'keep-alive'}
Site = 'https://example.com'
proxy = {'http' : 'http://example:8080' }
def openPage(Site, hdr):
## IP check
print('Actual IP', urllib.request.urlopen('http://httpbin.org/ip').read())
req = urllib.request.Request(Site,headers=hdr)
opener = urllib.request.FancyURLopener(proxy)
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
## IP check
print('Fake IP', opener.open('http://httpbin.org/ip').read())
resp = opener.open(req).read()
## soup = bs.BeautifulSoup(resp,'lxml')
## return(soup)
soup = openPage(Site,hdr)....
错误:
Traceback (most recent call last): File "C:\Program Files\Python36\lib\urllib\request.py", line 1990, in open_local_file
stats = os.stat(localname) FileNotFoundError: [WinError 2] The system cannot find the file specified: 'urllib.request.Request object at 0x000001D94816A908'
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "C:/Projects/Python/Programms/WebScraper/scraper.py", line 72, in <module>
mainNav() File "C:/Projects/Python/Programms/WebScraper/scraper.py", line 40, in mainNav
soup = openPage(Site,hdr,ean) File "C:/Projects/Python/Programms/WebScraper/scraper.py", line 32, in openPage
resp = opener.open(req).read() File "C:\Program Files\Python36\lib\urllib\request.py", line 1762, in open
return getattr(self, name)(url) File "C:\Program Files\Python36\lib\urllib\request.py", line 1981, in open_file
return self.open_local_file(url) File "C:\Program Files\Python36\lib\urllib\request.py", line 1992, in open_local_file
raise URLError(e.strerror, e.filename) urllib.error.URLError: <urlopen error The system cannot find the file specified>
以下代码已成功。我已经从 fancyURLopener 更改为使用之前定义的代理函数代理安装我自己的开启器。 header 是后来加的
def openPage(site, hdr, proxy):
## Create opener
proxy_support = urllib.request.ProxyHandler(proxy)
opener = urllib.request.build_opener(proxy_support)##proxy_support
urllib.request.install_opener(opener)
opener.addheaders = hdr