使用 python 脚本从 informer.com 抓取和下载文件
Crawl and download files from informer.com using python script
出于研究目的,我需要构建一组良性程序。首先,我需要从 http://downloads.informer.com 获取这些程序。为此,我编写了一个 python 脚本来迭代每个下载页面并将下载链接提取到列表中。之后脚本使用这些链接下载程序(这些程序是 exe、msi 或 zip 文件)。不幸的是,在这一步,脚本遇到错误指出(AttributeError:'Request' object has no attribute 'decode')。
以下是在单个页面上运行并检索单个程序的脚本(为简单起见):
# Fetch a program's download page from informer.com and download the file.
# The site blocks the default Python user agent, so a browser-like UA is sent.
import wget
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import urllib.request

my_url = 'http://sweet-home-3d.informer.com/download'

# Browser-like User-Agent: without it the site refuses to serve the page.
req = urllib.request.Request(
    my_url,
    data=None,
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
    }
)
uClient = uReq(req)
try:
    page_html = uClient.read()
finally:
    # Close the HTTP response to avoid leaking the connection.
    uClient.close()
page_soup = soup(page_html, 'lxml')

# All download buttons are <a class="download_button"> anchors.
cont01 = page_soup.findAll('a', {'class': 'download_button'})
conts = cont01[1]
ref = conts['href']

# BUG FIX: wget.download() expects a plain URL *string*. Wrapping the link in
# a urllib.request.Request object made wget's internal urlparse call fail with
# AttributeError: 'Request' object has no attribute 'decode'. wget cannot send
# custom headers anyway, so pass the extracted href directly.
wget.download(ref)
我得到的错误如下:
AttributeError Traceback (most recent call last)
<ipython-input-1-93c4caaa1777> in <module>()
31 }
32 )
---> 33 wget.download(addr)
C:\Users\bander\Anaconda3\lib\site-packages\wget.py in download(url, out, bar)
503
504 # get filename for temp file in current directory
--> 505 prefix = detect_filename(url, out)
506 (fd, tmpfile) = tempfile.mkstemp(".tmp", prefix=prefix, dir=".")
507 os.close(fd)
C:\Users\bander\Anaconda3\lib\site-packages\wget.py in detect_filename(url, out, headers, default)
482 names["out"] = out or ''
483 if url:
--> 484 names["url"] = filename_from_url(url) or ''
485 if headers:
486 names["headers"] = filename_from_headers(headers) or ''
C:\Users\bander\Anaconda3\lib\site-packages\wget.py in filename_from_url(url)
228 """:return: detected filename as unicode or None"""
229 # [ ] test urlparse behavior with unicode url
--> 230 fname = os.path.basename(urlparse.urlparse(url).path)
231 if len(fname.strip(" \n\t.")) == 0:
232 return None
C:\Users\bander\Anaconda3\lib\urllib\parse.py in urlparse(url, scheme, allow_fragments)
292 Note that we don't break the components up in smaller bits
293 (e.g. netloc is a single string) and we don't expand % escapes."""
--> 294 url, scheme, _coerce_result = _coerce_args(url, scheme)
295 splitresult = urlsplit(url, scheme, allow_fragments)
296 scheme, netloc, url, query, fragment = splitresult
C:\Users\bander\Anaconda3\lib\urllib\parse.py in _coerce_args(*args)
112 if str_input:
113 return args + (_noop,)
--> 114 return _decode_args(args) + (_encode_result,)
115
116 # Result objects are more helpful than simple tuples
C:\Users\bander\Anaconda3\lib\urllib\parse.py in _decode_args(args, encoding, errors)
96 def _decode_args(args, encoding=_implicit_encoding,
97 errors=_implicit_errors):
---> 98 return tuple(x.decode(encoding, errors) if x else '' for x in args)
99
100 def _coerce_args(*args):
C:\Users\bander\Anaconda3\lib\urllib\parse.py in <genexpr>(.0)
96 def _decode_args(args, encoding=_implicit_encoding,
97 errors=_implicit_errors):
---> 98 return tuple(x.decode(encoding, errors) if x else '' for x in args)
99
100 def _coerce_args(*args):
AttributeError: 'Request' object has no attribute 'decode'
如果有人能帮我解决这个问题,我将不胜感激。
预先感谢。
Wget 在使用正确 URL 直接调用时给出 HTTP 错误 503:服务暂时不可用。我猜它在服务器端被阻止了。下载链接由 JavaScript 生成,您可以使用 Selenium,它会执行 JavaScript 以获得真正的下载 URL。我尝试将 Selenium 与 PhantomJS 结合使用,但没有成功;改用 Chrome 就可以了。
首先安装 Selenium:
sudo pip3 install selenium
然后获取驱动程序 https://sites.google.com/a/chromium.org/chromedriver/downloads 并将其放在您的路径中。如果(与我不同)您在 Windows 或 Mac 上,则可以使用 Chrome 的无头版本 "Chrome Canary"。
# Drive a real Chrome instance so the JavaScript that builds the download
# link actually runs (plain HTTP clients get blocked with a 503).
from selenium import webdriver
from time import sleep

url = 'http://sweet-home-3d.informer.com/download'

# Requires chromedriver to be available on PATH.
browser = webdriver.Chrome()
browser.get(url)

# Clicking the download button starts the file download in the browser.
browser.find_element_by_class_name("download_btn").click()

# Give the download plenty of time to finish; how long you need
# depends on your internet connection.
sleep(360)
browser.quit()
文件将下载到您的“下载”文件夹中。如果浏览器退出得太快,您只会得到一个带有额外扩展名 .crdownload 的不完整文件。如果发生这种情况,请增大传递给 sleep 的秒数。
实际上您不需要为此使用 Selenium。这是一个cookie问题。我相信你也可以用 urllib 以某种方式做 cookie,但这不是我的专业领域。
如果您要完成这项工作 - 没有浏览器和 wget - 在请求中,您可以像这样获取文件:
# Download the file with requests alone — no browser, no wget.
# The trick is that the download requires a session cookie.
import requests
from bs4 import BeautifulSoup as bs

# A browser-like user agent is required or the site won't serve the data.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3181.0 Safari/537.36"
}
url = 'http://sweet-home-3d.informer.com/download/'

# A cookie is needed to download, so keep one persistent session throughout.
s = requests.Session()
r = s.get(url, headers=headers)
soup = bs(r.text, "html.parser")

# Every download option lives inside a div with class "table".
links_table = soup.find('div', {'class': 'table'})
file_name = links_table.find('div', {'class': 'table-cell file_name'})['title']
download_link = links_table.find('a', {'class': 'download_button'})['href']

# The landing page itself never sets the required cookie, but the sub-pages
# do — so visit one of them first, before requesting download_link.
cookie_link = links_table.a['href']
r = s.get(cookie_link, headers=headers)

# With the cookie set, the actual download succeeds; save it to disk.
r = s.get(download_link, headers=headers)
with open(file_name, 'wb') as f:
    f.write(r.content)
出于研究目的,我需要构建一组良性程序。首先,我需要从 http://downloads.informer.com 获取这些程序。为此,我编写了一个 python 脚本来迭代每个下载页面并将下载链接提取到列表中。之后脚本使用这些链接下载程序(这些程序是 exe、msi 或 zip 文件)。不幸的是,在这一步,脚本遇到错误指出(AttributeError:'Request' object has no attribute 'decode')。
以下是在单个页面上运行并检索单个程序的脚本(为简单起见):
# Fetch a program's download page from informer.com and download the file.
# The site blocks the default Python user agent, so a browser-like UA is sent.
import wget
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import urllib.request

my_url = 'http://sweet-home-3d.informer.com/download'

# Browser-like User-Agent: without it the site refuses to serve the page.
req = urllib.request.Request(
    my_url,
    data=None,
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
    }
)
uClient = uReq(req)
try:
    page_html = uClient.read()
finally:
    # Close the HTTP response to avoid leaking the connection.
    uClient.close()
page_soup = soup(page_html, 'lxml')

# All download buttons are <a class="download_button"> anchors.
cont01 = page_soup.findAll('a', {'class': 'download_button'})
conts = cont01[1]
ref = conts['href']

# BUG FIX: wget.download() expects a plain URL *string*. Wrapping the link in
# a urllib.request.Request object made wget's internal urlparse call fail with
# AttributeError: 'Request' object has no attribute 'decode'. wget cannot send
# custom headers anyway, so pass the extracted href directly.
wget.download(ref)
我得到的错误如下:
AttributeError Traceback (most recent call last)
<ipython-input-1-93c4caaa1777> in <module>()
31 }
32 )
---> 33 wget.download(addr)
C:\Users\bander\Anaconda3\lib\site-packages\wget.py in download(url, out, bar)
503
504 # get filename for temp file in current directory
--> 505 prefix = detect_filename(url, out)
506 (fd, tmpfile) = tempfile.mkstemp(".tmp", prefix=prefix, dir=".")
507 os.close(fd)
C:\Users\bander\Anaconda3\lib\site-packages\wget.py in detect_filename(url, out, headers, default)
482 names["out"] = out or ''
483 if url:
--> 484 names["url"] = filename_from_url(url) or ''
485 if headers:
486 names["headers"] = filename_from_headers(headers) or ''
C:\Users\bander\Anaconda3\lib\site-packages\wget.py in filename_from_url(url)
228 """:return: detected filename as unicode or None"""
229 # [ ] test urlparse behavior with unicode url
--> 230 fname = os.path.basename(urlparse.urlparse(url).path)
231 if len(fname.strip(" \n\t.")) == 0:
232 return None
C:\Users\bander\Anaconda3\lib\urllib\parse.py in urlparse(url, scheme, allow_fragments)
292 Note that we don't break the components up in smaller bits
293 (e.g. netloc is a single string) and we don't expand % escapes."""
--> 294 url, scheme, _coerce_result = _coerce_args(url, scheme)
295 splitresult = urlsplit(url, scheme, allow_fragments)
296 scheme, netloc, url, query, fragment = splitresult
C:\Users\bander\Anaconda3\lib\urllib\parse.py in _coerce_args(*args)
112 if str_input:
113 return args + (_noop,)
--> 114 return _decode_args(args) + (_encode_result,)
115
116 # Result objects are more helpful than simple tuples
C:\Users\bander\Anaconda3\lib\urllib\parse.py in _decode_args(args, encoding, errors)
96 def _decode_args(args, encoding=_implicit_encoding,
97 errors=_implicit_errors):
---> 98 return tuple(x.decode(encoding, errors) if x else '' for x in args)
99
100 def _coerce_args(*args):
C:\Users\bander\Anaconda3\lib\urllib\parse.py in <genexpr>(.0)
96 def _decode_args(args, encoding=_implicit_encoding,
97 errors=_implicit_errors):
---> 98 return tuple(x.decode(encoding, errors) if x else '' for x in args)
99
100 def _coerce_args(*args):
AttributeError: 'Request' object has no attribute 'decode'
如果有人能帮我解决这个问题,我将不胜感激。 预先感谢。
Wget 在使用正确 URL 直接调用时给出 HTTP 错误 503:服务暂时不可用。我猜它在服务器端被阻止了。下载链接由 JavaScript 生成,您可以使用 Selenium,它会执行 JavaScript 以获得真正的下载 URL。我尝试将 Selenium 与 PhantomJS 结合使用,但没有成功;改用 Chrome 就可以了。
首先安装 Selenium:
sudo pip3 install selenium
然后获取驱动程序 https://sites.google.com/a/chromium.org/chromedriver/downloads 并将其放在您的路径中。如果(与我不同)您在 Windows 或 Mac 上,则可以使用 Chrome 的无头版本 "Chrome Canary"。
from selenium import webdriver
# Drive a real Chrome instance so the JavaScript that builds the download
# link actually runs (plain HTTP clients appear to be blocked with a 503).
# NOTE(review): depends on `from selenium import webdriver`, which appears
# fused into the prose line just above this snippet.
from time import sleep
url = 'http://sweet-home-3d.informer.com/download'
# Requires chromedriver to be available on PATH.
browser = webdriver.Chrome()
browser.get(url)
# Clicking the download button starts the file download in the browser.
browser.find_element_by_class_name("download_btn").click()
sleep(360) # give it plenty of time to download; this will depend on your internet connection
browser.quit()
文件将下载到您的“下载”文件夹中。如果浏览器退出得太快,您只会得到一个带有额外扩展名 .crdownload 的不完整文件。如果发生这种情况,请增大传递给 sleep 的秒数。
实际上您不需要为此使用 Selenium。这是一个cookie问题。我相信你也可以用 urllib 以某种方式做 cookie,但这不是我的专业领域。
如果您要完成这项工作 - 没有浏览器和 wget - 在请求中,您可以像这样获取文件:
# Download the file with requests alone — no browser, no wget.
# The trick is that the download requires a session cookie.
import requests
from bs4 import BeautifulSoup as bs

# A browser-like user agent is required or the site won't serve the data.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3181.0 Safari/537.36"
}
url = 'http://sweet-home-3d.informer.com/download/'

# A cookie is needed to download, so keep one persistent session throughout.
s = requests.Session()
r = s.get(url, headers=headers)
soup = bs(r.text, "html.parser")

# Every download option lives inside a div with class "table".
links_table = soup.find('div', {'class': 'table'})
file_name = links_table.find('div', {'class': 'table-cell file_name'})['title']
download_link = links_table.find('a', {'class': 'download_button'})['href']

# The landing page itself never sets the required cookie, but the sub-pages
# do — so visit one of them first, before requesting download_link.
cookie_link = links_table.a['href']
r = s.get(cookie_link, headers=headers)

# With the cookie set, the actual download succeeds; save it to disk.
r = s.get(download_link, headers=headers)
with open(file_name, 'wb') as f:
    f.write(r.content)