Downloading xls/csv files using urlretrieve from Python stops
I'm trying to download a bunch of xls files from this ASPX site and its folders, using urlretrieve from the urllib.request module in Python 3.7. First, I build a txt file with the URLs from the site. Then, I loop over that list and ask the server to retrieve each xls file, following this solution here.
The algorithm starts downloading the xls files into the working directory, but after 3 or 4 iterations it breaks. The downloaded files (3 or 4 of them) have the wrong size (e.g., they are all 7351 KB instead of 99 KB or 83 KB). Surprisingly, that is the size of the last URL in the txt file.
Sometimes the log shows a message with a 500 error.
My hypotheses/questions about this last issue are:
The error is raised because a firewall blocks repeated calls to the server
Maybe the calls are breaking some asynchronous/synchronous rule I don't know about. I used time.sleep to prevent the error, but it failed.
The first issue is really weird, and it is chained to the second one.
Here is my code:
import os
import time
from random import randint
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from urllib.request import urlopen, urlretrieve, quote

url = "http://informacioninteligente10.xm.com.co/transacciones/Paginas/HistoricoTransacciones.aspx"
u = urlopen(url)
try:
    html = u.read().decode('utf-8')
finally:
    u.close()

direcciones = []  # to be populated with urls
soup = BeautifulSoup(html)
for link in soup.select('div[webpartid] a'):
    href = link.get('href')
    if href.startswith('javascript:'):
        continue
    filename = href.rsplit('/', 1)[-1]
    href = urljoin(url, quote(href))
    #try:
    #    urlretrieve(href, filename)
    #except:
    #    print('Downloading Error')
    if any(href.endswith(x) for x in ['.xls', '.xlsx', '.csv']):
        direcciones.append(href)

# "\n" adds a new line
direcciones = '\n'.join(direcciones)

# Save every element in a txt file
with open("file.txt", "w") as output:
    output.write(direcciones)

DOWNLOADS_DIR = os.getcwd()

# For every line in the file
for url in open("file.txt"):
    time.sleep(randint(0, 5))
    # Split on the rightmost / and take everything on the right side of that
    name = url.rsplit('/', 1)[-1]
    # Combine the name and the downloads directory to get the local filename
    filename = os.path.join(DOWNLOADS_DIR, name)
    filename = filename[:-1]  # Strip the trailing whitespace at the end
    # Download the file if it does not exist
    if not os.path.isfile(filename):
        urlretrieve(href, filename)
Am I not using the right URL parser?
Any ideas? Thanks!
The site has anti-bot protection; you need to set a browser User-Agent instead of the default Python user agent:
......
import urllib.request
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0')]
urllib.request.install_opener(opener)
url=....
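A note on why install_opener is used here rather than per-request headers: urlretrieve does not accept a headers argument, so replacing the module-level opener is the practical way to make it send a browser User-Agent, and once installed it also applies to plain urlopen calls. A minimal self-contained check (reusing the page URL from the question) might look like this:

import urllib.request

# Install a browser-like User-Agent for all module-level urlopen/urlretrieve calls
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0')]
urllib.request.install_opener(opener)

# If the site accepts the browser User-Agent, this should return HTTP 200
page = urllib.request.urlopen("http://informacioninteligente10.xm.com.co/transacciones/Paginas/HistoricoTransacciones.aspx")
print(page.getcode())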
You also have to replace href with url in:
if not os.path.isfile(filename):
    urlretrieve(href, filename)  # must be: url
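Putting both fixes together (browser User-Agent plus using url instead of href), a minimal sketch of the corrected download loop could look like the following. It reuses file.txt and the variable names from the question; swapping filename[:-1] for str.strip() is just a small cleanup, not something the fix requires:

import os
import urllib.request
from urllib.request import urlretrieve

# Install a browser-like User-Agent once; every urlretrieve call below will use it
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0')]
urllib.request.install_opener(opener)

DOWNLOADS_DIR = os.getcwd()

# For every URL saved in file.txt, download the file if it is not already present
with open("file.txt") as urls:
    for url in urls:
        url = url.strip()  # drop the trailing newline from each line
        name = url.rsplit('/', 1)[-1]
        filename = os.path.join(DOWNLOADS_DIR, name)
        if not os.path.isfile(filename):
            urlretrieve(url, filename)  # url, not href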