Python Requests Stream Via Tor - Connection Dies
I am using the Python requests library with multiprocessing to download files at 'onionurl' from Tor hidden services; there are a large number of files to fetch, which is the reasoning behind the code.

However, the downloads die after a minute or two. Because the failure happens mid-stream, no error is raised; the script just prints 'closing text file' as if it had finished. This makes it impossible to pull down the files hosted on these onion servers, which run to hundreds of GB each.

Any help in solving this would be greatly appreciated.
# snippet from a larger script: 'onionurl', 'foldername', and 'dataloc'
# are set by the surrounding code
import os
import requests

session = requests.session()
session.proxies = {}
session.proxies['http'] = 'socks5h://localhost:9050'
session.proxies['https'] = 'socks5h://localhost:9050'
url = onionurl
try:
    if not os.path.isdir(foldername):
        os.makedirs(foldername)
    # download the body of the response in chunks, not all at once
    with session.get(url, stream=True, verify=False, timeout=1000000) as response:
        # get the total file size
        file_size = int(response.headers.get("Content-Length", 0))
        print(file_size)
        # get the file name
        filename = dataloc
        with open(filename, "wb") as text_file:
            for chunk in response.iter_content(chunk_size=1024):
                text_file.write(chunk)
        if file_size > 1000000:
            filesizemb = file_size / 1000000
        else:
            filesizemb = 1
    print("closing text file")
except Exception as e:
    # the exception handler was cut off in the original post
    print(e)
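Since the stream ends without raising an error (as described above), one way to at least detect the problem is to count the bytes actually written and compare them against the advertised Content-Length. A minimal sketch of that check, assuming a session configured as above (the helper name is illustrative, not from the original code):

import requests

def download_and_check(session, url, filename):
    # stream the body and count the bytes actually written
    with session.get(url, stream=True, verify=False, timeout=600) as response:
        expected = int(response.headers.get("Content-Length", 0))
        written = 0
        with open(filename, "wb") as out_file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                out_file.write(chunk)
                written += len(chunk)
    # if the circuit died mid-stream, written falls short of expected
    if expected and written < expected:
        raise IOError(f"truncated download: got {written} of {expected} bytes")
    return written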
I managed to work around it by simply accepting that the connection will die, and writing a new function that resumes the download at the exact byte offset where it stopped. The theory behind this is explained in this question - How to resume file download in Python?

My code (warning, it's messy):
import logging
import os
from pathlib import Path

import requests

# Timeout is a watchdog helper defined elsewhere in the full script;
# it is not part of the requests library.

def onionrequestthreadeddataleakdownloadresume(onionurl, resume_byte_pos):
    print("rerunning")
    companyname = onionurl[0]
    onionurl = onionurl[1]
    dataloc = '/media/archangel/Elements/clop/dataleaks/'
    foldername = dataloc
    dataloc = dataloc + companyname + "/"
    try:
        if not os.path.isdir(dataloc):
            os.mkdir(dataloc)
    except Exception as e:
        print(e)
        print("folder not created")
    filename = os.path.basename(onionurl)
    filenamebasename = filename
    dataloc = dataloc + filename
    try:
        session = requests.session()
        session.proxies = {}
        session.proxies['http'] = 'socks5h://localhost:9050'
        session.proxies['https'] = 'socks5h://localhost:9050'
        print("dataloc")
        print(dataloc)
        print("onionurl")
        print(onionurl)
        url = onionurl
        try:
            print("url")
            print(url)
            if not os.path.isdir(foldername):
                os.makedirs(foldername)
            # download the body of the response in chunks, not all at once
            try:
                try:
                    seconds = 20
                    timeout = Timeout(seconds)
                    timeout.start()
                except Exception as ex:
                    print(ex)
                # ask the server to send the file from the current offset;
                # compression is disabled so byte counts match the file on disk
                resume_header = {'Accept-Encoding': None, 'Range': 'bytes=%d-' % resume_byte_pos}
                try:
                    with session.get(url, stream=True, verify=False, headers=resume_header, timeout=600) as response:
                        # size of the remaining body
                        file_size = int(response.headers['Content-Length'])
                        if file_size > 1000000:
                            filesizemb = file_size / 1000000
                        else:
                            filesizemb = 1
                        print(file_size)
                        filename = dataloc
                        try:
                            # append to the partial file instead of overwriting it
                            with open(filename, "ab") as text_file:
                                for chunk in response.iter_content(chunk_size=1024 * 1024):
                                    if chunk:
                                        text_file.write(chunk)
                                        text_file.flush()
                        except Exception as ex:
                            logging.error(f'write failed with error: {ex}')
                            print(ex)
                    print("exited with for file")
                except Exception as ex:
                    logging.error(f'Request failed with error: {ex}')
                    print(ex)
            except Exception as ex:
                logging.error(f'Attempt failed with error: {ex}')
                print(ex)
            print("closing text file")
        except Exception as e:
            print("FAILED DOWNLOAD 2")
            print(e)
    except Exception as e:
        print("FAILED DOWNLOAD 5")
        print(e)
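# One caveat before the next function: resuming with a Range header only
# works if the onion service actually honors byte-range requests. A quick
# preflight check (the helper name is mine; note that an absent
# Accept-Ranges header is not conclusive, but "none" means resuming fails):
def supports_resume(session, url):
    # servers that honor byte ranges usually advertise "Accept-Ranges: bytes"
    head = session.head(url, verify=False, timeout=60)
    return head.headers.get("Accept-Ranges", "").lower() == "bytes"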
def onionrequestthreadeddataleakdownload2(onionurl):
    companyname = onionurl[0]
    onionurl = onionurl[1]
    dataloc = '/media/archangel/Elements/clop/dataleaks/'
    foldername = dataloc
    dataloc = dataloc + companyname + "/"
    try:
        if not os.path.isdir(dataloc):
            os.mkdir(dataloc)
    except Exception as e:
        print(e)
        print("folder not created")
    filename = os.path.basename(onionurl)
    filenamebasename = filename
    dataloc = dataloc + filename
    try:
        session = requests.session()
        session.proxies = {}
        session.proxies['http'] = 'socks5h://localhost:9050'
        session.proxies['https'] = 'socks5h://localhost:9050'
        print("dataloc")
        print(dataloc)
        print("onionurl")
        print(onionurl)
        url = onionurl
        try:
            print("url")
            print(url)
            if not os.path.isdir(foldername):
                os.makedirs(foldername)
            # download the body of the response in chunks, not all at once
            try:
                try:
                    seconds = 20
                    timeout = Timeout(seconds)
                    timeout.start()
                except Exception as ex:
                    print(ex)
                # disable compression so Content-Length matches the bytes on disk
                headersac = {'Accept-Encoding': None}
                try:
                    with session.get(url, stream=True, verify=False, headers=headersac, timeout=600) as response:
                        # get the total file size
                        file_size = int(response.headers['Content-Length'])
                        if file_size > 1000000:
                            filesizemb = file_size / 1000000
                        else:
                            filesizemb = 1
                        print(file_size)
                        # get the file name
                        filename = dataloc
                        try:
                            with open(filename, "wb") as text_file:
                                for chunk in response.iter_content(chunk_size=1024 * 1024):
                                    if chunk:
                                        text_file.write(chunk)
                                        text_file.flush()
                        except Exception as ex:
                            logging.error(f'write failed with error: {ex}')
                            print(ex)
                except Exception as ex:
                    logging.error(f'request failed with error: {ex}')
                    print(ex)
                print("exited with for file")
                # keep resuming until the file on disk matches Content-Length
                file_size_offline = Path(filename).stat().st_size
                print("file size offline")
                while file_size_offline != file_size:
                    try:
                        print(file_size_offline)
                        print(file_size)
                        print("file size incomplete")
                        file_size_offline = Path(filename).stat().st_size
                        onionurllist = [companyname, onionurl]
                        onionrequestthreadeddataleakdownloadresume(onionurllist, file_size_offline)
                        file_size_offline = Path(filename).stat().st_size
                    except Exception as ex:
                        print("redownload failed")
                        print(ex)
                print("LOOP FINISHED")
                print(file_size)
                print(file_size_offline)
                print(filename)
            except Exception as ex:
                logging.error(f'Attempt failed with error: {ex}')
                print(ex)
            # belt-and-braces retries, kept from the original
            if file_size_offline != file_size:
                while file_size_offline != file_size:
                    try:
                        print(file_size_offline)
                        print(file_size)
                        print("file size incomplete")
                        file_size_offline = Path(filename).stat().st_size
                        onionurllist = [companyname, onionurl]
                        onionrequestthreadeddataleakdownloadresume(onionurllist, file_size_offline)
                        file_size_offline = Path(filename).stat().st_size
                    except Exception as ex:
                        print("redownload failed")
                        print(ex)
                else:
                    # list composed of dataleaklocation (location on the external
                    # drive), filename (basename after the last slash),
                    # dataleakurl (onion url), contentsize (in MB)
                    returnedlist = [dataloc, filenamebasename, url, filesizemb]
                    return returnedlist
            if file_size_offline != file_size:
                print("rerunning a final FINAL time")
                while file_size_offline != file_size:
                    try:
                        print(file_size_offline)
                        print(file_size)
                        print("file size incomplete")
                        file_size_offline = Path(filename).stat().st_size
                        onionurllist = [companyname, onionurl]
                        onionrequestthreadeddataleakdownloadresume(onionurllist, file_size_offline)
                        file_size_offline = Path(filename).stat().st_size
                    except Exception as ex:
                        print("redownload failed")
                        print(ex)
                else:
                    returnedlist = [dataloc, filenamebasename, url, filesizemb]
                    return returnedlist
            returnedlist = [dataloc, filenamebasename, url, filesizemb]
            return returnedlist
        except Exception as e:
            print("FAILED DOWNLOAD 2")
            print(e)
    except Exception as e:
        print("FAILED DOWNLOAD 5")
        print(e)
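For anyone hitting the same issue: the scheme above condenses to a loop that downloads, compares the on-disk size against Content-Length, and re-requests from the current offset with a Range header until the two match. Below is a minimal, self-contained sketch of that loop, not a drop-in replacement for the functions above; the function name and retry cap are my own.

import os
import requests

def resilient_download(url, filename, max_retries=50):
    session = requests.session()
    session.proxies = {
        'http': 'socks5h://localhost:9050',
        'https': 'socks5h://localhost:9050',
    }
    # first request tells us how large the file should be
    with session.get(url, stream=True, verify=False, timeout=600) as response:
        file_size = int(response.headers.get("Content-Length", 0))
        with open(filename, "wb") as out_file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                out_file.write(chunk)
    # keep resuming from the current offset until the file is complete
    for _ in range(max_retries):
        offset = os.path.getsize(filename)
        if not file_size or offset >= file_size:
            break  # complete, or size unknown so completeness cannot be checked
        resume_header = {'Accept-Encoding': None, 'Range': 'bytes=%d-' % offset}
        try:
            with session.get(url, stream=True, verify=False,
                             headers=resume_header, timeout=600) as response:
                with open(filename, "ab") as out_file:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        out_file.write(chunk)
        except requests.RequestException as ex:
            print("resume attempt failed:", ex)
    return filename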