Python Requests Stream Via Tor - Connection Dies
I am using the Python requests library with multiprocessing to download files at 'onionurl' from Tor hidden services; there are a large number of files to fetch, which is the reasoning behind the code.

However, the downloads die after a minute or two. Because the failure happens mid-stream, no error is raised; the script just prints 'closing text file' as if it had finished. This makes it impossible to pull down the files hosted on these onion servers, which run to hundreds of GB each.

Any help in solving this would be greatly appreciated.
# snippet from a larger script: 'onionurl', 'foldername', and 'dataloc'
# are set by the surrounding code
import os
import requests

session = requests.session()
session.proxies = {}
session.proxies['http'] = 'socks5h://localhost:9050'
session.proxies['https'] = 'socks5h://localhost:9050'
url = onionurl
try:
    if not os.path.isdir(foldername):
        os.makedirs(foldername)
    # download the body of the response in chunks, not all at once
    with session.get(url, stream=True, verify=False, timeout=1000000) as response:
        # get the total file size
        file_size = int(response.headers.get("Content-Length", 0))
        print(file_size)
        # get the file name
        filename = dataloc
        with open(filename, "wb") as text_file:
            for chunk in response.iter_content(chunk_size=1024):
                text_file.write(chunk)
        if file_size > 1000000:
            filesizemb = file_size / 1000000
        else:
            filesizemb = 1
    print("closing text file")
except Exception as e:
    # the exception handler was cut off in the original post
    print(e)
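Since the stream ends without raising an error (as described above), one way to at least detect the problem is to count the bytes actually written and compare them against the advertised Content-Length. A minimal sketch of that check, assuming a session configured as above (the helper name is illustrative, not from the original code):

import requests

def download_and_check(session, url, filename):
    # stream the body and count the bytes actually written
    with session.get(url, stream=True, verify=False, timeout=600) as response:
        expected = int(response.headers.get("Content-Length", 0))
        written = 0
        with open(filename, "wb") as out_file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                out_file.write(chunk)
                written += len(chunk)
    # if the circuit died mid-stream, written falls short of expected
    if expected and written < expected:
        raise IOError(f"truncated download: got {written} of {expected} bytes")
    return written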
I managed to work around it by simply accepting that the connection will die, and writing a new function that resumes the download at the exact byte offset where it stopped. The theory behind this is explained in this question - How to resume file download in Python?

My code (warning, it's messy):
import logging
import os
from pathlib import Path

import requests

# Timeout is a watchdog helper defined elsewhere in the full script;
# it is not part of the requests library.

def onionrequestthreadeddataleakdownloadresume(onionurl, resume_byte_pos):
    print("rerunning")
    companyname = onionurl[0]
    onionurl = onionurl[1]
    dataloc = '/media/archangel/Elements/clop/dataleaks/'
    foldername = dataloc
    dataloc = dataloc + companyname + "/"
    try:
        if not os.path.isdir(dataloc):
            os.mkdir(dataloc)
    except Exception as e:
        print(e)
        print("folder not created")
    filename = os.path.basename(onionurl)
    filenamebasename = filename
    dataloc = dataloc + filename
    try:
        session = requests.session()
        session.proxies = {}
        session.proxies['http'] = 'socks5h://localhost:9050'
        session.proxies['https'] = 'socks5h://localhost:9050'
        print("dataloc")
        print(dataloc)
        print("onionurl")
        print(onionurl)
        url = onionurl
        try:
            print("url")
            print(url)
            if not os.path.isdir(foldername):
                os.makedirs(foldername)
            # download the body of the response in chunks, not all at once
            try:
                try:
                    seconds = 20
                    timeout = Timeout(seconds)
                    timeout.start()
                except Exception as ex:
                    print(ex)
                # ask the server to send the file from the current offset;
                # compression is disabled so byte counts match the file on disk
                resume_header = {'Accept-Encoding': None, 'Range': 'bytes=%d-' % resume_byte_pos}
                try:
                    with session.get(url, stream=True, verify=False, headers=resume_header, timeout=600) as response:
                        # size of the remaining body
                        file_size = int(response.headers['Content-Length'])
                        if file_size > 1000000:
                            filesizemb = file_size / 1000000
                        else:
                            filesizemb = 1
                        print(file_size)
                        filename = dataloc
                        try:
                            # append to the partial file instead of overwriting it
                            with open(filename, "ab") as text_file:
                                for chunk in response.iter_content(chunk_size=1024 * 1024):
                                    if chunk:
                                        text_file.write(chunk)
                                        text_file.flush()
                        except Exception as ex:
                            logging.error(f'write failed with error: {ex}')
                            print(ex)
                    print("exited with for file")
                except Exception as ex:
                    logging.error(f'Request failed with error: {ex}')
                    print(ex)
            except Exception as ex:
                logging.error(f'Attempt failed with error: {ex}')
                print(ex)
            print("closing text file")
        except Exception as e:
            print("FAILED DOWNLOAD 2")
            print(e)
    except Exception as e:
        print("FAILED DOWNLOAD 5")
        print(e)
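# One caveat before the next function: resuming with a Range header only
# works if the onion service actually honors byte-range requests. A quick
# preflight check (the helper name is mine; note that an absent
# Accept-Ranges header is not conclusive, but "none" means resuming fails):
def supports_resume(session, url):
    # servers that honor byte ranges usually advertise "Accept-Ranges: bytes"
    head = session.head(url, verify=False, timeout=60)
    return head.headers.get("Accept-Ranges", "").lower() == "bytes"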
def onionrequestthreadeddataleakdownload2(onionurl):
    companyname = onionurl[0]
    onionurl = onionurl[1]
    dataloc = '/media/archangel/Elements/clop/dataleaks/'
    foldername = dataloc
    dataloc = dataloc + companyname + "/"
    try:
        if not os.path.isdir(dataloc):
            os.mkdir(dataloc)
    except Exception as e:
        print(e)
        print("folder not created")
    filename = os.path.basename(onionurl)
    filenamebasename = filename
    dataloc = dataloc + filename
    try:
        session = requests.session()
        session.proxies = {}
        session.proxies['http'] = 'socks5h://localhost:9050'
        session.proxies['https'] = 'socks5h://localhost:9050'
        print("dataloc")
        print(dataloc)
        print("onionurl")
        print(onionurl)
        url = onionurl
        try:
            print("url")
            print(url)
            if not os.path.isdir(foldername):
                os.makedirs(foldername)
            # download the body of the response in chunks, not all at once
            try:
                try:
                    seconds = 20
                    timeout = Timeout(seconds)
                    timeout.start()
                except Exception as ex:
                    print(ex)
                # disable compression so Content-Length matches the bytes on disk
                headersac = {'Accept-Encoding': None}
                try:
                    with session.get(url, stream=True, verify=False, headers=headersac, timeout=600) as response:
                        # get the total file size
                        file_size = int(response.headers['Content-Length'])
                        if file_size > 1000000:
                            filesizemb = file_size / 1000000
                        else:
                            filesizemb = 1
                        print(file_size)
                        # get the file name
                        filename = dataloc
                        try:
                            with open(filename, "wb") as text_file:
                                for chunk in response.iter_content(chunk_size=1024 * 1024):
                                    if chunk:
                                        text_file.write(chunk)
                                        text_file.flush()
                        except Exception as ex:
                            logging.error(f'write failed with error: {ex}')
                            print(ex)
                except Exception as ex:
                    logging.error(f'request failed with error: {ex}')
                    print(ex)
                print("exited with for file")
                # keep resuming until the file on disk matches Content-Length
                file_size_offline = Path(filename).stat().st_size
                print("file size offline")
                while file_size_offline != file_size:
                    try:
                        print(file_size_offline)
                        print(file_size)
                        print("file size incomplete")
                        file_size_offline = Path(filename).stat().st_size
                        onionurllist = [companyname, onionurl]
                        onionrequestthreadeddataleakdownloadresume(onionurllist, file_size_offline)
                        file_size_offline = Path(filename).stat().st_size
                    except Exception as ex:
                        print("redownload failed")
                        print(ex)
                print("LOOP FINISHED")
                print(file_size)
                print(file_size_offline)
                print(filename)
            except Exception as ex:
                logging.error(f'Attempt failed with error: {ex}')
                print(ex)
            # belt-and-braces retries, kept from the original
            if file_size_offline != file_size:
                while file_size_offline != file_size:
                    try:
                        print(file_size_offline)
                        print(file_size)
                        print("file size incomplete")
                        file_size_offline = Path(filename).stat().st_size
                        onionurllist = [companyname, onionurl]
                        onionrequestthreadeddataleakdownloadresume(onionurllist, file_size_offline)
                        file_size_offline = Path(filename).stat().st_size
                    except Exception as ex:
                        print("redownload failed")
                        print(ex)
                else:
                    # list composed of dataleaklocation (location on the external
                    # drive), filename (basename after the last slash),
                    # dataleakurl (onion url), contentsize (in MB)
                    returnedlist = [dataloc, filenamebasename, url, filesizemb]
                    return returnedlist
            if file_size_offline != file_size:
                print("rerunning a final FINAL time")
                while file_size_offline != file_size:
                    try:
                        print(file_size_offline)
                        print(file_size)
                        print("file size incomplete")
                        file_size_offline = Path(filename).stat().st_size
                        onionurllist = [companyname, onionurl]
                        onionrequestthreadeddataleakdownloadresume(onionurllist, file_size_offline)
                        file_size_offline = Path(filename).stat().st_size
                    except Exception as ex:
                        print("redownload failed")
                        print(ex)
                else:
                    returnedlist = [dataloc, filenamebasename, url, filesizemb]
                    return returnedlist
            returnedlist = [dataloc, filenamebasename, url, filesizemb]
            return returnedlist
        except Exception as e:
            print("FAILED DOWNLOAD 2")
            print(e)
    except Exception as e:
        print("FAILED DOWNLOAD 5")
        print(e)
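For anyone hitting the same issue: the scheme above condenses to a loop that downloads, compares the on-disk size against Content-Length, and re-requests from the current offset with a Range header until the two match. Below is a minimal, self-contained sketch of that loop, not a drop-in replacement for the functions above; the function name and retry cap are my own.

import os
import requests

def resilient_download(url, filename, max_retries=50):
    session = requests.session()
    session.proxies = {
        'http': 'socks5h://localhost:9050',
        'https': 'socks5h://localhost:9050',
    }
    # first request tells us how large the file should be
    with session.get(url, stream=True, verify=False, timeout=600) as response:
        file_size = int(response.headers.get("Content-Length", 0))
        with open(filename, "wb") as out_file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                out_file.write(chunk)
    # keep resuming from the current offset until the file is complete
    for _ in range(max_retries):
        offset = os.path.getsize(filename)
        if not file_size or offset >= file_size:
            break  # complete, or size unknown so completeness cannot be checked
        resume_header = {'Accept-Encoding': None, 'Range': 'bytes=%d-' % offset}
        try:
            with session.get(url, stream=True, verify=False,
                             headers=resume_header, timeout=600) as response:
                with open(filename, "ab") as out_file:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        out_file.write(chunk)
        except requests.RequestException as ex:
            print("resume attempt failed:", ex)
    return filename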