Binary files corrupted during segmented download

So I'm trying to write a very simple Internet Download Manager rip-off in Python 2.7.

It should query the file's HTTP headers, get the byte range, spread the download of that range across a number of threads (hard-coded to 2 for simplicity), and then join the file parts back together again.
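(For context, the whole trick hinges on the HTTP Range header; a stripped-down single-range fetch looks roughly like this, with a placeholder URL:)

import urllib2

# ask the server for the first 500 bytes only (ranges are inclusive)
req = urllib2.Request("http://example.com/file.bin")  # placeholder URL
req.headers['Range'] = 'bytes=0-499'
resp = urllib2.urlopen(req)
print(resp.getcode())     # 206 Partial Content if the server honours ranges
first_part = resp.read()  # at most 500 bytes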

The script manages to download CSV and text files easily without losing file integrity; the MD5 checksums stay identical (I verify them with a plain hashlib comparison, shown after the example below).

The problem is with slightly more complex files such as bin, zip, video, and music files: for some reason they lose integrity. I think the order of the bytes is getting scrambled.

Example:

  1. mp3 downloaded through Chrome

  2. mp3 downloaded through my script with 2 threads
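I compare the two copies with a straight MD5 check, something like this sketch (the file names stand in for the two copies above):

import hashlib

def md5sum(path):
    # hash in chunks so large files aren't read into memory at once
    h = hashlib.md5()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(8192), b''):
            h.update(chunk)
    return h.hexdigest()

print(md5sum("chrome.mp3"))  # copy downloaded through Chrome
print(md5sum("test.mp3"))    # copy produced by my script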

Python source:

from __future__ import print_function

import threading
import urllib
import urllib2

import time

threads = []
parts = {}

# url to open
url = "http://www.sample-videos.com/audio/mp3/india-national-anthem.mp3"
u = urllib.urlopen(url)

# define file
file_name = "test.mp3"
f = open(file_name, 'wb')


# open url and get header info
def get_file_size(url):
    stream_size =  u.info()['Content-Length']
    file_size = stream_size
    return file_size

start = 0
#get stream size
end = get_file_size(url)
# specify block size
block_sz = 512

#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread1():
    full_stream_size = end
    first_thread = {'start':0, 'end':(int(full_stream_size)/2)}
    print(first_thread)
    return first_thread

#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread2():
    full_stream_size = end
    second_thread= {'start':int(full_stream_size)/2,'end': int(full_stream_size)}
    print(second_thread)
    return second_thread



# download function
def download_thread(url ,id,start,end):
    current_size = int(float(start)/1024)
    total_size = int(float(end)/1024)
    print ("Start at_"+str(current_size) + "Ends at_" + str(total_size))

    # specify request range and init stream
    req = urllib2.Request(url)
    req.headers['Range'] = 'bytes=%s-%s' % (start, end)

    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        start += len(buffer)
        f.write(buffer)
        thread_id = id
        status =  "Thread ID_" +str(thread_id) + "Downloaded_" + str(int(start/1024)) + "Total_" +str(total_size)
        print (status)

#starts 2 threads
def start_threads():
    for i in range(2):
        #if first loop, start thread 1
        if(i==0):
            start = calculate_no_of_bytes_for_thread1().get('start')
            end = calculate_no_of_bytes_for_thread1().get('end')
            print("Thread 1 started")
            t = threading.Thread(target=download_thread, args=(url,i,start,end))
            t.start()
            threads.append( t)
        #if second loop, start thread 2
        if(i==1):
            start = calculate_no_of_bytes_for_thread2().get('start')
            end = calculate_no_of_bytes_for_thread2().get('end')
            print("Thread 2 started")
            t = threading.Thread(target=download_thread, args=(url,i,start,end))
            t.start()
            threads.append( t)

    # Join threads back (order doesn't matter, you just want them all)
    for i in threads:
       i.join()

    # Sort parts and you're done
    # result = ''
    # for i in range(2):
    #     result += parts[i*block_sz]

#start benchmarking
start_time = time.clock()

start_threads()

print ("Finito!")

end_time = time.clock()
benchmark = str(end_time - start_time)
print ("Download took_" +benchmark)

f.close()

So after some insight from Mark (thanks!) I got the script working, and it downloads the file perfectly. I learned that every byte matters! (HTTP ranges are inclusive at both ends, so bytes=0-1000000 is 1,000,001 bytes; each later part has to start one byte past the previous end.) Here is the working code:

import urllib
import urllib2
import threading
import time

f = open("newfile.zip", "wb")
url = "http://greenbookhymns.s3.amazonaws.com/245to257.zip"
parts = {}
threads = []

#gets file size
d = urllib.urlopen(url)
file_size = d.info()['Content-Length']
print ("File Size = " + str(file_size))

#number of full 1 MB parts (Python 2 integer division)
thread_no = int(file_size) / 1000000

#urllib2 range download function
def download(thread_no, start_point, end_point):
    req = urllib2.Request(url)
    req.headers['Range'] = 'bytes=%s-%s' % (start_point, end_point)
    resp = urllib2.urlopen(req)  # don't shadow the output file f
    parts[thread_no] = resp.read()

#launch threads targeting download function
def thread_launcher(thread_no):
    for i in range(thread_no):
        if i == 0:
            t = threading.Thread(target=download, args=(i, 0, 1000000,))
            t.start()
            threads.append(t)
            print "iteration 0 starting_point 0 ending_point 1000000"
        else:
            start_point = (i * 1000000) + 1
            end_point = (i * 1000000) + 1000000
            t = threading.Thread(target=download, args=(i, start_point, end_point,))
            t.start()
            threads.append(t)
            print "iteration " + str(i) + " starting_point " + str(start_point) + " end_point " + str(end_point)

    # the final, usually shorter, part gets one extra thread
    # (assumes file_size is not an exact multiple of 1000000)
    last_file_part_start_point = (thread_no * 1000000) + 1
    remaining_bytes = int(file_size) - (thread_no * 1000000)
    print str(remaining_bytes)
    last_file_part_end_point = (thread_no * 1000000) + remaining_bytes
    print "iteration " + str(thread_no) + " starting_point " + str(last_file_part_start_point) + " end_point " + str(last_file_part_end_point)
    t = threading.Thread(target=download, args=(thread_no, last_file_part_start_point, last_file_part_end_point,))
    t.start()
    threads.append(t)

thread_launcher(thread_no)

# Join threads back (order doesn't matter, you just want them all)
for i in threads:
    i.join()

# Sort parts and you're done
result = ''
for i in range(thread_no+1):
    result += parts[i]

f.write(result)

f.close()

exit()

Your download thread function writes data to the file f as it receives it. You have two threads running in parallel, so the data they receive gets interleaved into the file in whatever order the threads happen to run. (Note also that the ranged request req is created but never opened: both threads read from the same global stream u, so the Range header never takes effect.)
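If you want to keep the threads writing straight to disk instead of buffering parts in memory, one option (a sketch with made-up names, not the only approach) is to preallocate the output file and give each thread its own handle that seeks to its start offset before writing:

import urllib2

# preallocate the output file once, before starting any threads, e.g.:
#   with open(path, 'wb') as fh:
#       fh.truncate(file_size)

def download_part(url, path, start, end):
    # fetch only this byte range (both endpoints are inclusive)
    req = urllib2.Request(url)
    req.headers['Range'] = 'bytes=%d-%d' % (start, end)
    resp = urllib2.urlopen(req)
    # a private file handle per thread, so seek/write positions can't race
    out = open(path, 'r+b')
    out.seek(start)
    while True:
        chunk = resp.read(8192)
        if not chunk:
            break
        out.write(chunk)
    out.close()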