如何使用霍夫曼压缩解压缩文件?
How do I decompress files using Huffman Compression?
我已经在 python 中编写了一个 Huffman 压缩算法,但到目前为止我只设法完成了压缩部分(通过使用 heapq 创建优先级队列)。我创建了一个名为 HuffmanCoding 的 class,它接收要压缩的文本。我知道解压缩的一种方法是将包含字符及其代码的字典保存到压缩文本文件中,但我不确定我应该如何执行此操作(特别是因为我的压缩文本存储在二进制文件中)。有没有人对我如何减压有任何指示或建议?下面是我的压缩代码。
class HeapNode:
def __init__(self, char, freq):
self.char = char
self.freq = freq
self.left = None
self.right = None
def __lt__(self, other): # if the frequency of one character is lower than the frequency of another one
return self.freq < other.freq
def __eq__(self, other): # if two characters have the same frequencies
if other == None:
return False
if not isinstance(other, HeapNode): # checks if the character is a node or not
return False
return self.freq == other.freq
class HuffmanCoding:
def __init__(self, text_to_compress):
self.text_to_compress = text_to_compress # text that will be compressed
self.heap = []
self.codes = {} # will store the Huffman code of each character
def get_frequency(self): # method to find frequency of each character in text - RLE
frequency_Dictionary = {} # creates an empty dictionary where frequency of each character will be stored
for character in self.text_to_compress: # Iterates through the text to be compressed
if character in frequency_Dictionary:
frequency_Dictionary[character] = frequency_Dictionary[character] + 1 # if character already exists in
# dictionary, its value is increased by 1
else:
frequency_Dictionary[character] = 1 # if character is not present in list, its value is set to 1
return frequency_Dictionary
def make_queue(self, frequency): # creates the priority queue of each character and its associated frequency
for key in frequency:
node = HeapNode(key, frequency[key]) # create node (character) and store its frequency alongside it
heapq.heappush(self.heap, node) # Push the node into the heap
def merge_nodes(
self): # creates HuffmanTree by getting the two minimum nodes and merging them together, until theres
# only one node left
while len(self.heap) > 1:
node1 = heapq.heappop(self.heap) # pop node from top of heap
node2 = heapq.heappop(self.heap) # pop next node which is now at the top of heap
merged = HeapNode(None, node1.freq + node2.freq) # merge the two nodes we popped out from heap
merged.left = node1
merged.right = node2
heapq.heappush(self.heap, merged) # push merged node back into the heap
def make_codes(self, root, current_code): # Creates Huffman code for each character
if root == None:
return
if root.char != None:
self.codes[root.char] = current_code
self.make_codes(root.left, current_code + "0") # Every time you traverse left, add a 0 - Recursive Call
self.make_codes(root.right, current_code + "1") # Every time you traverse right, add a 1 - Recursive Call
def assignCodes(self): # Assigns codes to each character
root = heapq.heappop(self.heap) # extracts root node from heap
current_code = ""
self.make_codes(root, current_code)
def get_compressed_text(self, text): # Replaces characters in original text with codes
compressed_text = ""
for character in text:
compressed_text += self.codes[character]
return compressed_text
def pad_encoded_text(self, compressed_text):
extra_padding = 8 - len(compressed_text) % 8 # works out how much extra padding is required
for i in range(extra_padding):
compressed_text += "0" # adds the amount of 0's that are required
padded_info = "{0:08b}".format(extra_padding)
compressed_text = padded_info + compressed_text
return compressed_text
def make_byte_array(self, padded_text):
byte_array = bytearray()
for i in range(0, len(padded_text), 8):
byte_array.append(int(padded_text[i:i + 8], 2))
return(byte_array)
def show_compressed_text(self):
frequency = self.get_frequency()
self.make_queue(frequency)
self.merge_nodes()
self.assignCodes()
encoded_text = self.get_compressed_text(self.text_to_compress)
padded_encoded_text = self.pad_encoded_text(encoded_text)
byte_array = self.make_byte_array(padded_encoded_text)
return bytes(byte_array)
def remove_padding(self, padded_encoded_text): # removes the padding that was added
padded_info = padded_encoded_text[:8]
extra_padding = int(padded_info, 2)
padded_encoded_text = padded_encoded_text[8:]
encoded_text = padded_encoded_text[:-1 * extra_padding]
return encoded_text```
首先,您需要将霍夫曼代码的描述与数据一起发送。其次你需要在解压端使用它来解码霍夫曼码。
最直接的方法是遍历树,为遇到的每个节点发送一个 1 位,然后为每个叶子发送一个 0 位和符号位。在减压端,您可以阅读并重建树。然后你可以使用这棵树,用每一位数据向左或向右遍历,直到你到达一片叶子。发出那片叶子的符号,然后从树的根开始返回下一位。
在压缩库中通常这样做的方式是丢弃树,而是只保留和发送每个符号的代码长度,同时对其进行压缩。然后在压缩和解压缩端,根据长度构造相同的规范霍夫曼码。然后 table 查找可用于解码,以提高速度。
我已经在 python 中编写了一个 Huffman 压缩算法,但到目前为止我只设法完成了压缩部分(通过使用 heapq 创建优先级队列)。我创建了一个名为 HuffmanCoding 的 class,它接收要压缩的文本。我知道解压缩的一种方法是将包含字符及其代码的字典保存到压缩文本文件中,但我不确定我应该如何执行此操作(特别是因为我的压缩文本存储在二进制文件中)。有没有人对我如何减压有任何指示或建议?下面是我的压缩代码。
class HeapNode:
def __init__(self, char, freq):
self.char = char
self.freq = freq
self.left = None
self.right = None
def __lt__(self, other): # if the frequency of one character is lower than the frequency of another one
return self.freq < other.freq
def __eq__(self, other): # if two characters have the same frequencies
if other == None:
return False
if not isinstance(other, HeapNode): # checks if the character is a node or not
return False
return self.freq == other.freq
class HuffmanCoding:
def __init__(self, text_to_compress):
self.text_to_compress = text_to_compress # text that will be compressed
self.heap = []
self.codes = {} # will store the Huffman code of each character
def get_frequency(self): # method to find frequency of each character in text - RLE
frequency_Dictionary = {} # creates an empty dictionary where frequency of each character will be stored
for character in self.text_to_compress: # Iterates through the text to be compressed
if character in frequency_Dictionary:
frequency_Dictionary[character] = frequency_Dictionary[character] + 1 # if character already exists in
# dictionary, its value is increased by 1
else:
frequency_Dictionary[character] = 1 # if character is not present in list, its value is set to 1
return frequency_Dictionary
def make_queue(self, frequency): # creates the priority queue of each character and its associated frequency
for key in frequency:
node = HeapNode(key, frequency[key]) # create node (character) and store its frequency alongside it
heapq.heappush(self.heap, node) # Push the node into the heap
def merge_nodes(
self): # creates HuffmanTree by getting the two minimum nodes and merging them together, until theres
# only one node left
while len(self.heap) > 1:
node1 = heapq.heappop(self.heap) # pop node from top of heap
node2 = heapq.heappop(self.heap) # pop next node which is now at the top of heap
merged = HeapNode(None, node1.freq + node2.freq) # merge the two nodes we popped out from heap
merged.left = node1
merged.right = node2
heapq.heappush(self.heap, merged) # push merged node back into the heap
def make_codes(self, root, current_code): # Creates Huffman code for each character
if root == None:
return
if root.char != None:
self.codes[root.char] = current_code
self.make_codes(root.left, current_code + "0") # Every time you traverse left, add a 0 - Recursive Call
self.make_codes(root.right, current_code + "1") # Every time you traverse right, add a 1 - Recursive Call
def assignCodes(self): # Assigns codes to each character
root = heapq.heappop(self.heap) # extracts root node from heap
current_code = ""
self.make_codes(root, current_code)
def get_compressed_text(self, text): # Replaces characters in original text with codes
compressed_text = ""
for character in text:
compressed_text += self.codes[character]
return compressed_text
def pad_encoded_text(self, compressed_text):
extra_padding = 8 - len(compressed_text) % 8 # works out how much extra padding is required
for i in range(extra_padding):
compressed_text += "0" # adds the amount of 0's that are required
padded_info = "{0:08b}".format(extra_padding)
compressed_text = padded_info + compressed_text
return compressed_text
def make_byte_array(self, padded_text):
byte_array = bytearray()
for i in range(0, len(padded_text), 8):
byte_array.append(int(padded_text[i:i + 8], 2))
return(byte_array)
def show_compressed_text(self):
frequency = self.get_frequency()
self.make_queue(frequency)
self.merge_nodes()
self.assignCodes()
encoded_text = self.get_compressed_text(self.text_to_compress)
padded_encoded_text = self.pad_encoded_text(encoded_text)
byte_array = self.make_byte_array(padded_encoded_text)
return bytes(byte_array)
def remove_padding(self, padded_encoded_text): # removes the padding that was added
padded_info = padded_encoded_text[:8]
extra_padding = int(padded_info, 2)
padded_encoded_text = padded_encoded_text[8:]
encoded_text = padded_encoded_text[:-1 * extra_padding]
return encoded_text```
首先,您需要将霍夫曼代码的描述与数据一起发送。其次你需要在解压端使用它来解码霍夫曼码。
最直接的方法是遍历树,为遇到的每个节点发送一个 1 位,然后为每个叶子发送一个 0 位和符号位。在减压端,您可以阅读并重建树。然后你可以使用这棵树,用每一位数据向左或向右遍历,直到你到达一片叶子。发出那片叶子的符号,然后从树的根开始返回下一位。
在压缩库中通常这样做的方式是丢弃树,而是只保留和发送每个符号的代码长度,同时对其进行压缩。然后在压缩和解压缩端,根据长度构造相同的规范霍夫曼码。然后 table 查找可用于解码,以提高速度。