如何使用霍夫曼压缩解压缩文件?

How do I decompress files using Huffman Compression?

我已经在 python 中编写了一个 Huffman 压缩算法,但到目前为止我只设法完成了压缩部分(通过使用 heapq 创建优先级队列)。我创建了一个名为 HuffmanCoding 的 class,它接收要压缩的文本。我知道解压缩的一种方法是将包含字符及其代码的字典保存到压缩文本文件中,但我不确定我应该如何执行此操作(特别是因为我的压缩文本存储在二进制文件中)。有没有人对我如何减压有任何指示或建议?下面是我的压缩代码。



class HeapNode:
    def __init__(self, char, freq):
        self.char = char
        self.freq = freq
        self.left = None
        self.right = None

    def __lt__(self, other):  # if the frequency of one character is lower than the frequency of another one
        return self.freq < other.freq

    def __eq__(self, other):  # if two characters have the same frequencies
        if other == None:
            return False
        if not isinstance(other, HeapNode):  # checks if the character is a node or not
            return False
        return self.freq == other.freq


class HuffmanCoding:
    def __init__(self, text_to_compress):
        self.text_to_compress = text_to_compress  # text that will be compressed
        self.heap = []
        self.codes = {}  # will store the Huffman code of each character
        

    def get_frequency(self):  # method to find frequency of each character in text - RLE
        frequency_Dictionary = {}  # creates an empty dictionary where frequency of each character will be stored

        for character in self.text_to_compress:  # Iterates through the text to be compressed
            if character in frequency_Dictionary:
                frequency_Dictionary[character] = frequency_Dictionary[character] + 1  # if character already exists in
                # dictionary, its value is increased by 1
            else:
                frequency_Dictionary[character] = 1  # if character is not present in list, its value is set to 1

        return frequency_Dictionary

    def make_queue(self, frequency):  # creates the priority queue of each character and its associated frequency
        for key in frequency:
            node = HeapNode(key, frequency[key])  # create node (character) and store its frequency alongside it
            heapq.heappush(self.heap, node)  # Push the node into the heap

    def merge_nodes(
            self):  # creates HuffmanTree by getting the two minimum nodes and merging them together, until theres
        # only one node left
        while len(self.heap) > 1:
            node1 = heapq.heappop(self.heap)  # pop node from top of heap
            node2 = heapq.heappop(self.heap)  # pop next node which is now at the top of heap

            merged = HeapNode(None, node1.freq + node2.freq)  # merge the two nodes we popped out from heap
            merged.left = node1
            merged.right = node2

            heapq.heappush(self.heap, merged)  # push merged node back into the heap

    def make_codes(self, root, current_code):  # Creates Huffman code for each character
        if root == None:
            return

        if root.char != None:
            self.codes[root.char] = current_code
            

        self.make_codes(root.left, current_code + "0")  # Every time you traverse left, add a 0 - Recursive Call
        self.make_codes(root.right, current_code + "1")  # Every time you traverse right, add a 1 - Recursive Call

    def assignCodes(self):  # Assigns codes to each character
        root = heapq.heappop(self.heap)  # extracts root node from heap
        current_code = ""
        self.make_codes(root, current_code)

    def get_compressed_text(self, text):  # Replaces characters in original text with codes
        compressed_text = ""
        for character in text:
            compressed_text += self.codes[character]
        return compressed_text

    def pad_encoded_text(self, compressed_text):
        extra_padding = 8 - len(compressed_text) % 8  # works out how much extra padding is required
        for i in range(extra_padding):
            compressed_text += "0"  # adds the amount of 0's that are required

        padded_info = "{0:08b}".format(extra_padding)
        compressed_text = padded_info + compressed_text
        return compressed_text

    def make_byte_array(self, padded_text):

        byte_array = bytearray()
        for i in range(0, len(padded_text), 8):
            byte_array.append(int(padded_text[i:i + 8], 2))

        return(byte_array)

    def show_compressed_text(self):

        frequency = self.get_frequency()
        self.make_queue(frequency)
        self.merge_nodes()
        self.assignCodes()

        encoded_text = self.get_compressed_text(self.text_to_compress)
        padded_encoded_text = self.pad_encoded_text(encoded_text)

        byte_array = self.make_byte_array(padded_encoded_text)
        return bytes(byte_array)

    def remove_padding(self, padded_encoded_text):  # removes the padding that was added
        padded_info = padded_encoded_text[:8]
        extra_padding = int(padded_info, 2)

        padded_encoded_text = padded_encoded_text[8:]
        encoded_text = padded_encoded_text[:-1 * extra_padding]

        return encoded_text``` 

首先,您需要将霍夫曼代码的描述与数据一起发送。其次你需要在解压端使用它来解码霍夫曼码。

最直接的方法是遍历树,为遇到的每个节点发送一个 1 位,然后为每个叶子发送一个 0 位和符号位。在减压端,您可以阅读并重建树。然后你可以使用这棵树,用每一位数据向左或向右遍历,直到你到达一片叶子。发出那片叶子的符号,然后从树的根开始返回下一位。

在压缩库中通常这样做的方式是丢弃树,而是只保留和发送每个符号的代码长度,同时对其进行压缩。然后在压缩和解压缩端,根据长度构造相同的规范霍夫曼码。然后 table 查找可用于解码,以提高速度。