"""Huffman coding: code-table construction, encoding and decoding.

Explanation at
http://bhrigu.me/blog/2017/01/17/huffman-coding-python-implementation/
Adapted from https://github.com/bhrigu123/huffman-coding
"""

import heapq
import os
from functools import total_ordering


@total_ordering
class HeapNode:
    """Node of the Huffman tree; nodes compare by frequency only.

    Nodes built by ``HuffmanCoding.make_heap*`` carry the symbol in
    ``token``; nodes built by ``HuffmanCoding.merge_nodes`` have
    ``token`` set to ``None`` and reference their children through
    ``left`` and ``right``.
    """

    def __init__(self, token, freq):
        self.token = token
        self.freq = freq
        self.left = None
        self.right = None

    # Comparators: ``total_ordering`` derives the remaining ones from
    # ``__lt__`` and ``__eq__``.
    def __lt__(self, other):
        return self.freq < other.freq

    def __eq__(self, other):
        # ``None`` and any other foreign type are never equal to a node.
        if not isinstance(other, HeapNode):
            return False
        return self.freq == other.freq


class HuffmanCoding:
    """Build a Huffman code table and encode/decode token sequences.

    Usage order: ``make_heap`` (or ``make_heap_from_array``), then
    ``merge_nodes``, then ``make_codes``. After that ``codes`` maps
    token -> bit string and ``reverse_mapping`` maps bit string -> token.
    """

    def __init__(self):
        self.heap = []
        self.codes = {}
        self.reverse_mapping = {}

    # functions for compression:

    def make_heap(self, frequency):
        """Push one leaf per key of ``frequency`` (token -> frequency)."""
        for key in frequency:
            heapq.heappush(self.heap, HeapNode(key, frequency[key]))

    def make_heap_from_array(self, freqs):
        """Push one leaf per entry of ``freqs``; the index is the token."""
        for index, freq in enumerate(freqs):
            heapq.heappush(self.heap, HeapNode(index, freq))

    def merge_nodes(self):
        """Merge the two lowest-frequency nodes until one root remains."""
        while len(self.heap) > 1:
            node1 = heapq.heappop(self.heap)
            node2 = heapq.heappop(self.heap)

            merged = HeapNode(None, node1.freq + node2.freq)
            merged.left = node1
            merged.right = node2

            heapq.heappush(self.heap, merged)

    def make_codes_helper(self, root, current_code):
        """Record codes for every leaf below ``root``.

        ``current_code`` is the bit string accumulated on the way down:
        "0" is appended for a left branch and "1" for a right branch.
        """
        if root is None:
            return

        if root.token is not None:
            # A node carrying a token is a leaf: store both directions.
            self.codes[root.token] = current_code
            self.reverse_mapping[current_code] = root.token
            return

        self.make_codes_helper(root.left, current_code + "0")
        self.make_codes_helper(root.right, current_code + "1")

    def make_codes(self):
        """Pop the root from the heap, fill the code tables, return the root.

        Expects ``merge_nodes`` to have reduced the heap to a single
        node. Raises ``IndexError`` (from ``heapq.heappop``) when the
        heap is empty.
        """
        root = heapq.heappop(self.heap)
        if root.token is not None:
            # Single-symbol alphabet: the root is itself a leaf. An empty
            # code would encode every message to "" and make decoding
            # impossible, so give the lone symbol a one-bit code.
            self.make_codes_helper(root, "0")
        else:
            self.make_codes_helper(root, "")
        return root

    def get_encoded_tokens(self, token_list):
        """Return the concatenated bit string for ``token_list``.

        Raises ``KeyError`` for a token that has no code.
        """
        return "".join(self.codes[token] for token in token_list)

    def decode_text(self, encoded_text):
        """Decode a string of "0"/"1" characters back into text.

        Bits are accumulated until they match an entry of
        ``reverse_mapping``. Trailing bits that match no code are
        dropped. Tokens must be strings, because the result is their
        concatenation.
        """
        current_code = ""
        decoded_parts = []

        for bit in encoded_text:
            current_code += bit
            if current_code in self.reverse_mapping:
                decoded_parts.append(self.reverse_mapping[current_code])
                current_code = ""

        return "".join(decoded_parts)

    def decompress(self, input_path):
        """Decode the binary file at ``input_path`` into a text file.

        The output is written next to the source as
        ``<name>_decompressed.txt`` and its path is returned. ``<name>``
        is derived from ``self.path`` when that attribute has been set,
        otherwise from ``input_path``.

        NOTE(review): ``self.remove_padding`` is not defined in the part
        of the class visible here; confirm it is provided elsewhere,
        otherwise this method raises ``AttributeError``.
        """
        # ``__init__`` does not set ``path``; fall back to the input file
        # so the method does not fail on a missing attribute.
        base_path = getattr(self, "path", input_path)
        filename, _file_extension = os.path.splitext(base_path)
        output_path = filename + "_decompressed" + ".txt"

        with open(input_path, "rb") as file:
            raw_bytes = bytearray(file.read())

        # Expand every byte to its zero-padded 8-bit representation.
        bit_string = "".join(format(byte, "08b") for byte in raw_bytes)

        encoded_text = self.remove_padding(bit_string)
        decompressed_text = self.decode_text(encoded_text)

        # Open the output only after decoding succeeded, so a failure
        # does not leave an empty file behind.
        with open(output_path, "w") as output:
            output.write(decompressed_text)

        return output_path