对一个给定的文本文件,对其进行哈夫曼编码,并计算压缩率。
时间: 2024-05-16 07:15:04 浏览: 162
哈夫曼编码是一种无损数据压缩算法,它根据不同字符出现的频率,分配不同长度的编码来压缩数据。压缩率取决于文本中不同字符出现的频率。如果文本中存在大量重复的字符,则压缩率会更高。
下面是对文本文件进行哈夫曼编码的步骤:
1. 统计文本中不同字符出现的频率,并按照频率从小到大排序。
2. 构建哈夫曼树。将频率最小的两个字符合并成一个节点,其权重为它们的频率之和。重复此过程,直到所有节点合并为一棵树。
3. 给哈夫曼树的左分支编码为0,右分支编码为1。从根节点开始,沿着树走到每个叶子节点,记录下路径上的编码,即为该字符的哈夫曼编码。
4. 将文本中每个字符替换为它的哈夫曼编码,并将多个编码拼接成一个二进制字符串。
5. 将二进制字符串转换为字节数组,并将其写入到输出文件中。
6. 计算压缩率。压缩率 = 原始文件大小 / 压缩后文件大小。
下面是 Python 实现:
```python
import heapq, os
class HuffmanCoding:
def __init__(self, file_path):
self.file_path = file_path
self.heap = []
self.codes = {}
self.reverse_mapping = {}
class HeapNode:
def __init__(self, char, freq):
self.char = char
self.freq = freq
self.left = None
self.right = None
def __lt__(self, other):
return self.freq < other.freq
def __eq__(self, other):
if(other == None):
return False
if(not isinstance(other, HeapNode)):
return False
return self.freq == other.freq
def make_frequency_dict(self, text):
frequency = {}
for char in text:
if char not in frequency:
frequency[char] = 0
frequency[char] += 1
return frequency
def make_heap(self, frequency):
for key in frequency:
node = self.HeapNode(key, frequency[key])
heapq.heappush(self.heap, node)
def merge_nodes(self):
while(len(self.heap)>1):
node1 = heapq.heappop(self.heap)
node2 = heapq.heappop(self.heap)
merged = self.HeapNode(None, node1.freq + node2.freq)
merged.left = node1
merged.right = node2
heapq.heappush(self.heap, merged)
def make_codes_helper(self, root, current_code):
if(root == None):
return
if(root.char != None):
self.codes[root.char] = current_code
self.reverse_mapping[current_code] = root.char
return
self.make_codes_helper(root.left, current_code + "0")
self.make_codes_helper(root.right, current_code + "1")
def make_codes(self):
root = heapq.heappop(self.heap)
current_code = ""
self.make_codes_helper(root, current_code)
def get_encoded_text(self, text):
encoded_text = ""
for char in text:
encoded_text += self.codes[char]
return encoded_text
def pad_encoded_text(self, encoded_text):
extra_padding = 8 - len(encoded_text) % 8
for i in range(extra_padding):
encoded_text += "0"
padded_info = "{0:08b}".format(extra_padding)
encoded_text = padded_info + encoded_text
return encoded_text
def get_byte_array(self, padded_encoded_text):
if(len(padded_encoded_text) % 8 != 0):
print("Encoded text not padded properly")
exit(0)
b = bytearray()
for i in range(0, len(padded_encoded_text), 8):
byte = padded_encoded_text[i:i+8]
b.append(int(byte, 2))
return b
def compress(self):
file_name, file_extension = os.path.splitext(self.file_path)
output_path = file_name + ".bin"
with open(self.file_path, 'r+') as file, open(output_path, 'wb') as output:
text = file.read()
text = text.rstrip()
frequency = self.make_frequency_dict(text)
self.make_heap(frequency)
self.merge_nodes()
self.make_codes()
encoded_text = self.get_encoded_text(text)
padded_encoded_text = self.pad_encoded_text(encoded_text)
b = self.get_byte_array(padded_encoded_text)
output.write(bytes(b))
print("Compressed file path: " + output_path)
return output_path
""" functions for decompression """
def remove_padding(self, padded_encoded_text):
padded_info = padded_encoded_text[:8]
extra_padding = int(padded_info, 2)
padded_encoded_text = padded_encoded_text[8:]
encoded_text = padded_encoded_text[:-1*extra_padding]
return encoded_text
def decode_text(self, encoded_text):
current_code = ""
decoded_text = ""
for bit in encoded_text:
current_code += bit
if(current_code in self.reverse_mapping):
character = self.reverse_mapping[current_code]
decoded_text += character
current_code = ""
return decoded_text
def decompress(self, input_path):
file_name, file_extension = os.path.splitext(self.file_path)
output_path = file_name + "_decompressed" + ".txt"
with open(input_path, 'rb') as file, open(output_path, 'w') as output:
bit_string = ""
byte = file.read(1)
while(byte != b""):
byte = ord(byte)
bits = bin(byte)[2:].rjust(8, '0')
bit_string += bits
byte = file.read(1)
encoded_text = self.remove_padding(bit_string)
decoded_text = self.decode_text(encoded_text)
output.write(decoded_text)
print("Decompressed file path: " + output_path)
return output_path
```
使用示例:
```python
h = HuffmanCoding("example.txt")
output_path = h.compress()
h.decompress(output_path)
```
其中 `example.txt` 是需要压缩的文本文件。该程序会在同一目录下生成一个名为 `example.bin` 的压缩文件,以及一个名为 `example_decompressed.txt` 的解压缩文件。压缩率可以通过比较原始文件大小和压缩文件大小来计算。
阅读全文