哈夫曼树实现文件压缩与解压缩
时间: 2023-05-29 13:03:16 浏览: 141
哈夫曼树是一种用于数据压缩的有效算法,它基于构建一棵二叉树,使得每个字符都被赋予一个唯一的编码。在这个过程中,出现频率高的字符会被分配较短的编码,而出现频率低的字符会被分配更长的编码。这样,整个数据可以被压缩为一串较短的编码,从而减少存储和传输的开销。
实现文件压缩与解压缩的基本思路如下:
1. 统计文件中每个字符的出现频率,并以此构建哈夫曼树。
2. 根据哈夫曼树给每个字符分配一个唯一的编码,并将编码表保存到压缩文件中。
3. 遍历原文件,将每个字符替换为其对应的编码,并将编码序列保存到压缩文件中。
4. 解压缩时,读取压缩文件的编码表,构建哈夫曼树,并按照编码表将编码序列转换为原始字符序列。
下面是一个简单的Python实现:
```python
import heapq
import os
# 定义节点类
class Node:
def __init__(self, freq, char=None, left=None, right=None):
self.freq = freq
self.char = char
self.left = left
self.right = right
def __lt__(self, other):
return self.freq < other.freq
# 统计字符频率
def get_freq(filename):
freq = {}
with open(filename, 'r') as f:
for line in f:
for char in line:
if char in freq:
freq[char] += 1
else:
freq[char] = 1
return freq
# 构建哈夫曼树
def build_tree(freq):
heap = [Node(freq[char], char) for char in freq]
heapq.heapify(heap)
while len(heap) > 1:
left = heapq.heappop(heap)
right = heapq.heappop(heap)
heapq.heappush(heap, Node(left.freq+right.freq, left=left, right=right))
return heap[0]
# 生成编码表
def build_code_table(root):
code_table = {}
def dfs(node, code):
if node.char is not None:
code_table[node.char] = code
else:
dfs(node.left, code+'0')
dfs(node.right, code+'1')
dfs(root, '')
return code_table
# 压缩文件
def compress(filename):
freq = get_freq(filename)
root = build_tree(freq)
code_table = build_code_table(root)
with open(filename, 'r') as f, open(filename+'.z', 'wb') as out:
# 写入编码表
out.write(str(len(code_table)).encode())
out.write(b'\n')
for char, code in code_table.items():
out.write(char.encode())
out.write(b' ')
out.write(code.encode())
out.write(b'\n')
# 写入压缩数据
bit_buffer = ''
for line in f:
for char in line:
bit_buffer += code_table[char]
while len(bit_buffer) >= 8:
byte = int(bit_buffer[:8], 2)
out.write(bytes([byte]))
bit_buffer = bit_buffer[8:]
if bit_buffer:
byte = int(bit_buffer.ljust(8, '0'), 2)
out.write(bytes([byte]))
# 解压文件
def decompress(filename):
with open(filename, 'rb') as f, open(filename[:-2], 'w') as out:
# 读取编码表
num_chars = int(f.readline().decode())
code_table = {}
for i in range(num_chars):
char, code = f.readline().decode().split()
code_table[code] = char
# 解压数据
bit_buffer = ''
while True:
byte = f.read(1)
if not byte:
break
bits = bin(ord(byte))[2:].rjust(8, '0')
bit_buffer += bits
while True:
code = ''
if bit_buffer in code_table:
code = code_table[bit_buffer]
if not code:
break
out.write(code)
bit_buffer = bit_buffer[len(code):]
# 测试压缩和解压缩
filename = 'test.txt'
compress(filename)
decompress(filename+'.z')
assert open(filename).read() == open(filename[:-2]).read()
os.remove(filename+'.z')
```
该实现中,压缩文件的后缀为“.z”,解压后的文件名与原文件相同。在压缩文件中,首先写入编码表,然后将编码序列按照8位一组转换为字节序列,并保存到压缩文件中。在解压缩时,先读取编码表,然后将压缩文件中的字节序列转换为二进制位序列,并按照编码表将二进制位序列转换为原始字符序列。
阅读全文