用c++写一个哈夫曼树,可以读取输入3个TXT文件,并根据文件构造哈夫曼编码,把最优写入输出文件,代码要有注释。
时间: 2024-06-09 15:11:40 浏览: 120
首先,哈夫曼编码是一种无损压缩算法,可以通过构造哈夫曼树来实现。哈夫曼树是一种二叉树,它的左子树表示0,右子树表示1,每个叶子节点都对应一个字符,它的路径就是该字符的哈夫曼编码。
下面是 C 语言实现哈夫曼树的代码,可以读取输入3个 TXT 文件,并根据文件构造哈夫曼编码,把最优写入输出文件:
```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_SIZE 256
#define MAX_BIT 8
typedef struct node {
char data;
int freq;
struct node *left;
struct node *right;
} Node;
typedef struct {
int size;
Node *data[MAX_SIZE];
} Heap;
typedef struct {
char code[MAX_SIZE][MAX_BIT];
int size;
} CodeTable;
void init_heap(Heap *heap)
{
heap->size = 0;
}
void push_heap(Heap *heap, Node *node)
{
int i = heap->size++;
int j = (i - 1) / 2;
while (i > 0 && heap->data[j]->freq > node->freq) {
heap->data[i] = heap->data[j];
i = j;
j = (i - 1) / 2;
}
heap->data[i] = node;
}
Node *pop_heap(Heap *heap)
{
Node *node = heap->data[0];
Node *last = heap->data[--heap->size];
int i = 0;
int j = 1;
while (j < heap->size) {
if (j + 1 < heap->size && heap->data[j + 1]->freq < heap->data[j]->freq) {
j++;
}
if (last->freq <= heap->data[j]->freq) {
break;
}
heap->data[i] = heap->data[j];
i = j;
j = i * 2 + 1;
}
heap->data[i] = last;
return node;
}
Node *build_tree(Heap *heap)
{
while (heap->size > 1) {
Node *left = pop_heap(heap);
Node *right = pop_heap(heap);
Node *node = (Node *)malloc(sizeof(Node));
node->data = 0;
node->freq = left->freq + right->freq;
node->left = left;
node->right = right;
push_heap(heap, node);
}
return heap->data[0];
}
void encode(Node *root, CodeTable *table, char code[], int depth)
{
if (root->left == NULL && root->right == NULL) {
memcpy(table->code[root->data], code, depth);
table->size++;
return;
}
code[depth] = '0';
encode(root->left, table, code, depth + 1);
code[depth] = '1';
encode(root->right, table, code, depth + 1);
}
void print_tree(Node *root, int depth)
{
if (root == NULL) {
return;
}
for (int i = 0; i < depth; i++) {
printf(" ");
}
if (root->left == NULL && root->right == NULL) {
printf("'%c' (%d)\n", root->data, root->freq);
} else {
printf("(%d)\n", root->freq);
print_tree(root->left, depth + 1);
print_tree(root->right, depth + 1);
}
}
void print_table(CodeTable *table)
{
for (int i = 0; i < table->size; i++) {
printf("'%c': %s\n", i, table->code[i]);
}
}
void write_tree(Node *root, FILE *out)
{
if (root == NULL) {
return;
}
if (root->left == NULL && root->right == NULL) {
fputc('L', out);
fputc(root->data, out);
} else {
fputc('N', out);
write_tree(root->left, out);
write_tree(root->right, out);
}
}
void write_table(CodeTable *table, FILE *out)
{
fputc(table->size, out);
for (int i = 0; i < table->size; i++) {
fputc(i, out);
fputc(strlen(table->code[i]), out);
fwrite(table->code[i], strlen(table->code[i]), 1, out);
}
}
void read_tree(Node **root, FILE *in)
{
int type = fgetc(in);
if (type == 'L') {
char data = fgetc(in);
Node *node = (Node *)malloc(sizeof(Node));
node->data = data;
node->freq = 0;
node->left = NULL;
node->right = NULL;
*root = node;
} else if (type == 'N') {
Node *left = NULL;
Node *right = NULL;
read_tree(&left, in);
read_tree(&right, in);
Node *node = (Node *)malloc(sizeof(Node));
node->data = 0;
node->freq = 0;
node->left = left;
node->right = right;
*root = node;
}
}
void read_table(CodeTable *table, FILE *in)
{
table->size = fgetc(in);
for (int i = 0; i < table->size; i++) {
int index = fgetc(in);
int length = fgetc(in);
fread(table->code[index], length, 1, in);
table->code[index][length] = '\0';
}
}
void encode_file(char *filename, CodeTable *table, char *outname)
{
FILE *in = fopen(filename, "rb");
FILE *out = fopen(outname, "wb");
char bit = 0;
int len = 0;
int ch = 0;
while ((ch = fgetc(in)) != EOF) {
char *code = table->code[ch];
for (int i = 0; i < strlen(code); i++) {
bit |= (code[i] - '0') << (7 - len);
len++;
if (len == 8) {
fputc(bit, out);
bit = 0;
len = 0;
}
}
}
if (len > 0) {
fputc(bit, out);
}
fclose(in);
fclose(out);
}
void decode_file(Node *root, char *filename, char *outname)
{
FILE *in = fopen(filename, "rb");
FILE *out = fopen(outname, "wb");
Node *node = root;
int bit = 0;
int len = 0;
int ch = 0;
while ((ch = fgetc(in)) != EOF) {
for (int i = 0; i < 8; i++) {
if (ch & (1 << (7 - i))) {
node = node->right;
} else {
node = node->left;
}
if (node->left == NULL && node->right == NULL) {
fputc(node->data, out);
node = root;
}
}
}
fclose(in);
fclose(out);
}
int main()
{
Heap heap;
CodeTable table;
Node *root;
init_heap(&heap);
// 读取输入文件
FILE *in1 = fopen("input1.txt", "rb");
FILE *in2 = fopen("input2.txt", "rb");
FILE *in3 = fopen("input3.txt", "rb");
int freq[MAX_SIZE] = {0};
// 统计字符频率
int ch;
while ((ch = fgetc(in1)) != EOF) {
freq[ch]++;
}
while ((ch = fgetc(in2)) != EOF) {
freq[ch]++;
}
while ((ch = fgetc(in3)) != EOF) {
freq[ch]++;
}
fclose(in1);
fclose(in2);
fclose(in3);
// 构造哈夫曼树
for (int i = 0; i < MAX_SIZE; i++) {
if (freq[i] > 0) {
Node *node = (Node *)malloc(sizeof(Node));
node->data = i;
node->freq = freq[i];
node->left = NULL;
node->right = NULL;
push_heap(&heap, node);
}
}
root = build_tree(&heap);
// 构造哈夫曼编码表
encode(root, &table, "", 0);
// 输出哈夫曼树和编码表
printf("Huffman tree:\n");
print_tree(root, 0);
printf("Code table:\n");
print_table(&table);
// 写入输出文件
FILE *out1 = fopen("output1.bin", "wb");
FILE *out2 = fopen("output2.bin", "wb");
FILE *out3 = fopen("output3.bin", "wb");
write_tree(root, out1);
write_table(&table, out1);
fclose(out1);
// 对输入文件进行编码
encode_file("input1.txt", &table, "output1.bin");
encode_file("input2.txt", &table, "output2.bin");
encode_file("input3.txt", &table, "output3.bin");
// 读取输出文件
FILE *in = fopen("output1.bin", "rb");
read_tree(&root, in);
read_table(&table, in);
fclose(in);
// 解码输出文件
decode_file(root, "output1.bin", "decode1.txt");
decode_file(root, "output2.bin", "decode2.txt");
decode_file(root, "output3.bin", "decode3.txt");
return 0;
}
```
上述代码中,我们首先定义了一个 `Node` 结构体来表示哈夫曼树的节点,包括字符数据、出现频率以及左右子树指针。然后定义了一个 `Heap` 结构体来表示最小堆,用来构造哈夫曼树。我们使用最小堆来维护节点的频率,每次取出频率最小的两个节点合并成一个新节点,直到只剩下一个节点为止,这个节点就是哈夫曼树的根节点。
我们定义了一个 `CodeTable` 结构体来表示字符的编码表,其中包含了每个字符的编码以及编码的长度。我们可以通过递归遍历哈夫曼树来构造编码表。
在主函数中,我们首先读取输入文件,然后统计字符频率,使用最小堆构造哈夫曼树,然后构造编码表,输出哈夫曼树和编码表。接着,我们写入输出文件,对输入文件进行编码,然后读取输出文件,构造哈夫曼树和编码表,对输出文件进行解码,最后写入解码后的文件。
值得注意的是,我们在写入输出文件时,将哈夫曼树和编码表分别写入,以便解码时使用。在哈夫曼树的表示中,我们使用 L 表示叶子节点,后面跟一个字符,表示该叶子节点对应的字符;使用 N 表示非叶子节点,后面跟左子树和右子树的表示。在编码表的表示中,我们先写入编码表的大小,然后对于每个字符,写入它的 ASCII 码、编码长度以及编码本身。
最后,我们对输入文件进行编码和解码时,都是一次读入一个字符,然后根据编码表查找对应的编码,将编码写入输出文件或者根据编码解码输出文件。由于每个字符的编码长度不同,我们需要使用一个比特位变量来保存编码,当比特位变量满了8个比特时,就输出一个字节,然后清空比特位变量。
阅读全文