用哈夫曼编码实现c语言的文件压缩及注释
时间: 2024-05-05 13:22:37 浏览: 10
哈夫曼编码是一种可变长度编码,用于将字符编码为比特串,使得出现频率高的字符使用较短的比特串表示,出现频率低的字符使用较长的比特串表示。这样可以有效地压缩数据,并且不会损失数据。
在C语言中,我们可以通过读入文件,统计字符出现频率,构建哈夫曼树,生成编码表,将文件内容编码并写入压缩文件的方式来实现文件压缩。
具体实现步骤如下:
1. 定义一个字符结构体,用于存储字符及其出现频率。
```
typedef struct {
char ch;
int freq;
} CharFreq;
```
2. 统计文件中每个字符出现的频率,并将其存储到字符结构体数组中。
```
CharFreq freq[256] = {0};
char ch;
while((ch = fgetc(fp)) != EOF) {
freq[ch].freq++;
}
```
3. 构建哈夫曼树。先将字符结构体数组中出现频率不为0的字符放入优先队列中,然后不断取出出现频率最小的两个字符,构建成一个新的节点,将其加入优先队列中,直到队列中只剩一个节点,此节点即为哈夫曼树的根节点。
```
typedef struct {
CharFreq cf;
struct TreeNode *left;
struct TreeNode *right;
} TreeNode;
TreeNode *buildHuffmanTree(CharFreq freq[]) {
// 将出现频率不为0的字符放入优先队列中
PriorityQueue pq = {0};
for(int i = 0; i < 256; i++) {
if(freq[i].freq != 0) {
TreeNode *node = (TreeNode *)malloc(sizeof(TreeNode));
node->cf = freq[i];
node->left = NULL;
node->right = NULL;
enqueue(&pq, node);
}
}
// 构建哈夫曼树
while(pq.size > 1) {
TreeNode *left = dequeue(&pq);
TreeNode *right = dequeue(&pq);
TreeNode *node = (TreeNode *)malloc(sizeof(TreeNode));
node->cf.freq = left->cf.freq + right->cf.freq;
node->left = left;
node->right = right;
enqueue(&pq, node);
}
return dequeue(&pq);
}
```
4. 生成编码表。通过递归遍历哈夫曼树,生成每个字符的比特串编码,并将其存储到编码表中。
```
void generateCodeTable(TreeNode *root, char code[], int depth, char **codeTable) {
if(!root) {
return;
}
if(root->left == NULL && root->right == NULL) {
code[depth] = '\0';
codeTable[root->cf.ch] = (char *)malloc(strlen(code) + 1);
strcpy(codeTable[root->cf.ch], code);
return;
}
code[depth] = '0';
generateCodeTable(root->left, code, depth + 1, codeTable);
code[depth] = '1';
generateCodeTable(root->right, code, depth + 1, codeTable);
}
char **getCodeTable(TreeNode *root) {
char **codeTable = (char **)malloc(sizeof(char *) * 256);
char code[256] = {0};
generateCodeTable(root, code, 0, codeTable);
return codeTable;
}
```
5. 将文件内容编码并写入压缩文件。读入文件中的每个字符,通过编码表查找其对应的比特串编码,并将其写入压缩文件中。
```
void encodeFile(FILE *fp, char **codeTable, FILE *outfp) {
char ch;
char buf[256] = {0};
int bufSize = 0;
while((ch = fgetc(fp)) != EOF) {
char *code = codeTable[ch];
strcat(buf, code);
bufSize += strlen(code);
while(bufSize >= 8) {
int byte = 0;
for(int i = 0; i < 8; i++) {
byte = (byte << 1) | (buf[i] - '0');
}
fputc(byte, outfp);
strcpy(buf, buf + 8);
bufSize -= 8;
}
}
if(bufSize > 0) {
int byte = 0;
for(int i = 0; i < bufSize; i++) {
byte = (byte << 1) | (buf[i] - '0');
}
byte = byte << (8 - bufSize);
fputc(byte, outfp);
}
}
```
6. 压缩文件中加入注释。在压缩文件的开头加入注释信息,用于说明该文件的压缩方式及解压方式。
```
void addComment(FILE *outfp) {
char *comment = "This file is compressed by Huffman encoding algorithm.\nTo decompress this file, use the following command:\n\nhuffman -d compressed_file decompressed_file\n";
int len = strlen(comment);
fputc('C', outfp);
fputc('O', outfp);
fputc('M', outfp);
fwrite(&len, sizeof(int), 1, outfp);
fwrite(comment, sizeof(char), len, outfp);
}
```
完整的压缩文件的代码如下:
```
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_SIZE 256
typedef struct {
char ch;
int freq;
} CharFreq;
typedef struct {
CharFreq cf;
struct TreeNode *left;
struct TreeNode *right;
} TreeNode;
typedef struct {
TreeNode *data[MAX_SIZE];
int size;
} PriorityQueue;
void swap(TreeNode **a, TreeNode **b) {
TreeNode *temp = *a;
*a = *b;
*b = temp;
}
void enqueue(PriorityQueue *pq, TreeNode *node) {
pq->data[++pq->size] = node;
int i = pq->size;
while(i > 1 && pq->data[i]->cf.freq < pq->data[i / 2]->cf.freq) {
swap(&pq->data[i], &pq->data[i / 2]);
i /= 2;
}
}
TreeNode *dequeue(PriorityQueue *pq) {
TreeNode *node = pq->data[1];
pq->data[1] = pq->data[pq->size--];
int i = 1;
while(i * 2 <= pq->size) {
int child = i * 2;
if(child + 1 <= pq->size && pq->data[child + 1]->cf.freq < pq->data[child]->cf.freq) {
child++;
}
if(pq->data[i]->cf.freq > pq->data[child]->cf.freq) {
swap(&pq->data[i], &pq->data[child]);
i = child;
} else {
break;
}
}
return node;
}
TreeNode *buildHuffmanTree(CharFreq freq[]) {
PriorityQueue pq = {0};
for(int i = 0; i < 256; i++) {
if(freq[i].freq != 0) {
TreeNode *node = (TreeNode *)malloc(sizeof(TreeNode));
node->cf = freq[i];
node->left = NULL;
node->right = NULL;
enqueue(&pq, node);
}
}
while(pq.size > 1) {
TreeNode *left = dequeue(&pq);
TreeNode *right = dequeue(&pq);
TreeNode *node = (TreeNode *)malloc(sizeof(TreeNode));
node->cf.freq = left->cf.freq + right->cf.freq;
node->left = left;
node->right = right;
enqueue(&pq, node);
}
return dequeue(&pq);
}
void generateCodeTable(TreeNode *root, char code[], int depth, char **codeTable) {
if(!root) {
return;
}
if(root->left == NULL && root->right == NULL) {
code[depth] = '\0';
codeTable[root->cf.ch] = (char *)malloc(strlen(code) + 1);
strcpy(codeTable[root->cf.ch], code);
return;
}
code[depth] = '0';
generateCodeTable(root->left, code, depth + 1, codeTable);
code[depth] = '1';
generateCodeTable(root->right, code, depth + 1, codeTable);
}
char **getCodeTable(TreeNode *root) {
char **codeTable = (char **)malloc(sizeof(char *) * 256);
char code[256] = {0};
generateCodeTable(root, code, 0, codeTable);
return codeTable;
}
void encodeFile(FILE *fp, char **codeTable, FILE *outfp) {
char ch;
char buf[256] = {0};
int bufSize = 0;
while((ch = fgetc(fp)) != EOF) {
char *code = codeTable[ch];
strcat(buf, code);
bufSize += strlen(code);
while(bufSize >= 8) {
int byte = 0;
for(int i = 0; i < 8; i++) {
byte = (byte << 1) | (buf[i] - '0');
}
fputc(byte, outfp);
strcpy(buf, buf + 8);
bufSize -= 8;
}
}
if(bufSize > 0) {
int byte = 0;
for(int i = 0; i < bufSize; i++) {
byte = (byte << 1) | (buf[i] - '0');
}
byte = byte << (8 - bufSize);
fputc(byte, outfp);
}
}
void addComment(FILE *outfp) {
char *comment = "This file is compressed by Huffman encoding algorithm.\nTo decompress this file, use the following command:\n\nhuffman -d compressed_file decompressed_file\n";
int len = strlen(comment);
fputc('C', outfp);
fputc('O', outfp);
fputc('M', outfp);
fwrite(&len, sizeof(int), 1, outfp);
fwrite(comment, sizeof(char), len, outfp);
}
void compress(char *filename, char *outFilename) {
FILE *fp = fopen(filename, "rb");
if(!fp) {
printf("Failed to open file %s.\n", filename);
return;
}
CharFreq freq[256] = {0};
char ch;
while((ch = fgetc(fp)) != EOF) {
freq[ch].freq++;
}
TreeNode *root = buildHuffmanTree(freq);
char **codeTable = getCodeTable(root);
FILE *outfp = fopen(outFilename, "wb");
addComment(outfp);
fwrite(freq, sizeof(CharFreq), 256, outfp);
encodeFile(fp, codeTable, outfp);
fclose(fp);
fclose(outfp);
}
int main(int argc, char *argv[]) {
if(argc != 3) {
printf("Usage: %s filename outFilename\n", argv[0]);
return 0;
}
compress(argv[1], argv[2]);
return 0;
}
```