用C语言实现一个完整的程序,该程序能够对一个英文文本文件,统计文件中所有不同单词的个数,以及词频最大的前10%的单词。并加上备注
时间: 2024-05-15 19:14:21 浏览: 184
以下是基于C语言的实现代码:
```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#define MAX_WORD_LEN 100
#define MAX_LINE_LEN 1000
// One entry of the word table: the text of a word plus its occurrence count.
typedef struct Word {
    char* value;      // NUL-terminated word text, owned by this entry
    int frequency;    // how many times the word has been counted
} Word;
int cmp(const void* a, const void* b) {
return (*(Word**)b)->frequency - (*(Word**)a)->frequency;
}
// Convert a NUL-terminated string to lower case in place.
//
// str: writable string to convert; characters without a lower-case form are
//      left unchanged.
void str_to_lower(char* str) {
    for (; *str; str++) {
        // tolower() requires a value representable as unsigned char (or EOF);
        // passing a negative plain char is undefined behavior.
        *str = (char)tolower((unsigned char)*str);
    }
}
// Report whether c is an English letter.
// Returns 1 for 'A'..'Z' or 'a'..'z', and 0 for every other character.
int is_letter(char c) {
    if (c >= 'A' && c <= 'Z') {
        return 1;
    }
    if (c >= 'a' && c <= 'z') {
        return 1;
    }
    return 0;
}
// Extract every word occurrence from a piece of text and append it to the
// word table.
//
// A word is a maximal run of characters accepted by is_letter(). Each
// occurrence is appended as a new lower-cased entry with frequency 1; this
// function does not merge repeated words.
//
// line:          NUL-terminated text to scan (not modified).
// words:         address of the caller's dynamic array of Word pointers; the
//                array may be reallocated, so the caller's pointer is updated.
// word_count:    in/out, number of entries currently stored in the array.
// word_capacity: in/out, number of slots currently allocated in the array.
//
// Returns the updated *word_count, or -1 if memory could not be allocated.
// On failure, the entries appended so far remain valid and owned by the table.
int extract_words(char* line, Word*** words, int* word_count, int* word_capacity) {
    enum { INITIAL_CAPACITY = 16 };
    const char* p = line;

    while (*p) {
        if (!is_letter(*p)) {
            p++;
            continue;
        }

        // Measure the run of letters that starts here.
        const char* word_start = p;
        while (*p && is_letter(*p)) {
            p++;
        }
        size_t word_len = (size_t)(p - word_start);

        // Make room before allocating the entry. The realloc result goes into
        // a temporary so the existing table is not lost if the call fails.
        if (*word_count >= *word_capacity) {
            int grown = (*word_capacity > 0) ? *word_capacity * 2 : INITIAL_CAPACITY;
            Word** resized = realloc(*words, (size_t)grown * sizeof **words);
            if (!resized) {
                return -1;
            }
            *words = resized;
            *word_capacity = grown;
        }

        Word* entry = malloc(sizeof *entry);
        if (!entry) {
            return -1;
        }
        entry->value = malloc(word_len + 1);
        if (!entry->value) {
            free(entry);
            return -1;
        }
        memcpy(entry->value, word_start, word_len);
        entry->value[word_len] = '\0';
        str_to_lower(entry->value);
        entry->frequency = 1;

        (*words)[*word_count] = entry;
        (*word_count)++;
    }
    return *word_count;
}
// Release a word table: every entry, every entry's text, and the array itself.
//
// words:      address of the caller's array of Word pointers. After the call
//             the caller's pointer is set to NULL. A NULL address or a NULL
//             array is accepted and does nothing.
// word_count: number of leading slots in the array to release; NULL slots are
//             skipped.
void free_words(Word*** words, int word_count) {
    if (!words || !*words) {
        return;
    }
    for (int i = 0; i < word_count; i++) {
        Word* entry = (*words)[i];
        if (entry) {
            free(entry->value);
            free(entry);
        }
    }
    free(*words);
    *words = NULL;  // guard the caller against use-after-free / double free
}
// qsort() callback: order two table slots alphabetically by word text, which
// places repeated words next to each other.
static int compare_by_text(const void* a, const void* b) {
    const Word* lhs = *(const Word* const*)a;
    const Word* rhs = *(const Word* const*)b;
    return strcmp(lhs->value, rhs->value);
}

// Collapse repeated words into a single entry each, summing their frequencies.
// The surviving entries are packed at the front of the array and the merged
// duplicates are freed. Returns the number of distinct words.
static int merge_duplicates(Word** words, int word_count) {
    if (word_count <= 0) {
        return 0;
    }
    qsort(words, (size_t)word_count, sizeof *words, compare_by_text);

    int last = 0;  // index of the most recently kept entry
    for (int i = 1; i < word_count; i++) {
        if (strcmp(words[last]->value, words[i]->value) == 0) {
            words[last]->frequency += words[i]->frequency;
            free(words[i]->value);
            free(words[i]);
        } else {
            last++;
            words[last] = words[i];
        }
    }
    return last + 1;
}

// Program entry point.
//
// Usage: <program> <filename>
// Reads the given text file, counts how often each distinct word occurs
// (case-insensitively), then prints the number of distinct words followed by
// the most frequent 10% of them with their counts.
//
// Returns 0 on success, 1 on a usage error, I/O error or allocation failure
// (diagnostics go to stderr).
int main(int argc, char* argv[]) {
    if (argc != 2) {
        fprintf(stderr, "Usage: %s <filename>\n", argv[0]);
        return 1;
    }

    // Open the input file.
    FILE* fp = fopen(argv[1], "r");
    if (!fp) {
        fprintf(stderr, "Failed to open file: %s\n", argv[1]);
        return 1;
    }

    // Initialize the word table.
    int word_capacity = 100;
    int word_count = 0;
    Word** words = malloc((size_t)word_capacity * sizeof *words);
    if (!words) {
        fprintf(stderr, "Out of memory\n");
        fclose(fp);
        return 1;
    }

    // Read the file chunk by chunk. fgets() returns at most
    // MAX_LINE_LEN - 1 characters per call, so a word that straddles a chunk
    // boundary inside a very long line is counted as two words.
    char line[MAX_LINE_LEN];
    int ok = 1;
    while (ok && fgets(line, sizeof line, fp)) {
        // A negative result is treated as an allocation failure.
        if (extract_words(line, &words, &word_count, &word_capacity) < 0) {
            fprintf(stderr, "Out of memory\n");
            ok = 0;
        }
    }
    if (ok && ferror(fp)) {
        fprintf(stderr, "Failed to read file: %s\n", argv[1]);
        ok = 0;
    }
    fclose(fp);
    if (!ok) {
        free_words(&words, word_count);
        return 1;
    }

    // Merge repeated words so that each table entry is one distinct word
    // carrying its total number of occurrences.
    word_count = merge_duplicates(words, word_count);

    // Sort the distinct words by frequency, highest first.
    qsort(words, (size_t)word_count, sizeof *words, cmp);

    // Size of the top 10%; show at least one word when the file has any, so
    // inputs with fewer than ten distinct words still produce a result.
    int top_n = word_count / 10;
    if (top_n == 0 && word_count > 0) {
        top_n = 1;
    }

    // Total occurrences covered by the top words.
    int top_word_count = 0;
    for (int i = 0; i < top_n; i++) {
        top_word_count += words[i]->frequency;
    }

    // Report: the first line is the number of distinct words.
    printf("Total number of words: %d\n", word_count);
    printf("Top 10%% words by frequency (total %d words):\n", top_word_count);
    for (int i = 0; i < top_n; i++) {
        printf("%s: %d\n", words[i]->value, words[i]->frequency);
    }

    // Release the word table.
    free_words(&words, word_count);
    return 0;
}
```
这个程序可以通过命令行参数指定要统计的英文文本文件,例如:
```
./word_count text.txt
```
运行该程序后,会输出如下格式的信息:
```
Total number of words: 12345
Top 10% words by frequency (total 6789 words):
word1: 1234
word2: 567
...
```
第一行表示文本文件中不同单词的个数;第二行给出词频最高的前10%的单词的出现次数之和,其后各行依次列出这些单词以及它们的出现次数。
阅读全文