用 Python 分别实现计算给定文本文件中英语字母、英语单词、汉字、汉语词的熵
时间: 2023-05-24 20:06:44 浏览: 148
使用最大熵模型进行中文文本分类
4星 · 用户满意度95%
英语字母熵的实现:
```python
import math
def english_letter_entropy(file_path):
    """Return the Shannon entropy, in bits, of the English letters in a file.

    Only ASCII letters are counted, and they are folded to lower case, so
    'A' and 'a' are the same symbol. Every other character (digits,
    punctuation, whitespace, non-ASCII letters such as Chinese characters)
    is ignored.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        The entropy ``-sum(p * log2(p))`` over the observed letter
        frequencies as a float; ``0.0`` when the file has no ASCII letters.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    from collections import Counter

    letter_counts = Counter()
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            letter_counts.update(
                ch.lower() for ch in line if ch.isascii() and ch.isalpha()
            )

    total_count = sum(letter_counts.values())
    if total_count == 0:
        return 0.0
    return sum(
        -(count / total_count) * math.log2(count / total_count)
        for count in letter_counts.values()
    )
```
英语单词熵的实现:
```python
import math
import re
def english_word_entropy(file_path):
    """Return the Shannon entropy, in bits, of the English words in a file.

    A word is a maximal run of ASCII letters (``[a-zA-Z]+``), compared
    case-insensitively. Digits, punctuation and non-ASCII text act as
    separators, so "don't" contributes the two words "don" and "t".

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        The entropy ``-sum(p * log2(p))`` over the observed word
        frequencies as a float; ``0.0`` when the file has no words.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    from collections import Counter

    # Compile once instead of looking the pattern up for every line.
    find_words = re.compile(r'[a-zA-Z]+').findall

    word_counts = Counter()
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            word_counts.update(word.lower() for word in find_words(line))

    total_count = sum(word_counts.values())
    if total_count == 0:
        return 0.0
    return sum(
        -(count / total_count) * math.log2(count / total_count)
        for count in word_counts.values()
    )
```
汉字熵的实现:
```python
import math
def chinese_character_entropy(file_path):
    """Compute the Shannon entropy (base 2) of Chinese characters in a file.

    The file is read as UTF-8. A character is counted only if its code
    point lies in the range U+4E00..U+9FFF; everything else is skipped.

    Args:
        file_path: Path of the text file to analyse.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequencies of the counted
        characters, or ``0`` when no character in that range occurs.
    """
    # Tally occurrences of each character inside the accepted range.
    tally = {}
    with open(file_path, 'r', encoding='utf-8') as handle:
        for text_line in handle:
            for ch in text_line:
                if not ('\u4e00' <= ch <= '\u9fff'):
                    continue
                tally[ch] = tally.get(ch, 0) + 1

    # Accumulate the entropy from the relative frequencies.
    n_total = sum(tally.values())
    result = 0
    for occurrences in tally.values():
        prob = occurrences / n_total
        result -= prob * math.log2(prob)
    return result
```
汉语词熵的实现:
```python
import math
import jieba
def chinese_word_entropy(file_path):
    """Return the Shannon entropy, in bits, of the Chinese words in a file.

    Each line is segmented with ``jieba.cut``. A token is counted as a
    Chinese word only if it is non-empty and every character lies in
    U+4E00..U+9FFF. This keeps single-character words (e.g. "的", "是"),
    which a length-based filter would drop, and excludes punctuation,
    whitespace, digits and Latin-script tokens.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        The entropy ``-sum(p * log2(p))`` over the observed word
        frequencies as a float; ``0.0`` when no Chinese word is found.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    from collections import Counter

    def _is_chinese_word(token):
        # True only for non-empty tokens made up entirely of CJK characters.
        return bool(token) and all('\u4e00' <= ch <= '\u9fff' for ch in token)

    word_counts = Counter()
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            word_counts.update(
                word for word in jieba.cut(line) if _is_chinese_word(word)
            )

    total_count = sum(word_counts.values())
    if total_count == 0:
        return 0.0
    return sum(
        -(count / total_count) * math.log2(count / total_count)
        for count in word_counts.values()
    )
```
阅读全文