时间: 2023-07-08 08:44:50 浏览: 139
5星 · 资源好评率100%
1. 文本词频统计
import collections
import re


def count_words(text):
    """Count how often each word occurs in *text*.

    Punctuation is removed first, then the text is lower-cased and split
    on whitespace.

    Args:
        text: The raw text to analyse.

    Returns:
        A ``collections.Counter`` mapping each word to its number of
        occurrences.
    """
    # Drop every character that is neither a word character nor whitespace.
    cleaned = re.sub(r'[^\w\s]', '', text)
    # split() without arguments splits on any run of whitespace, so
    # newlines need no separate replacement step.
    return collections.Counter(cleaned.lower().split())


def main(path='sample.txt', top_n=10):
    """Print the *top_n* most frequent words of the file at *path*.

    Args:
        path: Text file to read.
        top_n: How many of the most common words to print.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # An explicit encoding keeps the result independent of the platform's
    # default locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()
    for word, count in count_words(text).most_common(top_n):
        print(word, count)


if __name__ == '__main__':
    main()
输出示例(假设 sample.txt 的内容为 “The quick brown fox jumps over the lazy dog.”):
the 2
quick 1
brown 1
fox 1
jumps 1
over 1
lazy 1
dog 1
2. Hamlet英文词频统计
import collections
import re


def count_words(text):
    """Return the frequency of every word in *text*.

    The text is stripped of punctuation, lower-cased and split on
    whitespace before counting.

    Args:
        text: The raw text to analyse.

    Returns:
        A ``collections.Counter`` keyed by word.
    """
    # Keep only word characters and whitespace.
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    # Newlines count as whitespace for split(), so they are handled here
    # without a dedicated substitution.
    words = without_punctuation.lower().split()
    return collections.Counter(words)


def main(path='hamlet.txt', top_n=20):
    """Print the *top_n* most frequent words of the file at *path*.

    Args:
        path: Text file to read.
        top_n: How many of the most common words to print.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale default.
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()
    for word, count in count_words(text).most_common(top_n):
        print(word, count)


if __name__ == '__main__':
    main()
输出示例(具体数值取决于所用的 hamlet.txt 文本版本):
the 1085
and 935
to 707
of 670
i 562
a 542
you 527
my 502
in 451
that 410
is 370
it 362
not 361
with 329
this 319
for 316
be 312
his 309
but 307
your 303
3. 《三国演义》人物出场统计
import collections
import re

# Names whose mentions are tallied.
characters = ['曹操', '刘备', '孙权', '诸葛亮', '关羽', '张飞', '赵云', '周瑜', '吕布', '貂蝉']


def count_mentions(text, names):
    """Count how many times each of *names* occurs in *text*.

    Punctuation is removed and newlines become spaces before matching.
    Matches are non-overlapping and found left to right.

    Args:
        text: The raw text to search.
        names: Iterable of literal names to look for.

    Returns:
        A ``collections.Counter`` mapping each name that was found to its
        number of occurrences. It is empty when *names* is empty.
    """
    names = list(names)
    if not names:
        # An empty alternation would match the empty string at every
        # position in the text.
        return collections.Counter()
    cleaned = re.sub(r'[^\w\s]', '', text)
    cleaned = re.sub(r'\n', ' ', cleaned)
    # Alternation takes the first branch that matches, so longer names go
    # first; otherwise a name that is a prefix of another would shadow it.
    ordered = sorted(names, key=len, reverse=True)
    # Escape each name so it is matched literally, never as regex syntax.
    pattern = "|".join(re.escape(name) for name in ordered)
    return collections.Counter(re.findall(pattern, cleaned))


def main(path='sanguo.txt', top_n=10):
    """Print the *top_n* most mentioned characters of the file at *path*.

    Args:
        path: UTF-8 text file to read.
        top_n: How many of the most mentioned characters to print.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()
    for character, count in count_mentions(text, characters).most_common(top_n):
        print(character, count)


if __name__ == '__main__':
    main()
输出示例(具体数值取决于所用的 sanguo.txt 文本版本):
曹操 2289
刘备 2151
孙权 1259
关羽 1053
张飞 773
诸葛亮 747
周瑜 666
赵云 516
吕布 506
貂蝉 356