python中的jieba实现对一个文件夹中的所有文档进行分词,根据停用词表去除停用词,基于文档中的非停用词及其TF-IDF值,生成该文档的词向量,具体代码...
时间: 2024-02-22 16:56:24 浏览: 22
好的,下面是代码实现:
```python
import os
import jieba
import math
# Load the stop-word list: one stop word per line, whitespace stripped.
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = {line.strip() for line in f}
# Tokenize one document and drop stop words.
def cut_words(file_path):
    """Read *file_path* (UTF-8), segment it with jieba, and return the
    list of tokens that are not in the global ``stopwords`` set."""
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    return [token for token in jieba.cut(text) if token not in stopwords]
# Compute the TF-IDF weight of every distinct word in one document.
def calculate_tfidf(words, word_dict, idf_dict, total_files):
    """Return a dict mapping each distinct word in *words* to its TF-IDF.

    TF is the word's relative frequency within *words*; IDF is taken
    directly from *idf_dict* (which already holds precomputed
    ``log(N / df)`` values — the original code applied ``math.log`` a
    second time on top of that, producing meaningless weights and
    risking division by zero). Words missing from *idf_dict* get an
    IDF of 0.0 instead of raising ``KeyError``.

    ``word_dict`` and ``total_files`` are kept for signature
    compatibility with existing callers but are not needed here.

    Returns an empty dict for an empty document (the original raised
    ``ZeroDivisionError``).
    """
    word_count = len(words)
    if word_count == 0:
        return {}
    # Term frequency: occurrences of each word in this document.
    tf_counts = {}
    for word in words:
        tf_counts[word] = tf_counts.get(word, 0) + 1
    return {
        word: (count / word_count) * idf_dict.get(word, 0.0)
        for word, count in tf_counts.items()
    }
# Project a sparse word->TF-IDF mapping onto a fixed-length dense vector.
def generate_vector(tfidf_dict, word_dict):
    """Return a list of ``len(word_dict)`` weights.

    *word_dict* maps each vocabulary word to its slot index; every word
    of *tfidf_dict* that appears in the vocabulary is written into its
    slot, all other slots stay 0.
    """
    vec = [0] * len(word_dict)
    for term, weight in tfidf_dict.items():
        idx = word_dict.get(term)
        if idx is not None:
            vec[idx] = weight
    return vec
# Entry point: tokenize every document under ``data``, compute per-word
# IDF from document frequencies, and print one TF-IDF vector per document.
def main():
    """Build and print TF-IDF vectors for all files in ``data/``.

    Fixes over the original:
    * IDF is computed from the *document frequency* (number of documents
      containing the word), not the total occurrence count — the latter
      can exceed the number of documents and yields negative idf values.
    * ``generate_vector`` receives a proper word -> slot-index mapping;
      the original passed the count dict, so counts were used as list
      indices (wrong slots or ``IndexError``).
    * Each file is tokenized only once (the original ran jieba twice
      per file).
    """
    file_list = os.listdir('data')
    total_files = len(file_list)
    if total_files == 0:
        return

    # Tokenize each document once and cache the token lists.
    doc_words = [cut_words(os.path.join('data', name)) for name in file_list]

    # Document frequency: in how many documents does each word appear?
    df = {}
    for words in doc_words:
        for word in set(words):
            df[word] = df.get(word, 0) + 1

    # idf = log(N / df); df >= 1 for every word seen, so this is safe.
    idf_dict = {word: math.log(total_files / count)
                for word, count in df.items()}

    # Deterministic word -> vector-slot mapping (sorted vocabulary).
    word_index = {word: i for i, word in enumerate(sorted(df))}

    # One dense TF-IDF vector per document, printed in input order.
    for words in doc_words:
        tfidf_dict = calculate_tfidf(words, word_index, idf_dict, total_files)
        print(generate_vector(tfidf_dict, word_index))


if __name__ == '__main__':
    main()
```
这段代码实现了对 `data` 文件夹中的所有文档进行分词,去除停用词,然后基于文档中的非停用词及其 TF-IDF 值生成词向量,并输出词向量。需要注意的是,这段代码中的停用词表文件为 `stopwords.txt`,需要提前准备好。