python中的jieba实现对一个文件夹中的所有文档进行分词,根据停用词表去除停用词,基于文档中的非停用词及其TF-IDF值,生成该文档的词向量,具体代码...
时间: 2024-02-18 14:03:36 浏览: 80
好的,以下是基于Python中jieba库、sklearn库和numpy库的代码实现:
```python
import os
import jieba
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# Load the stopword list once at module import time.
# A set gives O(1) membership tests (a list is O(n) per lookup, and the
# original left the file handle unclosed).
stopwords_path = 'stopwords.txt'
with open(stopwords_path, 'r', encoding='utf-8') as _stopword_file:
    stopwords = {line.strip() for line in _stopword_file}
def cut_words(file_path):
    """Segment the text of *file_path* with jieba and drop stopwords.

    Reads the whole file as UTF-8, tokenizes it, removes stopwords and
    whitespace-only tokens, and returns the surviving tokens joined by
    single spaces (the input format TfidfVectorizer expects).
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    tokens = []
    for word in jieba.cut(content):
        word = word.strip()
        # Also skip empty tokens: the original kept '' whenever '' was not
        # in the stopword list, which produced doubled spaces in the output.
        if word and word not in stopwords:
            tokens.append(word)
    return ' '.join(tokens)
def load_files(file_dir):
    """Recursively collect the paths of every file under *file_dir*.

    Returns a list of full paths in os.walk order; an empty or
    nonexistent directory yields an empty list.
    """
    collected = []
    for root, _dirs, names in os.walk(file_dir):
        collected.extend(os.path.join(root, name) for name in names)
    return collected
def get_tfidf(file_list):
    """Build the TF-IDF weight matrix for the given documents.

    Each file is segmented with cut_words(); the corpus is then fitted
    with TfidfVectorizer.

    Returns a tuple ``(weights, feature_names)`` where *weights* is a
    dense ndarray of shape (n_documents, n_features) and
    *feature_names* is the list of vocabulary terms, column-aligned
    with *weights*.
    """
    corpus = [cut_words(path) for path in file_list]
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(corpus)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed
    # in 1.2; prefer get_feature_names_out() and fall back on old versions.
    try:
        feature_names = list(vectorizer.get_feature_names_out())
    except AttributeError:
        feature_names = vectorizer.get_feature_names()
    return tfidf.toarray(), feature_names
def get_word_vector(file_path, tfidf_weight, feature_names, doc_index=None):
    """Build a vector for one document: term count * TF-IDF weight.

    *tfidf_weight* is the (n_docs, n_features) matrix returned by
    get_tfidf(), and *feature_names* its column-aligned vocabulary.
    *doc_index* selects this document's row of the matrix; when None
    (old 3-argument callers) the per-feature maximum over all documents
    is used as the weight instead — TODO confirm this matches intent.

    Bug fixed: the original did ``tfidf_weight[index]`` with a *feature*
    index, selecting the wrong document row and raising ValueError when
    that row array was assigned into a scalar vector slot.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    # Count non-stopword tokens in this document.
    counts = {}
    for word in jieba.cut(content):
        word = word.strip()
        if word and word not in stopwords:
            counts[word] = counts.get(word, 0) + 1
    # Map feature name -> column index once; list.index() was O(V) per word.
    index_of = {name: i for i, name in enumerate(feature_names)}
    weights = np.asarray(tfidf_weight)
    if doc_index is not None:
        row = weights[doc_index]
    else:
        row = weights.max(axis=0)
    word_vector = np.zeros(len(feature_names))
    for word, count in counts.items():
        idx = index_of.get(word)
        if idx is not None:
            word_vector[idx] = count * row[idx]
    return word_vector
if __name__ == '__main__':
    # Segment every document under ./data, fit TF-IDF over the corpus,
    # then print one weighted term vector per document.
    corpus_dir = 'data'
    paths = load_files(corpus_dir)
    weights, vocab = get_tfidf(paths)
    for file_path in paths:
        word_vector = get_word_vector(file_path, weights, vocab)
        print(f'{file_path} 的词向量为:{word_vector}')
```
其中,假设文本文件夹为`data`,停用词表为`stopwords.txt`,代码中的`cut_words`函数用于对文本进行分词并去除停用词,`load_files`函数用于加载文件夹中的所有文本,`get_tfidf`函数用于计算tf-idf权重矩阵,`get_word_vector`函数用于根据文档中的非停用词及其TF-IDF值生成该文档的词向量。最后,遍历所有文本文件,分别输出其对应的词向量。
阅读全文