对中文文本进行Kmeans聚类并可视化
时间: 2024-06-09 13:09:12 浏览: 202
好的，下面是一个简单的例子，使用 Python 的 scikit-learn（sklearn）库对中文文本进行 K-means 聚类并可视化。
首先，我们需要准备一些中文文本数据。这里我们使用一个包含多篇新闻的文本数据集（下文代码假定每行一篇文档，保存为 news.txt），可以从这里下载：https://github.com/crownpku/Information-Retrieval-Course/tree/master/data
接下来，我们需要对文本进行预处理，包括中文分词、去除停用词、提取文本特征等。这里我们使用 jieba 和 sklearn 库来完成这些任务。
```python
import jieba
import jieba.analyse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# Load the text data
def load_data(file_path):
    """Read a UTF-8 text file and return its lines as a list of documents.

    Each line of the file is treated as one document. Leading and trailing
    whitespace (including the newline) is stripped from every line; blank
    lines are kept as empty strings.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one stripped string per line of the file.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]
# Chinese word segmentation
def chinese_word_cut(text):
    """Segment text with jieba and return the tokens joined by single spaces.

    Args:
        text: The raw string to segment.

    Returns:
        One string in which the tokens produced by ``jieba.cut`` are
        separated by a single space character.
    """
    tokens = jieba.cut(text)
    separator = " "
    return separator.join(tokens)
# Stop-word list
def get_stop_words(file_path):
    """Load a stop-word list from a UTF-8 text file, one entry per line.

    Surrounding whitespace is stripped from every line. Blank lines and
    duplicates are kept as-is, so the result has one entry per file line.

    Args:
        file_path: Path of the stop-word file.

    Returns:
        A list of stripped strings in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        entries = list(map(str.strip, source))
    return entries
# Extract TF-IDF features
def _segment_to_tokens(text):
    """Return the jieba tokens of ``text`` as a list, without whitespace tokens."""
    # chinese_word_cut() returns ONE space-joined string; splitting it yields
    # the list of tokens that TfidfVectorizer expects from a tokenizer and
    # discards the whitespace tokens jieba emits for already-segmented input.
    return chinese_word_cut(text).split()


def get_tfidf_features(data, stop_words):
    """Build a TF-IDF matrix from a list of documents.

    The documents may be raw text or text that has already been segmented
    and joined with spaces; both are tokenized with jieba here.

    Args:
        data: List of document strings.
        stop_words: List of tokens to drop before weighting.

    Returns:
        A ``(tfidf_matrix, tfidf_vectorizer)`` tuple: the sparse
        document-term matrix and the fitted ``TfidfVectorizer``.
    """
    tfidf_vectorizer = TfidfVectorizer(
        # The tokenizer must return a list of tokens. Passing a function that
        # returns a string makes scikit-learn iterate it character by
        # character, so every feature becomes a single character.
        tokenizer=_segment_to_tokens,
        # token_pattern is ignored when a tokenizer is given; None avoids the
        # "parameter will not be used" warning.
        token_pattern=None,
        stop_words=stop_words,
        max_df=0.95,  # ignore terms present in more than 95% of documents
        min_df=2,     # ignore terms present in fewer than 2 documents
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(data)
    return tfidf_matrix, tfidf_vectorizer
# K-means clustering
def kmeans_cluster(tfidf_matrix, n_clusters, random_state=None):
    """Fit a K-means model on the TF-IDF matrix.

    Args:
        tfidf_matrix: Feature matrix with one row per document.
        n_clusters: Number of clusters to form.
        random_state: Optional seed forwarded to ``KMeans`` so that runs can
            be reproduced. Defaults to ``None`` (non-deterministic seeding).

    Returns:
        The fitted ``KMeans`` estimator; cluster assignments are available
        in its ``labels_`` attribute.
    """
    # NOTE: the former ``n_jobs=-1`` argument is intentionally omitted. It was
    # deprecated in scikit-learn 0.23 and removed in 1.0, where passing it
    # raises TypeError.
    km_cluster = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        n_init=40,
        max_iter=300,
        random_state=random_state,
    )
    km_cluster.fit(tfidf_matrix)
    return km_cluster
# Visualize the clustering result
def plot_cluster(tfidf_matrix, km_cluster, n_clusters):
    """Draw a 2-D scatter plot of the documents, coloured by cluster.

    The TF-IDF matrix is projected onto two components with truncated SVD
    before plotting. Plotting raw columns 0 and 1 would only show the weights
    of two arbitrary vocabulary terms, leaving most points at the origin.

    Args:
        tfidf_matrix: Feature matrix with one row per document (sparse or
            dense); it needs at least two feature columns.
        km_cluster: Fitted clustering model exposing ``labels_``.
        n_clusters: Number of clusters to draw (labels ``0 .. n_clusters-1``).
    """
    # Function-scope import so the file's top-level imports need no change.
    from sklearn.decomposition import TruncatedSVD

    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese
    plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly

    # TruncatedSVD accepts sparse input directly, so the full matrix is never
    # densified; a fixed seed keeps the layout stable between runs.
    points = TruncatedSVD(n_components=2, random_state=0).fit_transform(tfidf_matrix)
    labels = km_cluster.labels_

    _, ax = plt.subplots(figsize=(10, 6))
    # White is left out of the cycle: it is invisible on the default background.
    colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
    for i in range(n_clusters):
        members = points[labels == i]
        ax.scatter(
            members[:, 0],
            members[:, 1],
            s=30,
            c=colors[i % len(colors)],
            label='Cluster %d' % i,
        )
    ax.legend()
    ax.grid(True)
    ax.set_xlabel('Component 1')
    ax.set_ylabel('Component 2')
    ax.set_title('Kmeans Clustering')
    plt.show()
if __name__ == '__main__':
    # Number of clusters to form.
    n_clusters = 3

    # Load the corpus: one document per line.
    data = load_data('news.txt')

    # Segment every document into space-separated words.
    data_cut = [chinese_word_cut(document) for document in data]

    # Load the stop-word list.
    stop_words = get_stop_words('stop_words.txt')

    # Turn the segmented documents into TF-IDF features.
    tfidf_matrix, tfidf_vectorizer = get_tfidf_features(data_cut, stop_words)

    # Cluster the documents with K-means.
    km_cluster = kmeans_cluster(tfidf_matrix, n_clusters)

    # Show the clusters as a scatter plot.
    plot_cluster(tfidf_matrix, km_cluster, n_clusters)
```
运行完上述代码后，我们可以得到一个可视化的聚类结果，如下图所示：
![Kmeans聚类可视化结果](https://img-blog.csdn.net/20180220132937488?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvZGF2aWRzbWFzZG9uZw==/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/q/80)
阅读全文