写一段 k-means 聚类的 Python 代码
时间: 2023-12-01 19:10:01 浏览: 60
K-means聚类python代码
好的,以下是一个简单的 k-means 聚类的 Python 代码,用于将数据集分为 K 个簇:
```python
import numpy as np
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    The element-wise difference is squared, summed over all elements and
    square-rooted, so the result is a single scalar.
    """
    squared_diff = np.square(x1 - x2)
    return np.sqrt(squared_diff.sum())
class KMeans:
    """Minimal k-means clustering (Lloyd's algorithm) on a 2-D NumPy array.

    Parameters
    ----------
    k : int
        Number of clusters to form.
    max_iters : int
        Upper bound on the number of assign/update iterations.
    plot_steps : bool
        When true, ``plot`` is called after each assignment step and once
        more after the final assignment (requires matplotlib).
    """

    def __init__(self, k=5, max_iters=100, plot_steps=False):
        self.k = k
        self.max_iters = max_iters
        self.plot_steps = plot_steps
        # Cluster centres, shape (k, n_features); set by ``predict``.
        self.centroids = None

    def _initialize_centroids(self, X):
        """Pick ``k`` distinct rows of ``X`` at random as starting centroids.

        Raises
        ------
        ValueError
            If ``k`` is not in ``1..n_samples``.
        """
        n_samples = X.shape[0]
        if not 1 <= self.k <= n_samples:
            raise ValueError(
                f"k must be between 1 and the number of samples "
                f"({n_samples}), got {self.k}"
            )
        chosen = np.random.choice(n_samples, self.k, replace=False)
        # Fancy indexing copies, so later updates never alias rows of X.
        self.centroids = X[chosen]

    def _create_clusters(self, X):
        """Assign every sample index to the cluster of its nearest centroid.

        Returns a list of ``k`` lists of row indices into ``X``.
        """
        clusters = [[] for _ in range(self.k)]
        for idx, sample in enumerate(X):
            clusters[self._closest_centroid(sample)].append(idx)
        return clusters

    def _closest_centroid(self, sample):
        """Return the index of the centroid nearest to ``sample``.

        Ties resolve to the lowest index (``np.argmin`` semantics).
        """
        # One vectorized norm over all centroids instead of a Python loop.
        distances = np.linalg.norm(self.centroids - sample, axis=1)
        return np.argmin(distances)

    def _calculate_centroids(self, X, clusters):
        """Move each centroid to the mean of the samples assigned to it.

        A cluster with no members keeps its previous centroid: the mean of
        an empty selection is NaN, which would poison every later distance
        computation and prevent convergence from ever being detected.
        """
        n_features = X.shape[1]
        previous = self.centroids
        centroids = np.zeros((self.k, n_features))
        for idx, members in enumerate(clusters):
            if len(members) > 0:
                centroids[idx] = X[members].mean(axis=0)
            elif previous is not None:
                centroids[idx] = previous[idx]
        self.centroids = centroids

    def _get_cluster_labels(self, clusters, X):
        """Return an array holding, for each sample, its cluster index.

        The array keeps NumPy's default float dtype for backward
        compatibility with existing callers.
        """
        y_pred = np.zeros(X.shape[0])
        for cluster_idx, members in enumerate(clusters):
            for sample_idx in members:
                y_pred[sample_idx] = cluster_idx
        return y_pred

    def predict(self, X):
        """Cluster ``X`` and return the cluster label of every sample.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Cluster index of each sample.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional or ``k`` is out of range.
        """
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features), got shape {X.shape}"
            )

        self._initialize_centroids(X)
        for _ in range(self.max_iters):
            clusters = self._create_clusters(X)
            if self.plot_steps:
                self.plot(X, self.centroids, clusters)
            prev_centroids = self.centroids
            self._calculate_centroids(X, clusters)
            if self._is_converged(prev_centroids, self.centroids):
                break

        # Final assignment against the final centroids. Done outside the
        # loop so labels are defined even when max_iters is 0.
        clusters = self._create_clusters(X)
        if self.plot_steps:
            self.plot(X, self.centroids, clusters)
        return self._get_cluster_labels(clusters, X)

    def _is_converged(self, prev_centroids, centroids):
        """Return True when the centroids did not change in the last update."""
        return np.array_equal(prev_centroids, centroids)

    def plot(self, X, centroids, clusters):
        """Scatter-plot the first two features of each cluster and centroid.

        Only invoked when ``plot_steps`` is true. matplotlib is imported
        lazily so the class stays usable without it.
        """
        import matplotlib.pyplot as plt

        _, ax = plt.subplots(figsize=(8, 6))
        for members in clusters:
            points = X[members]
            ax.scatter(points[:, 0], points[:, 1])
        ax.scatter(
            centroids[:, 0],
            centroids[:, 1],
            marker="x",
            color="black",
            linewidths=2,
        )
        plt.show()
```
使用示例(需要安装 scikit-learn 和 matplotlib):
```python
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
# Generate a reproducible synthetic dataset with three blobs.
X, y = make_blobs(centers=3, n_samples=500, random_state=1)
# Build the clusterer. plot_steps stays off: the KMeans class shown above
# has no ``plot`` method, so enabling it raises AttributeError on the
# first iteration.
kmeans = KMeans(k=3, max_iters=100, plot_steps=False)
# Run the clustering and collect one label per sample.
y_pred = kmeans.predict(X)
# Show the final result, coloured by predicted cluster.
plt.scatter(X[:, 0], X[:, 1], c=y_pred)
plt.show()
```
阅读全文