用python实现K-means聚类算法
时间: 2023-06-29 14:02:25 浏览: 190
基于python的K-Means聚类算法设计与实现
好的,以下是用Python实现K-means聚类算法的代码:
```python
import numpy as np
from sklearn.datasets import make_blobs
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two feature vectors."""
    diff = x1 - x2
    return np.sqrt(np.dot(diff, diff))
class KMeans:
    """Lloyd's K-means clustering.

    Samples are assigned to the nearest of ``k`` centroids (Euclidean
    distance); centroids are then recomputed as cluster means. Iteration
    stops on convergence (centroids unchanged) or after ``max_iters``.
    """

    def __init__(self, k=5, max_iters=100, plot_steps=False):
        self.k = k
        self.max_iters = max_iters
        self.plot_steps = plot_steps
        # list of sample indices for each cluster
        self.clusters = [[] for _ in range(self.k)]
        # mean feature vector for each cluster
        self.centroids = []

    def predict(self, X):
        """Cluster the rows of ``X`` and return an array of cluster labels.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,) with values in ``range(k)``.
        """
        self.X = X
        self.n_samples, self.n_features = X.shape

        # initialize centroids from k distinct random samples
        random_sample_idxs = np.random.choice(self.n_samples, self.k, replace=False)
        self.centroids = [self.X[idx] for idx in random_sample_idxs]

        # optimization loop: assign -> recompute -> check convergence
        for _ in range(self.max_iters):
            self.clusters = self._create_clusters(self.centroids)
            if self.plot_steps:
                self.plot()

            centroids_old = self.centroids.copy()
            self.centroids = self._get_centroids(self.clusters)

            if self._is_converged(centroids_old, self.centroids):
                break

        if self.plot_steps:
            self.plot()

        return self._get_cluster_labels(self.clusters)

    def _get_cluster_labels(self, clusters):
        """Map each sample index to the index of the cluster containing it."""
        labels = np.empty(self.n_samples)
        for cluster_idx, cluster in enumerate(clusters):
            for sample_idx in cluster:
                labels[sample_idx] = cluster_idx
        return labels

    def _create_clusters(self, centroids):
        """Assign every sample to its nearest centroid; return index lists."""
        clusters = [[] for _ in range(self.k)]
        for idx, sample in enumerate(self.X):
            centroid_idx = self._closest_centroid(sample, centroids)
            clusters[centroid_idx].append(idx)
        return clusters

    def _closest_centroid(self, sample, centroids):
        """Return the index of the centroid nearest to ``sample``."""
        distances = [np.linalg.norm(sample - point) for point in centroids]
        return np.argmin(distances)

    def _get_centroids(self, clusters):
        """Recompute each centroid as the mean of its assigned samples."""
        centroids = np.zeros((self.k, self.n_features))
        for cluster_idx, cluster in enumerate(clusters):
            if cluster:
                centroids[cluster_idx] = np.mean(self.X[cluster], axis=0)
            else:
                # BUGFIX: np.mean over an empty cluster yields nan; keep
                # the previous centroid instead so the run stays valid.
                centroids[cluster_idx] = self.centroids[cluster_idx]
        return centroids

    def _is_converged(self, centroids_old, centroids):
        """True when no centroid moved between iterations."""
        distances = [
            np.linalg.norm(centroids_old[i] - centroids[i]) for i in range(self.k)
        ]
        return np.sum(distances) == 0

    def plot(self):
        """Scatter-plot each cluster and mark centroids with black crosses."""
        # BUGFIX: matplotlib was never imported in the original listing,
        # so plot() raised NameError. Import lazily so the class works
        # without matplotlib unless plotting is actually requested.
        import matplotlib.pyplot as plt

        fig, ax = plt.subplots(figsize=(12, 8))
        for i, index in enumerate(self.clusters):
            point = self.X[index].T
            ax.scatter(*point)
        for point in self.centroids:
            ax.scatter(*point, marker="x", color='black', linewidth=2)
        plt.show()
```
这里我们先定义了一个 `euclidean_distance` 函数来计算两个样本之间的欧几里得距离。然后定义了 `KMeans` 类,其中我们定义了聚类数、最大迭代次数和是否绘制聚类步骤的标志。在 `predict` 方法中进行了优化迭代,首先初始化聚类中心,然后根据聚类中心更新聚类并更新聚类中心,直到算法收敛或达到最大迭代次数。最后返回聚类标签。在 `_create_clusters` 方法中,我们根据每个样本到聚类中心的距离将其分配到相应的聚类中。在 `_get_centroids` 方法中,我们计算每个聚类的平均特征向量,并返回平均特征向量作为聚类中心。在 `_is_converged` 方法中,我们使用欧几里得距离判断算法是否收敛。最后,我们在 `plot` 方法中绘制了每个聚类的数据点和聚类中心。注意:上面的代码没有导入 matplotlib,若要使用 `plot` 方法(即 `plot_steps=True` 时),需先执行 `import matplotlib.pyplot as plt`,否则会抛出 `NameError`。
阅读全文