Python 实现数据集的 K-means 聚类
时间: 2023-11-29 19:04:43 浏览: 127
K-means 是一种经典的聚类算法，其目标是将数据集划分为 K 个簇，使得簇内样本的相似度高、簇间样本的相似度低。下面是使用 Python 实现 K-means 聚类的示例代码：
```python
import numpy as np
import matplotlib.pyplot as plt
# Generate a random dataset: 100 samples with 2 features each,
# drawn uniformly from [0, 1).
data = np.random.rand(100, 2)
# Compute the distance between two points
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Args:
        x1: First point; any array-like of real numbers.
        x2: Second point; array-like broadcastable against ``x1``.

    Returns:
        The square root of the sum of squared coordinate differences.
    """
    # Coerce to float arrays so plain lists/tuples are accepted and
    # unsigned-integer inputs cannot wrap around on subtraction.
    diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return np.sqrt(np.sum(diff ** 2))
# Initialize the cluster centers
def initialize_centroids(data, k):
    """Pick ``k`` distinct samples of ``data`` as the starting centroids.

    Samples are drawn without replacement, so the same row index is never
    chosen twice. (Rows that happen to hold identical values can still
    produce equal centroids.)

    Args:
        data: 2-D array-like of shape (n_samples, n_features).
        k: Number of centroids to select.

    Returns:
        A new float array of shape (k, n_features).

    Raises:
        ValueError: If ``k`` is not between 1 and ``len(data)``.
    """
    data = np.asarray(data, dtype=float)
    if not 1 <= k <= len(data):
        raise ValueError(
            f"k must be between 1 and the number of samples ({len(data)}), got {k}"
        )
    # replace=False avoids duplicate seeds, which would otherwise leave
    # one of the duplicated centroids with an empty cluster.
    chosen = np.random.choice(len(data), size=k, replace=False)
    # Fancy indexing already returns a copy, so callers cannot alias ``data``.
    return data[chosen]
# Assign every sample to its nearest cluster center
def assign_clusters(data, centroids):
    """Group sample indices by the centroid each sample is closest to.

    Args:
        data: Iterable of samples (rows).
        centroids: Sequence of centroid points.

    Returns:
        A list with one entry per centroid; entry ``i`` is the list of
        indices into ``data`` whose nearest centroid is ``centroids[i]``.
        A centroid that attracts no samples gets an empty list.
    """
    clusters = [[] for _ in centroids]
    for sample_idx, sample in enumerate(data):
        nearest = closest_centroid(sample, centroids)
        clusters[nearest].append(sample_idx)
    return clusters
# Find the nearest cluster center
def closest_centroid(sample, centroids):
    """Return the index of the centroid nearest to ``sample``.

    Args:
        sample: A single point; array-like of shape (n_features,).
        centroids: 2-D array-like of shape (n_centroids, n_features).

    Returns:
        The integer index of the closest centroid by Euclidean distance.
        Ties resolve to the lowest index.
    """
    centroids = np.asarray(centroids, dtype=float)
    offsets = centroids - np.asarray(sample, dtype=float)
    # Squared distance has the same minimiser as the true distance, so the
    # square root is skipped; all centroids are handled in one NumPy call.
    squared_distances = np.sum(offsets ** 2, axis=1)
    return int(np.argmin(squared_distances))
# Update the cluster centers
def update_centroids(data, clusters):
    """Recompute each centroid as the mean of the samples assigned to it.

    Args:
        data: 2-D array-like of shape (n_samples, n_features).
        clusters: Sequence of index lists, one per cluster, holding the
            row indices of ``data`` that belong to that cluster.

    Returns:
        A new float array of shape (len(clusters), n_features).
    """
    data = np.asarray(data, dtype=float)
    centroids = np.zeros((len(clusters), data.shape[1]))
    for cluster_idx, members in enumerate(clusters):
        if len(members) > 0:
            centroids[cluster_idx] = data[members].mean(axis=0)
        else:
            # The mean of an empty cluster is NaN, and a NaN centroid never
            # compares equal to itself, so an equality-based convergence
            # test would never succeed. Re-seed from a random sample.
            centroids[cluster_idx] = data[np.random.randint(len(data))]
    return centroids
# Check whether the clustering has converged
def has_converged(old_centroids, centroids):
    """Report whether two centroid collections contain the same points.

    The comparison is exact (no tolerance) and order-insensitive: each
    collection is reduced to a set of coordinate tuples and the two sets
    are compared.

    Args:
        old_centroids: Iterable of centroid points from the previous step.
        centroids: Iterable of centroid points from the current step.

    Returns:
        True if both collections hold exactly the same set of points.
        A centroid with a NaN coordinate never matches, because NaN does
        not compare equal to itself.
    """
    previous = {tuple(point) for point in old_centroids}
    current = {tuple(point) for point in centroids}
    return previous == current
# K-means algorithm
def kmeans(data, k, max_iters=300):
    """Cluster ``data`` into ``k`` groups with the K-means iteration.

    Repeats assign-then-update until the centroids stop changing or
    ``max_iters`` iterations have run, whichever comes first.

    Args:
        data: 2-D array of shape (n_samples, n_features).
        k: Number of clusters.
        max_iters: Upper bound on the number of iterations, so the loop
            always terminates even if convergence is never detected.

    Returns:
        A ``(centroids, clusters)`` pair: the final centroid array and the
        per-cluster lists of sample indices from the last assignment step.

    Raises:
        ValueError: If ``max_iters`` is less than 1.
    """
    if max_iters < 1:
        raise ValueError("max_iters must be at least 1")

    centroids = initialize_centroids(data, k)
    for _ in range(max_iters):
        # update_centroids builds a new array, so keeping a reference to
        # the previous one is enough; no copy is required.
        old_centroids = centroids
        clusters = assign_clusters(data, old_centroids)
        centroids = update_centroids(data, clusters)
        if has_converged(old_centroids, centroids):
            break
    return centroids, clusters
# Plot the clustering result
def plot_clusters(data, clusters, centroids):
    """Draw a 2-D scatter plot of the clusters and their centroids.

    Each cluster's samples are drawn in one colour (colours cycle when
    there are more clusters than palette entries) and each centroid is
    drawn as a large black "x". Only the first two features are plotted.

    Args:
        data: 2-D array-like of shape (n_samples, n_features).
        clusters: Sequence of index lists, one per cluster.
        centroids: Sequence of centroid points, aligned with ``clusters``.
    """
    colors = ['r', 'g', 'b', 'y', 'c', 'm']
    data = np.asarray(data)
    fig, ax = plt.subplots()
    for cluster_idx, cluster in enumerate(clusters):
        cluster_color = colors[cluster_idx % len(colors)]
        # One scatter call per cluster instead of one per sample; the
        # explicit int dtype keeps an empty cluster a valid index array.
        points = data[np.asarray(cluster, dtype=int)]
        ax.scatter(points[:, 0], points[:, 1], c=cluster_color)
        centroid = centroids[cluster_idx]
        ax.scatter(centroid[0], centroid[1], c='black', marker='x', s=150)
    plt.show()
# Run the K-means algorithm on the generated data, asking for 3 clusters
centroids, clusters = kmeans(data, k=3)
# Plot the clustering result
plot_clusters(data, clusters, centroids)
```
该代码生成了一个随机数据集，并将其划分为 3 个簇。由于数据和初始聚类中心都是随机选取的，每次运行的结果可能不同；可以通过修改数据集和 k 值来尝试不同的聚类效果。
阅读全文