k均值(k-means)算法的 Python 实现
时间: 2023-12-07 22:05:52 浏览: 80
以下是使用Python实现k均值算法的示例代码:
```python
import matplotlib.pyplot as plt
import numpy as np
from numpy import *
# Compute the Euclidean distance between two vectors.
def euclDistance(vector1, vector2):
    """Return the Euclidean (L2) distance between two equal-shaped vectors.

    Args:
        vector1: Array-like coordinates of the first point (list, ndarray,
            or a single-row ``numpy.matrix``).
        vector2: Array-like coordinates of the second point, with the same
            shape as ``vector1``.

    Returns:
        The distance as a NumPy floating-point scalar.
    """
    # Converting to plain float arrays keeps the arithmetic element-wise for
    # matrix inputs and lets ordinary Python lists be passed in directly.
    diff = np.asarray(vector2, dtype=float) - np.asarray(vector1, dtype=float)
    return np.sqrt(np.sum(diff ** 2))
# Initialise the centroids; k is the number of clusters.
def initCentroids(dataSet, k):
    """Pick ``k`` samples of ``dataSet`` at random as the initial centroids.

    Args:
        dataSet: 2-D array-like of shape ``(numSamples, dim)``.
        k: Number of centroids to draw.

    Returns:
        A float ndarray of shape ``(k, dim)`` whose rows are copies of
        randomly selected rows of ``dataSet``.
    """
    data = np.asarray(dataSet, dtype=float)
    numSamples = data.shape[0]
    # Draw without replacement so two centroids never start on the same
    # sample, which would leave one cluster empty. Repetition is only allowed
    # when more centroids than samples are requested, where it is unavoidable.
    chosen = np.random.choice(numSamples, size=k, replace=k > numSamples)
    # Fancy indexing returns a copy, so the caller may update the centroids
    # freely without touching the data set.
    return data[chosen, :]
# k-means clustering.
def kMeans(dataSet, k, initialCentroids=None):
    """Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

    Args:
        dataSet: 2-D array-like of shape ``(numSamples, dim)``.
        k: Number of clusters; must be at least 1.
        initialCentroids: Optional array-like of shape ``(k, dim)`` used as
            the starting centroids. When omitted, ``initCentroids`` picks
            them at random, so results vary between runs.

    Returns:
        A tuple ``(centroids, clusterAssment)``. ``centroids`` is a float
        ndarray of shape ``(k, dim)``. ``clusterAssment`` is a
        ``numpy.matrix`` of shape ``(numSamples, 2)``: column 0 holds the
        index of the cluster each sample belongs to, column 1 the squared
        distance from the sample to that cluster's centroid.

    Raises:
        ValueError: If ``k`` is smaller than 1, or ``initialCentroids`` does
            not contain exactly ``k`` rows.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    data = np.asarray(dataSet, dtype=float)
    numSamples = data.shape[0]

    if initialCentroids is None:
        centroids = np.array(initCentroids(data, k), dtype=float)
    else:
        centroids = np.array(initialCentroids, dtype=float)
        if centroids.ndim != 2 or centroids.shape[0] != k:
            raise ValueError("initialCentroids must contain exactly k rows")

    # -1 matches no real cluster index, so the first pass always counts as a
    # change and the centroids get refined at least once.
    labels = np.full(numSamples, -1, dtype=int)
    sqDists = np.zeros(numSamples)
    clusterChanged = True
    while clusterChanged:
        # Squared distance from every sample to every centroid, shape
        # (numSamples, k). Taking the true minimum avoids a fixed
        # "large enough" sentinel that breaks on widely spread data.
        diffs = data[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        allSqDists = np.sum(diffs ** 2, axis=2)
        # argmin keeps the lowest index on ties, like a strict "<" scan.
        newLabels = np.argmin(allSqDists, axis=1)
        # Distances are refreshed on every pass, not only for samples that
        # switched cluster, so they never go stale when a centroid moves.
        sqDists = allSqDists[np.arange(numSamples), newLabels]
        clusterChanged = bool(np.any(newLabels != labels))
        labels = newLabels

        # Move each centroid to the mean of the samples assigned to it.
        for j in range(k):
            pointsInCluster = data[labels == j]
            # An empty cluster keeps its previous centroid; averaging zero
            # points would turn it into NaN.
            if len(pointsInCluster) > 0:
                centroids[j, :] = pointsInCluster.mean(axis=0)

    print('Cluster complete!')
    clusterAssment = np.asmatrix(np.column_stack((labels, sqDists)))
    return centroids, clusterAssment
# Visualise the clustering result.
def showCluster(dataSet, k, centroids, clusterAssment):
    """Plot 2-D samples coloured by cluster, together with the centroids.

    Args:
        dataSet: 2-D array-like of shape ``(numSamples, 2)``.
        k: Number of centroids to draw.
        centroids: Array-like of shape ``(k, 2)`` with the centroid
            coordinates.
        clusterAssment: 2-D array-like whose first column holds the cluster
            index of each sample.

    Returns:
        ``1`` if the data is not two-dimensional (nothing is drawn),
        otherwise ``None`` after the figure has been shown.
    """
    data = np.asarray(dataSet)
    _, dim = data.shape
    if dim != 2:
        print("Sorry! I can not draw because the dimension of your data is not 2!")
        return 1

    labels = np.asarray(clusterAssment)[:, 0].astype(int)
    centers = np.asarray(centroids)

    # Draw each cluster's samples with its own colour/marker. One plot call
    # per cluster replaces one call per sample, and the modulo cycles through
    # the styles so more than ten clusters no longer raise IndexError.
    sampleMarks = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    for label in np.unique(labels):
        members = data[labels == label]
        plt.plot(members[:, 0], members[:, 1],
                 sampleMarks[label % len(sampleMarks)])

    # Draw the centroids with larger, distinct markers.
    centroidMarks = ['*r', '*b', '*g', '*k', '^b', '+b', 'sb', 'db', '<b', 'pb']
    for i in range(k):
        plt.plot(centers[i, 0], centers[i, 1],
                 centroidMarks[i % len(centroidMarks)], markersize=20)
    plt.show()
# Demo: cluster random points and plot the result.
if __name__ == '__main__':
    # Generate 100 random 2-D test points. np.asmatrix replaces the `mat`
    # alias, which was removed in NumPy 2.0.
    dataMat = np.asmatrix(np.random.rand(100, 2))
    # Number of clusters.
    k = 4
    # Run k-means clustering.
    centroids, clusterAssment = kMeans(dataMat, k)
    # Visualise the clustering result.
    showCluster(dataMat, k, centroids, clusterAssment)
```
阅读全文