解释代码if clusterAssment[i, 0] != minIndex: clusterChange = True clusterAssment[i, :] = minIndex, minDist ** 2
时间: 2023-05-24 22:05:01 浏览: 47
这段代码是聚类算法中更新样本点簇归属(cluster assignment)的操作,而不是更新聚类中心本身。其中,clusterAssment 变量是一个矩阵,第一列记录每个样本点所属的簇,第二列记录该点到其簇中心的距离平方。minIndex 和 minDist 分别代表当前样本点距离最近的聚类中心的编号及该最近距离。代码中的 if 语句判断:若当前点已属于距离最近的簇,则无需改动;否则将当前点归入最近的簇,更新它与该簇中心的距离平方,并把标志位 clusterChange 置为 True,表示聚类结果发生了改变,算法需要继续迭代。
相关问题
python实现k-means算法简单案例
好的,下面是一个简单的Python实现k-means算法的案例:
```python
import numpy as np
import matplotlib.pyplot as plt
def kMeans(dataSet, k):
    """Cluster dataSet into k groups using Lloyd's k-means algorithm.

    Args:
        dataSet: (m, n) ndarray of m samples with n features.
        k: number of clusters; must satisfy k <= m.

    Returns:
        centroids: (k, n) ndarray of final cluster centers.
        clusterAssment: (m, 2) ndarray; column 0 is the assigned cluster
            index, column 1 the squared distance to that cluster's center.
    """
    m, n = dataSet.shape
    # Draw k *distinct* rows as initial centroids: the original
    # int(np.random.uniform(0, m)) loop can pick the same sample twice,
    # yielding duplicate centroids and a permanently empty cluster.
    centroids = dataSet[np.random.choice(m, k, replace=False), :].astype(float)
    # Created ONCE, outside the while loop. The original re-created it as
    # zeros on every pass, erasing all previous assignments, so the
    # "did anything change?" comparison always saw zeros and the loop
    # could never terminate whenever any point belonged to a cluster != 0.
    clusterAssment = np.zeros((m, 2))
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            # Vectorized distance from sample i to every centroid.
            dists = np.sqrt(np.sum((centroids - dataSet[i, :]) ** 2, axis=1))
            minIndex = int(np.argmin(dists))
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            # Record unconditionally so the distance column stays current
            # even when the cluster index did not change.
            clusterAssment[i, :] = minIndex, dists[minIndex] ** 2
        for j in range(k):
            pointsInCluster = dataSet[np.nonzero(clusterAssment[:, 0] == j)]
            if pointsInCluster.shape[0] > 0:
                # Skip empty clusters: the mean of zero rows is NaN and
                # would poison every later distance computation.
                centroids[j, :] = np.mean(pointsInCluster, axis=0)
    return centroids, clusterAssment
def show(dataSet, k, centroids, clusterAssment):
    """Draw a 2-D scatter plot of the clustered data.

    Each cluster is drawn in its own color (cycling through a fixed
    palette) and every centroid is marked with a black 'x'.
    Prints a message and returns without plotting when the data is not
    2-dimensional.
    """
    m, n = dataSet.shape
    if n != 2:
        print("Dimension of dataSet should be 2!")
        return
    palette = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
    labels = clusterAssment[:, 0]
    for cluster in range(k):
        members = dataSet[np.nonzero(labels == cluster)]
        plt.scatter(members[:, 0], members[:, 1],
                    marker='o', c=palette[cluster % len(palette)], alpha=0.5)
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', c='k', s=100, linewidths=3)
    plt.title('k-means')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()
if __name__ == '__main__':
    # Demo: cluster 200 uniformly random 2-D points into 3 groups and plot.
    points = np.random.rand(200, 2)
    num_clusters = 3
    final_centroids, assignments = kMeans(points, num_clusters)
    show(points, num_clusters, final_centroids, assignments)
```
该案例中,我们首先随机生成一组数据,然后输入数据和k值,即可得到聚类结果。其中,函数`kMeans()`实现了k-means算法,函数`show()`用于绘制聚类结果的图表。
k均值算法python
以下是使用Python实现k均值算法的示例代码:
```python
from numpy import *
import matplotlib.pyplot as plt
# Euclidean distance between two vectors.
def euclDistance(vector1, vector2):
    """Return the Euclidean (L2) distance between vector1 and vector2."""
    diff = vector2 - vector1
    # power() is element-wise, which also works for np.matrix rows
    # (where ** 2 would mean matrix multiplication).
    return sqrt(power(diff, 2).sum())
# Initialise the cluster centroids; k is the number of clusters.
def initCentroids(dataSet, k):
    """Pick k distinct random samples of dataSet as the initial centroids.

    Sampling without replacement fixes the original defect where
    int(random.uniform(0, numSamples)) could draw the same sample twice,
    producing duplicate centroids and hence a permanently empty cluster.

    Args:
        dataSet: (numSamples, dim) array/matrix of samples.
        k: number of centroids to pick; must be <= numSamples.

    Returns:
        (k, dim) ndarray whose rows are distinct samples of dataSet.
    """
    numSamples, dim = dataSet.shape
    centroids = zeros((k, dim))
    # `random` here is numpy.random (the file uses `from numpy import *`).
    chosen = random.choice(numSamples, k, replace=False)
    for i in range(k):
        centroids[i, :] = dataSet[chosen[i], :]
    return centroids
# k-means clustering.
def kMeans(dataSet, k):
    """Cluster dataSet into k groups with Lloyd's k-means algorithm.

    Args:
        dataSet: (numSamples, dim) matrix of samples.
        k: number of clusters.

    Returns:
        centroids: (k, dim) array of final cluster centers.
        clusterAssment: (numSamples, 2) matrix; column 0 holds the assigned
            cluster index, column 1 the squared distance to that center.
    """
    numSamples = dataSet.shape[0]
    # Column 0: cluster index of each sample; column 1: squared distance
    # from the sample to its cluster's centroid.
    clusterAssment = mat(zeros((numSamples, 2)))
    clusterChanged = True
    # Initial centroids.
    centroids = initCentroids(dataSet, k)
    while clusterChanged:
        clusterChanged = False
        # Assign every sample to its nearest centroid.
        for i in range(numSamples):
            # inf instead of the old magic bound 100000.0, which silently
            # misbehaves on data whose coordinates exceed that scale.
            minDist = inf
            minIndex = 0
            for j in range(k):
                distance = euclDistance(centroids[j, :], dataSet[i, :])
                if distance < minDist:
                    minDist = distance
                    minIndex = j
            # Flag when the sample's cluster membership changed.
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            # Record unconditionally so the distance column is refreshed
            # even when the cluster index stayed the same.
            clusterAssment[i, :] = minIndex, minDist ** 2
        # Recompute each centroid as the mean of its members.
        for j in range(k):
            pointsInCluster = dataSet[nonzero(clusterAssment[:, 0].A == j)[0]]
            if pointsInCluster.shape[0] > 0:
                # Guard against empty clusters: the mean over zero rows is
                # NaN and would corrupt every subsequent distance.
                centroids[j, :] = mean(pointsInCluster, axis=0)
    print('Cluster complete!')
    return centroids, clusterAssment
# Visualise the clustering result.
def showCluster(dataSet, k, centroids, clusterAssment):
    """Plot 2-D samples colored per cluster and mark centroids with stars.

    Prints a message and returns 1 (plotting nothing) when the data is
    not 2-dimensional.
    """
    numSamples, dim = dataSet.shape
    if dim != 2:
        print("Sorry! I can not draw because the dimension of your data is not 2!")
        return 1
    # Marker/colour style per cluster for the sample points.
    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    for i in range(numSamples):
        markIndex = int(clusterAssment[i, 0])
        # Wrap with modulo: with more than len(mark) clusters the original
        # direct indexing raised IndexError.
        plt.plot(dataSet[i, 0], dataSet[i, 1], mark[markIndex % len(mark)])
    # Marker/colour style per centroid.
    mark = ['*r', '*b', '*g', '*k', '^b', '+b', 'sb', 'db', '<b', 'pb']
    for i in range(k):
        plt.plot(centroids[i, 0], centroids[i, 1], mark[i % len(mark)], markersize=20)
    plt.show()
# Test driver.
if __name__ == '__main__':
    # Demo: cluster 100 random 2-D points into 4 groups and plot the result.
    samples = mat(random.rand(100, 2))
    clusterCount = 4
    finalCentroids, finalAssment = kMeans(samples, clusterCount)
    showCluster(samples, clusterCount, finalCentroids, finalAssment)
```