#二分k均值算法(补全代码) """ Parameters: dataSet - 数据集 k - 聚类数 distMeas - 距离类型 Returns: centroids - 类中心数组 clusterAssment - 簇分配结果(包括索引,误差两列) """ # 算法的基本流程 #首先将所有点作为一个簇,然后将该簇一分为二,之后选择其中一个簇继续进行划分,选择哪一个簇进行划分取决于对其划分是否可以最大程度降低SSE的值,上述基于SSE的划分过程不断重复,直到得到用户指定的簇数目为止。 #将所有点看成一个簇 #当簇数目小于k时 # 对于每一个簇: # 计算总误差 # 在给定的簇上面进行K-均值聚类(k=2)计算将该簇一分为二后的总误差 # 选择使得误差最小的那个簇进行划分操作 def biKmeans(dataSet, k, distMeas=distEclud):
时间: 2024-03-22 12:39:41 浏览: 51
```python
def biKmeans(dataSet, k, distMeas=distEclud):
    """Cluster ``dataSet`` into ``k`` groups using bisecting k-means.

    All points start in a single cluster.  While fewer than ``k`` clusters
    exist, every current cluster is tentatively split in two with
    ``kMeans(..., 2, distMeas)``; the split giving the lowest total SSE
    (SSE of the two new halves plus SSE of all untouched clusters) is kept.

    Parameters:
        dataSet  - data points, one row per sample.  Assumed to be a NumPy
                   matrix (the code relies on ``.tolist()[0]`` of the column
                   mean and on 2-D row slicing) -- TODO confirm with callers.
        k        - number of clusters to produce.
        distMeas - callable ``distMeas(vecA, vecB)`` returning a distance;
                   its result is squared to obtain a point's error.

    Returns:
        centroids      - matrix built from the list of cluster centres.
        clusterAssment - m x 2 matrix; column 0 is the cluster index of each
                         point, column 1 is its squared error.
    """
    m = shape(dataSet)[0]
    # Column 0: assigned cluster index, column 1: squared error of the point.
    clusterAssment = mat(zeros((m, 2)))

    # Start with one cluster whose centre is the mean of all points.
    centroid0 = mean(dataSet, axis=0).tolist()[0]
    centList = [centroid0]
    for j in range(m):
        clusterAssment[j, 1] = distMeas(mat(centroid0), dataSet[j, :]) ** 2

    while len(centList) < k:
        lowestSSE = inf
        # Try splitting each existing cluster and remember the best split.
        for i in range(len(centList)):
            ptsInCurrCluster = dataSet[
                nonzero(clusterAssment[:, 0].A == i)[0], :
            ]
            # kMeans is presumed to return (centroid matrix, assignment
            # matrix with the same index/error column layout) -- verify
            # against its definition.
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
            sseSplit = sum(splitClustAss[:, 1])
            sseNotSplit = sum(
                clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0], 1]
            )
            print("sseSplit, and notSplit: ",sseSplit,sseNotSplit)
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit

        # Map the local 0/1 labels of the winning split to global indices.
        # Label 1 must be rewritten first: once the 0-labels become
        # bestCentToSplit (which may itself equal 1) they would otherwise be
        # indistinguishable from the original 1-labels.
        bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        print('the bestCentToSplit is: ',bestCentToSplit)
        print('the len of bestClustAss is: ', len(bestClustAss))

        # Replace the split centre with the first half, append the second.
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
        centList.append(bestNewCents[1, :].tolist()[0])
        # Overwrite the rows of the split cluster with the new assignments.
        clusterAssment[
            nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :
        ] = bestClustAss

    return mat(centList), clusterAssment
```
阅读全文