实现聚类性能指标 DBI、DI、CHI、SI
时间: 2023-07-27 11:34:06 浏览: 552
聚类性能指标是用于评估聚类算法聚类效果的指标,常见的有DBI、DI、CHI、SI等。
以下是这四个指标的实现方法(以下代码均假设 X 是形状为 (样本数, 特征数) 的 NumPy 数组,labels 是取值为 0 到 k-1 的整数 NumPy 数组,k 为簇的个数):
(1)DBI(Davies-Bouldin Index):DBI指标越小,说明聚类效果越好。DBI指标的计算方法如下:
```python
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
def compute_centroids(X, labels):
    """Return the mean point of every cluster.

    Args:
        X: 2-D NumPy array with one sample per row.
        labels: NumPy array of integer cluster ids; the number of clusters
            is taken to be ``max(labels) + 1``.

    Returns:
        Float array with one row per cluster id, holding the mean of the
        samples carrying that id.
    """
    n_clusters = np.max(labels) + 1
    n_features = X.shape[1]
    centers = np.zeros((n_clusters, n_features))
    # Each ``row`` is a view into ``centers``, so writing to it fills the result.
    for cluster_id, row in enumerate(centers):
        row[:] = X[labels == cluster_id].mean(axis=0)
    return centers
def compute_S(X, labels, centroids):
    """Return the average sample-to-centroid distance of every cluster.

    Args:
        X: 2-D NumPy array with one sample per row.
        labels: NumPy array of integer cluster ids (``0 .. max(labels)``).
        centroids: Per-cluster centre points, indexable by cluster id.

    Returns:
        1-D float array; entry ``c`` is the mean Euclidean distance between
        the members of cluster ``c`` and ``centroids[c]``.
    """
    n_clusters = np.max(labels) + 1
    scatter = np.zeros(n_clusters)
    for cluster_id in range(n_clusters):
        members = X[labels == cluster_id]
        # Wrap the centroid in a list so it is treated as a one-row matrix.
        to_center = euclidean_distances(members, [centroids[cluster_id]])
        scatter[cluster_id] = to_center.mean()
    return scatter
def compute_R(X, labels, centroids, S=None):
    """Return the pairwise Davies-Bouldin similarity matrix.

    For ``i != j`` the entry is
    ``R[i, j] = (S[i] + S[j]) / ||centroids[i] - centroids[j]||``;
    the diagonal is left at 0.

    Args:
        X: 2-D NumPy array with one sample per row.
        labels: Integer cluster ids; the number of clusters is
            ``max(labels) + 1``.
        centroids: Per-cluster centre points, indexable by cluster id.
        S: Optional per-cluster scatter values. When omitted they are
            computed with ``compute_S(X, labels, centroids)``.

    Returns:
        Symmetric ``(k, k)`` float array. Two coincident centroids yield
        ``inf`` (or ``nan`` when both scatters are 0) for that pair.
    """
    k = np.max(labels) + 1
    if S is None:
        # The scatter values are required by the formula but are not part of
        # the original signature, so derive them here when not supplied.
        S = compute_S(X, labels, centroids)
    S = np.asarray(S, dtype=float)
    centroids = np.asarray(centroids, dtype=float)

    R = np.zeros((k, k))
    for i in range(k):
        # The ratio is symmetric, so compute the upper triangle and mirror it.
        for j in range(i + 1, k):
            separation = np.linalg.norm(centroids[i] - centroids[j])
            R[i, j] = R[j, i] = (S[i] + S[j]) / separation
    return R
def compute_DBI(X, labels):
    """Return the Davies-Bouldin index of a clustering (lower is better).

    The index is the average, over all clusters, of the largest similarity
    ratio ``R[i, j]`` between cluster ``i`` and any other cluster ``j``.

    Args:
        X: 2-D NumPy array with one sample per row.
        labels: NumPy array of integer cluster ids (``0 .. max(labels)``).

    Returns:
        The index as a float.

    Raises:
        ValueError: If fewer than two clusters are present, since the index
            compares each cluster against the others.
    """
    k = np.max(labels) + 1
    if k < 2:
        raise ValueError("Davies-Bouldin index requires at least 2 clusters")

    centroids = compute_centroids(X, labels)
    R = compute_R(X, labels, centroids)

    # For every cluster keep only its worst (largest) ratio against the
    # *other* clusters; the cluster's own scatter must not be added on top.
    worst_ratios = [np.max(np.delete(R[i], i)) for i in range(k)]
    return float(np.mean(worst_ratios))
```
(2)DI(Dunn Index):DI指标越大,说明聚类效果越好。DI指标的计算方法如下:
```python
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
def compute_min_intercluster_distances(X, labels):
    """Return the closest-pair distance between every two clusters.

    Args:
        X: 2-D NumPy array with one sample per row.
        labels: NumPy array of integer cluster ids (``0 .. max(labels)``).

    Returns:
        Symmetric ``(k, k)`` array; entry ``[i, j]`` is the smallest
        Euclidean distance between a member of cluster ``i`` and a member of
        cluster ``j``. Diagonal entries stay at ``inf``.
    """
    n_clusters = np.max(labels) + 1
    gaps = np.full((n_clusters, n_clusters), np.inf)
    for first in range(n_clusters):
        first_members = X[labels == first]
        # Only the upper triangle is computed; the result is mirrored.
        for second in range(first + 1, n_clusters):
            second_members = X[labels == second]
            closest = euclidean_distances(first_members, second_members).min()
            gaps[first, second] = closest
            gaps[second, first] = closest
    return gaps
def compute_max_intracluster_diameter(X, labels):
    """Return the diameter (largest member-to-member distance) per cluster.

    Args:
        X: 2-D NumPy array with one sample per row.
        labels: NumPy array of integer cluster ids (``0 .. max(labels)``).

    Returns:
        1-D float array with one diameter per cluster id; an entry is 0 when
        the pairwise distance matrix of that cluster has no rows.
    """
    n_clusters = np.max(labels) + 1
    diameters = np.zeros(n_clusters)
    for cluster_id in range(n_clusters):
        pairwise = euclidean_distances(X[labels == cluster_id])
        if len(pairwise) == 0:
            # Nothing to measure; the entry keeps its initial value of 0.
            continue
        diameters[cluster_id] = pairwise.max()
    return diameters
def compute_DI(X, labels):
    """Return the Dunn index of a clustering (higher is better).

    The index is the smallest inter-cluster distance divided by the largest
    intra-cluster diameter.

    Args:
        X: 2-D NumPy array with one sample per row.
        labels: NumPy array of integer cluster ids (``0 .. max(labels)``).

    Returns:
        The ratio described above.
    """
    smallest_gap = compute_min_intercluster_distances(X, labels).min()
    largest_diameter = compute_max_intracluster_diameter(X, labels).max()
    return smallest_gap / largest_diameter
```
(3)CHI(Calinski-Harabasz Index):CHI指标越大,说明聚类效果越好。CHI指标的计算方法如下:
```python
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
def compute_centroids(X, labels):
    """Compute the centre (feature-wise mean) of each cluster.

    Args:
        X: 2-D NumPy array with one sample per row.
        labels: NumPy array of integer cluster ids; ``max(labels) + 1``
            clusters are assumed.

    Returns:
        Float array of shape ``(number of clusters, number of features)``.
    """
    cluster_count = np.max(labels) + 1
    result = np.zeros((cluster_count, X.shape[1]))
    for cluster_id in range(cluster_count):
        members = X[labels == cluster_id]
        result[cluster_id, :] = members.mean(axis=0)
    return result
def compute_SSB(X, labels, centroids):
    """Return the between-cluster sum of squares.

    ``SSB = sum_i n_i * ||centroids[i] - mean(X)||**2`` where ``n_i`` is the
    number of samples in cluster ``i``.

    Args:
        X: 2-D array-like with one sample per row.
        labels: Integer cluster ids; the number of clusters is
            ``max(labels) + 1``.
        centroids: Per-cluster centre points, indexable by cluster id.

    Returns:
        The between-cluster dispersion as a float.
    """
    X = np.asarray(X, dtype=float)
    labels = np.asarray(labels)
    centroids = np.asarray(centroids, dtype=float)

    k = np.max(labels) + 1
    overall_centroid = X.mean(axis=0)

    total = 0.0
    for i in range(k):
        size = np.count_nonzero(labels == i)
        # The dispersion uses the *squared* distance to the global mean.
        offset = centroids[i] - overall_centroid
        total += size * float(np.dot(offset, offset))
    return total
def compute_SSW(X, labels, centroids):
    """Return the within-cluster sum of squares.

    Args:
        X: 2-D NumPy array with one sample per row.
        labels: NumPy array of integer cluster ids (``0 .. max(labels)``).
        centroids: Per-cluster centre points, indexable by cluster id.

    Returns:
        Sum, over all clusters, of the squared Euclidean distances between
        each member and its cluster centroid.
    """
    n_clusters = np.max(labels) + 1
    total = 0.0
    for cluster_id in range(n_clusters):
        members = X[labels == cluster_id]
        radii = euclidean_distances(members, [centroids[cluster_id]])
        total += np.sum(radii ** 2)
    return total
def compute_CHI(X, labels):
    """Return the Calinski-Harabasz index of a clustering (higher is better).

    The index is the between-cluster dispersion divided by ``k - 1`` over the
    within-cluster dispersion divided by ``n - k``, where ``k`` is the number
    of clusters and ``n`` the number of samples.

    Args:
        X: 2-D NumPy array with one sample per row.
        labels: NumPy array of integer cluster ids (``0 .. max(labels)``).

    Returns:
        The ratio described above.
    """
    n_clusters = np.max(labels) + 1
    n_samples = len(X)
    centers = compute_centroids(X, labels)
    between = compute_SSB(X, labels, centers)
    within = compute_SSW(X, labels, centers)
    between_mean_square = between / (n_clusters - 1)
    within_mean_square = within / (n_samples - n_clusters)
    return between_mean_square / within_mean_square
```
(4)SI(Silhouette Index):SI指标越大,说明聚类效果越好。SI指标的计算方法如下:
```python
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
def compute_a(X, i, labels):
    """Return a(i): the mean distance from sample ``i`` to its own cluster.

    The sample itself is excluded from the average, as in the silhouette
    definition; including its zero self-distance would shrink the value.

    Args:
        X: 2-D array-like with one sample per row.
        i: Row index of the sample.
        labels: Cluster id of every sample.

    Returns:
        Mean Euclidean distance from ``X[i]`` to the other members of its
        cluster, or 0.0 when the sample is alone in its cluster.
    """
    X = np.asarray(X, dtype=float)
    labels = np.asarray(labels)

    # The comparison creates a fresh mask, so clearing entry i is safe.
    others = labels == labels[i]
    others[i] = False
    if not others.any():
        return 0.0
    # Broadcasting the single row avoids passing a 1-D sample to a
    # pairwise-distance routine that expects 2-D input.
    return float(np.mean(np.linalg.norm(X[others] - X[i], axis=1)))
def compute_b(X, i, labels):
    """Return b(i): the mean distance from sample ``i`` to the nearest other cluster.

    For every cluster that sample ``i`` does not belong to, the mean
    Euclidean distance from ``X[i]`` to that cluster's members is computed;
    the smallest of these means is returned.

    Args:
        X: 2-D array-like with one sample per row.
        i: Row index of the sample.
        labels: Cluster id of every sample.

    Returns:
        The smallest mean distance as a float, or ``inf`` when there is no
        other cluster.
    """
    X = np.asarray(X, dtype=float)
    labels = np.asarray(labels)

    # Distances from sample i to every sample, computed once by broadcasting
    # (a 1-D sample cannot be passed to a 2-D pairwise-distance routine).
    distances = np.linalg.norm(X - X[i], axis=1)

    b = np.inf
    # Iterating over the ids actually present skips unused label values.
    for cluster_id in np.unique(labels):
        if cluster_id == labels[i]:
            continue
        b = min(b, float(distances[labels == cluster_id].mean()))
    return b
def compute_SI(X, labels):
    """Return the mean silhouette coefficient of a clustering (higher is better).

    For each sample ``s(i) = (b(i) - a(i)) / max(a(i), b(i))`` with ``a`` and
    ``b`` supplied by ``compute_a`` and ``compute_b``. A sample that is alone
    in its cluster, or whose ``a`` and ``b`` are both 0, scores 0.

    Args:
        X: 2-D array with one sample per row.
        labels: Cluster id of every sample.

    Returns:
        The average of the per-sample scores as a float. The result is
        ``nan`` when only a single cluster exists (``b`` is then infinite).
    """
    labels = np.asarray(labels)
    n_samples = len(X)

    a = np.zeros(n_samples)
    b = np.zeros(n_samples)
    for i in range(n_samples):
        a[i] = compute_a(X, i, labels)
        b[i] = compute_b(X, i, labels)

    # Size of the cluster each sample belongs to, to detect singletons.
    _, inverse, counts = np.unique(
        labels, return_inverse=True, return_counts=True
    )
    cluster_sizes = counts[inverse]

    denominator = np.maximum(a, b)
    # Singletons score 0 by definition; a zero denominator would be 0/0.
    scored = (cluster_sizes > 1) & (denominator > 0)
    silhouettes = np.zeros(n_samples)
    silhouettes[scored] = (b[scored] - a[scored]) / denominator[scored]
    return float(np.mean(silhouettes))
```
以上就是DBI、DI、CHI、SI指标的Python实现方法,可以根据需要选择其中一个或多个指标来评估聚类效果。
阅读全文