自选数据,编写程序,对比分析 k-means 聚类方法和 DBSCAN 聚类方法的性能,讨论不同参数对聚类性能的影响,并分析两类算法的优缺点。
时间: 2024-04-19 14:24:44 浏览: 9
import pandas as pd
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
# Generate a synthetic dataset: 500 points drawn around 4 blob centers.
# random_state makes the dataset identical on every run.
X, y = make_blobs(n_samples=500, centers=4, random_state=42)

# K-means clustering.
# random_state pins the centroid initialisation so the reported score is
# reproducible; n_init is passed explicitly because its default value is not
# the same across scikit-learn releases.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans_labels = kmeans.fit_predict(X)
kmeans_silhouette = silhouette_score(X, kmeans_labels)

# DBSCAN clustering.
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X)

# DBSCAN marks noise points with the label -1. Noise is not a cluster, so it
# is excluded before scoring; otherwise the silhouette would treat all noise
# points as one (very loose) cluster and distort the comparison with K-means.
dbscan_clustered_mask = dbscan_labels != -1
dbscan_n_clustered = int(dbscan_clustered_mask.sum())
dbscan_n_noise = int(dbscan_labels.size - dbscan_n_clustered)
dbscan_n_clusters = int(np.unique(dbscan_labels[dbscan_clustered_mask]).size)

# silhouette_score is only defined for 2 <= n_labels <= n_samples - 1 and
# raises ValueError outside that range, so fall back to NaN instead of crashing.
if 2 <= dbscan_n_clusters < dbscan_n_clustered:
    dbscan_silhouette = silhouette_score(
        X[dbscan_clustered_mask], dbscan_labels[dbscan_clustered_mask]
    )
else:
    dbscan_silhouette = np.nan

print("K-means Clustering:")
print("Silhouette Score: ", kmeans_silhouette)
print("\n")
print("DBSCAN Clustering:")
print("Silhouette Score: ", dbscan_silhouette)
# The noise-free silhouette must be read together with how much data was
# discarded as noise, so report both counts alongside it.
print("Clusters found: ", dbscan_n_clusters)
print("Noise points: ", dbscan_n_noise)
print("\n")
# Study how DBSCAN's parameters (eps, min_samples) affect clustering quality
# by scoring every combination on the dataset X.
eps_values = [0.2, 0.5, 1.0, 1.5]
min_samples_values = [3, 5, 10]
silhouette_scores = []  # (eps, min_samples, silhouette) per combination
cluster_counts = []     # number of real clusters (noise excluded)
noise_counts = []       # number of points labelled as noise (-1)
for eps in eps_values:
    for min_samples in min_samples_values:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        dbscan_labels = dbscan.fit_predict(X)

        # DBSCAN marks noise with the label -1; noise is not a cluster, so
        # it is left out of the silhouette computation.
        clustered_mask = dbscan_labels != -1
        n_clustered = int(clustered_mask.sum())
        n_clusters = int(np.unique(dbscan_labels[clustered_mask]).size)

        # silhouette_score raises ValueError unless
        # 2 <= n_labels <= n_samples - 1. Small eps / large min_samples can
        # turn every point into noise, and large eps can merge everything
        # into one cluster; record NaN for those settings so the sweep
        # continues instead of aborting.
        if 2 <= n_clusters < n_clustered:
            silhouette = silhouette_score(
                X[clustered_mask], dbscan_labels[clustered_mask]
            )
        else:
            silhouette = np.nan

        silhouette_scores.append((eps, min_samples, silhouette))
        cluster_counts.append(n_clusters)
        noise_counts.append(int(dbscan_labels.size - n_clustered))

silhouette_scores_df = pd.DataFrame(
    silhouette_scores, columns=['eps', 'min_samples', 'silhouette_score']
)
# Extra context columns: a high noise-free silhouette is only meaningful when
# few points were discarded as noise and a sensible number of clusters remains.
silhouette_scores_df['n_clusters'] = cluster_counts
silhouette_scores_df['n_noise'] = noise_counts
print("Silhouette Scores with Different Parameters:")
print(silhouette_scores_df)
print("\n")
# Summarise the strengths and weaknesses of K-means and DBSCAN.
# Each entry pairs a section heading with the bullet lines printed below it.
_algorithm_notes = (
    (
        "Advantages of K-means Clustering:",
        (
            "- Simple and easy to implement",
            "- Fast and efficient for large datasets",
            "- Guarantees convergence to a local optimum",
        ),
    ),
    (
        "Disadvantages of K-means Clustering:",
        (
            "- Requires the number of clusters to be specified in advance",
            "- Sensitive to the initial choice of centroids",
            "- Not suitable for non-linearly separable data",
        ),
    ),
    (
        "Advantages of DBSCAN Clustering:",
        (
            "- Does not require the number of clusters to be specified in advance",
            "- Can discover clusters of arbitrary shape",
            "- Robust to noise and outliers",
        ),
    ),
    (
        "Disadvantages of DBSCAN Clustering:",
        (
            "- Sensitive to the choice of parameters (e.g., epsilon and min_samples)",
            "- Scalability issues for large datasets with high dimensionality",
            "- Difficulty in determining appropriate parameter values",
        ),
    ),
)

for _position, (_heading, _bullets) in enumerate(_algorithm_notes):
    # Sections are separated by a printed "\n" (two blank lines in total);
    # nothing is emitted after the final section.
    if _position > 0:
        print("\n")
    print(_heading)
    for _bullet in _bullets:
        print(_bullet)