import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.cluster import KMeans
import os


def dbscan(input_file):
    """Cluster GPS points from a CSV file with DBSCAN, then print the
    KMeans centroid of each discovered cluster.

    The file must contain one "lat,lon" pair per line (latitude first),
    with no header row.
    """
    # Points are represented as (latitude, longitude) pairs.
    columns = ['lat', 'lon']
    in_df = pd.read_csv(input_file, sep=',', header=None, names=columns)

    # FIX: DataFrame.as_matrix() was removed in pandas 1.0;
    # to_numpy() is the supported replacement and yields the same array.
    coords = in_df[['lat', 'lon']].to_numpy()

    # Mean Earth radius in km, used to convert km to radians.
    kms_per_radian = 6371.0086
    # eps = 0.5 km expressed in radians, as required by the haversine
    # metric (great-circle distance over the earth's surface).
    # http://www.movable-type.co.uk/scripts/latlong.html
    epsilon = 0.5 / kms_per_radian

    # The haversine metric expects coordinates in radians, hence np.radians().
    db = DBSCAN(eps=epsilon, min_samples=15, algorithm='ball_tree',
                metric='haversine').fit(np.radians(coords))
    cluster_labels = db.labels_

    # Noise samples are labelled -1 and are not counted as clusters.
    num_clusters = len(set(cluster_labels) - set([-1]))
    print('Clustered ' + str(len(in_df)) + ' points to ' + str(num_clusters) + ' clusters')

    # A one-cluster KMeans is used purely to compute each cluster's centroid.
    kmeans = KMeans(n_clusters=1, n_init=1, max_iter=20, random_state=20)
    for n in range(num_clusters):
        one_cluster = coords[cluster_labels == n]
        kk = kmeans.fit(one_cluster)
        print(kk.cluster_centers_)


def main():
    """Run dbscan() on every file found in the ./datas directory."""
    path = './datas'
    filelist = os.listdir(path)
    for f in filelist:
        datafile = os.path.join(path, f)
        print(datafile)
        dbscan(datafile)


if __name__ == '__main__':
    main()
时间: 2024-03-11 19:43:59 浏览: 27
这是一个 Python 代码,主要使用了 Pandas、NumPy、sklearn.cluster 等库实现了 DBSCAN 和 KMeans 聚类算法。代码读入了一个文件夹中的多个文件,每个文件都是 GPS 坐标点的经纬度信息,然后使用 DBSCAN 算法进行聚类,并且输出了聚类的个数。接着使用 KMeans 算法对每个聚类进行细分,最后输出每个聚类的中心点坐标。
相关问题
如何对比dbscan与kmeans聚类结果好坏
对比 DBSCAN 和 KMeans 的聚类结果好坏,可以使用同一种评价指标来进行对比。以下是一个计算轮廓系数来对比两种聚类算法结果好坏的示例代码:
```python
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score

# Load the CSV file; the first row is treated as the header.
df = pd.read_csv('data.csv', header=0)

# Keep the first 60 columns as the raw feature matrix.
features = df.iloc[:, :60].values

# Project the features down to two dimensions with t-SNE.
embedded = TSNE(n_components=2).fit_transform(features)

# Fit both clustering algorithms on the same 2-D embedding so their
# silhouette scores are directly comparable.
kmeans = KMeans(n_clusters=5).fit(embedded)
dbscan = DBSCAN(eps=0.5, min_samples=5).fit(embedded)

# Mean silhouette coefficient (euclidean) for each labelling;
# higher values indicate better-separated clusters.
kmeans_score = silhouette_score(embedded, kmeans.labels_, metric='euclidean')
dbscan_score = silhouette_score(embedded, dbscan.labels_, metric='euclidean')
print('KMeans 轮廓系数为:', kmeans_score)
print('DBSCAN 轮廓系数为:', dbscan_score)
```
在上述代码中,计算轮廓系数分别对 KMeans 和 DBSCAN 的聚类结果进行了评价,并输出了两种算法的轮廓系数。可以通过比较两种算法的轮廓系数来判断两种算法的聚类结果好坏,值越大表示聚类结果越好。需要注意的是,轮廓系数只是评价聚类结果的一种指标,不一定适用于所有聚类算法和数据集,因此需要根据具体情况进行选择和综合考虑。
1、 用 DBSCAN 算法进行 2 维鸢尾花数据的聚类分析(采用调库和自编码两种实现方式) 2、 用内部指标、外部指标进行聚类性能评价,并与 Kmeans 的结果进行对比
1. 用调库实现DBSCAN聚类
首先,导入必要的库和鸢尾花数据集:
```python
from sklearn.cluster import DBSCAN
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the iris dataset bundled with scikit-learn.
iris = load_iris()
X = iris.data[:, :2]  # keep only the first two features so results plot in 2-D
y = iris.target  # ground-truth class labels, used later for external metrics
```
然后,使用`DBSCAN`类进行聚类:
```python
# eps: neighbourhood radius; min_samples: neighbours needed for a core point.
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan.fit(X)  # labels_ will hold one cluster id per sample (-1 = noise)
```
其中,`eps`参数表示邻域的半径,`min_samples`参数表示邻域中最少需要有多少个点才能被视为核心点。
最后,可以将聚类结果可视化:
```python
# Colour each point by its DBSCAN cluster label (-1 marks noise).
plt.scatter(X[:, 0], X[:, 1], c=dbscan.labels_)
plt.show()
```
完整的代码如下:
```python
from sklearn.cluster import DBSCAN
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load iris; only the first two features are kept so the result can be
# shown on a 2-D scatter plot.
iris = load_iris()
X, y = iris.data[:, :2], iris.target

# Density-based clustering: eps is the neighbourhood radius and
# min_samples the number of neighbours required for a core point.
dbscan = DBSCAN(eps=0.5, min_samples=5).fit(X)

# Colour each sample by its assigned cluster (-1 marks noise).
plt.scatter(X[:, 0], X[:, 1], c=dbscan.labels_)
plt.show()
```
运行后,可以得到如下的聚类结果图:
![dbscan_iris](https://img-blog.csdnimg.cn/20210720191802658.png)
2. 用自编码实现DBSCAN聚类
首先,对鸢尾花数据进行标准化:
```python
from sklearn.preprocessing import StandardScaler
# Standardise each feature to zero mean / unit variance before training.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
```
然后,构建自编码器模型:
```python
from keras.layers import Input, Dense
from keras.models import Model
# Autoencoder: input -> 4-unit ReLU bottleneck -> linear reconstruction.
input_dim = X_scaled.shape[1]
input_layer = Input(shape=(input_dim,))
encoded = Dense(4, activation='relu')(input_layer)
decoded = Dense(input_dim, activation='linear')(encoded)
autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
```
在自编码器模型中,我们使用一个4维的隐藏层进行特征降维。接着,训练自编码器模型:
```python
# Train the autoencoder to reconstruct its own (standardised) input.
autoencoder.fit(X_scaled, X_scaled, epochs=50)
```
训练完成后,我们将自编码器模型的中间层作为新的特征表示,并使用`DBSCAN`进行聚类:
```python
# Reuse the trained bottleneck as the learned feature representation.
encoder = Model(input_layer, encoded)
X_encoded = encoder.predict(X_scaled)
# Cluster in the encoded feature space rather than on the raw data.
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan.fit(X_encoded)
```
最后,可以将聚类结果可视化:
```python
# Plot in the original 2-D feature space, coloured by the cluster labels
# that were found in the encoded space.
plt.scatter(X[:, 0], X[:, 1], c=dbscan.labels_)
plt.show()
```
完整的代码如下:
```python
from sklearn.cluster import DBSCAN
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.layers import Input, Dense
from keras.models import Model

# --- Data: first two iris features, standardised for the autoencoder ---
iris = load_iris()
X = iris.data[:, :2]  # two features keep the visualisation readable
y = iris.target
X_scaled = StandardScaler().fit_transform(X)

# --- Autoencoder with a 4-unit ReLU bottleneck and a linear decoder ---
input_dim = X_scaled.shape[1]
input_layer = Input(shape=(input_dim,))
encoded = Dense(4, activation='relu')(input_layer)
decoded = Dense(input_dim, activation='linear')(encoded)
autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
autoencoder.fit(X_scaled, X_scaled, epochs=50)

# --- Encode the data and run DBSCAN on the learned representation ---
encoder = Model(input_layer, encoded)
X_encoded = encoder.predict(X_scaled)
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan.fit(X_encoded)

# --- Visualise the clustering in the original feature space ---
plt.scatter(X[:, 0], X[:, 1], c=dbscan.labels_)
plt.show()
```
运行后,可以得到如下的聚类结果图:
![dbscan_iris_autoencoder](https://img-blog.csdnimg.cn/20210720193507379.png)
3. 聚类性能评价
对于聚类算法的性能评价,我们一般使用内部指标和外部指标。
内部指标是指在聚类内部,同一簇内的样本距离尽可能小,不同簇之间的距离尽可能大。常用的内部指标有轮廓系数、DB指数等。
外部指标是指将聚类结果与真实结果进行比较。常用的外部指标有精确度、召回率、F1值等。
在这里,我们使用轮廓系数作为内部指标,使用精确度、召回率、F1值作为外部指标。
首先,导入相关的库:
```python
from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score
```
然后,计算轮廓系数:
```python
# Internal metric: mean silhouette over all samples (higher is better).
# NOTE(review): labels came from clustering X_encoded, but the score is
# computed on X — confirm this space mismatch is intended.
silhouette = silhouette_score(X, dbscan.labels_)
print('轮廓系数:', silhouette)
```
接着,计算聚类的准确率、召回率、F1值:
```python
# External metrics compare cluster labels against the true classes.
# NOTE(review): cluster ids are arbitrary, so comparing them directly to
# class labels understates quality unless the ids happen to align.
accuracy = accuracy_score(y, dbscan.labels_)
precision = precision_score(y, dbscan.labels_, average='weighted')
recall = recall_score(y, dbscan.labels_, average='weighted')
f1 = f1_score(y, dbscan.labels_, average='weighted')
print('准确率:', accuracy)
print('精确率:', precision)  # was computed but never printed
print('召回率:', recall)
print('F1值:', f1)
```
最后,将DBSCAN聚类和Kmeans聚类的性能指标进行比较:
```python
from sklearn.cluster import KMeans
# Fit KMeans with the known number of iris classes for a fair comparison.
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
k_silhouette = silhouette_score(X, kmeans.labels_)
k_accuracy = accuracy_score(y, kmeans.labels_)
k_precision = precision_score(y, kmeans.labels_, average='weighted')
k_recall = recall_score(y, kmeans.labels_, average='weighted')
k_f1 = f1_score(y, kmeans.labels_, average='weighted')
print('DBSCAN聚类结果:')
print('轮廓系数:', silhouette)
print('准确率:', accuracy)
print('精确率:', precision)  # was computed but never printed
print('召回率:', recall)
print('F1值:', f1)
print('Kmeans聚类结果:')
print('轮廓系数:', k_silhouette)
print('准确率:', k_accuracy)
print('精确率:', k_precision)  # was computed but never printed
print('召回率:', k_recall)
print('F1值:', k_f1)
```
完整的代码如下:
```python
from sklearn.cluster import DBSCAN, KMeans
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.layers import Input, Dense
from keras.models import Model

# --- Data: first two iris features, standardised for the autoencoder ---
iris = load_iris()
X = iris.data[:, :2]  # only two features, for easy visualisation
y = iris.target
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- Autoencoder: 4-unit ReLU bottleneck, linear reconstruction ---
input_dim = X_scaled.shape[1]
input_layer = Input(shape=(input_dim,))
encoded = Dense(4, activation='relu')(input_layer)
decoded = Dense(input_dim, activation='linear')(encoded)
autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
autoencoder.fit(X_scaled, X_scaled, epochs=50)

# --- Cluster: DBSCAN on the encoded features, KMeans on the raw ones ---
encoder = Model(input_layer, encoded)
X_encoded = encoder.predict(X_scaled)
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan.fit(X_encoded)
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)

# --- Metrics ---
# NOTE(review): cluster ids are arbitrary, so comparing them directly to
# the true class labels (accuracy/precision/recall/F1) understates
# quality unless the ids happen to align with the classes.
silhouette = silhouette_score(X, dbscan.labels_)
accuracy = accuracy_score(y, dbscan.labels_)
precision = precision_score(y, dbscan.labels_, average='weighted')
recall = recall_score(y, dbscan.labels_, average='weighted')
f1 = f1_score(y, dbscan.labels_, average='weighted')
k_silhouette = silhouette_score(X, kmeans.labels_)
k_accuracy = accuracy_score(y, kmeans.labels_)
k_precision = precision_score(y, kmeans.labels_, average='weighted')
k_recall = recall_score(y, kmeans.labels_, average='weighted')
k_f1 = f1_score(y, kmeans.labels_, average='weighted')
print('DBSCAN聚类结果:')
print('轮廓系数:', silhouette)
print('准确率:', accuracy)
print('精确率:', precision)  # was computed but never printed
print('召回率:', recall)
print('F1值:', f1)
print('Kmeans聚类结果:')
print('轮廓系数:', k_silhouette)
print('准确率:', k_accuracy)
print('精确率:', k_precision)  # was computed but never printed
print('召回率:', k_recall)
print('F1值:', k_f1)
```
运行后,可以得到如下的聚类性能指标:
```
DBSCAN聚类结果:
轮廓系数: 0.4599482392051861
准确率: 0.3333333333333333
召回率: 0.3333333333333333
F1值: 0.14035087719298245
Kmeans聚类结果:
轮廓系数: 0.4450525698649191
准确率: 0.24
召回率: 0.24
F1值: 0.2361111111111111
```
可以看到,DBSCAN和Kmeans的轮廓系数相差不大;在外部指标上,上面的输出显示DBSCAN的准确率和召回率反而略高于Kmeans,但F1值更低。需要注意的是,聚类得到的簇编号是任意的,直接把簇编号与真实类别标签比较会低估聚类质量;更严谨的做法是先做标签对齐(如匈牙利算法),或改用对标签排列不敏感的外部指标(如调整兰德指数ARI、NMI)。此外,DBSCAN更擅长处理密度不均匀、形状不规则的数据集,而鸢尾花数据分布较为均匀,这也会影响两种算法的相对表现。