import pandas as pd import numpy as np from scipy import stats from sklearn.cluster import KMeans import matplotlib.pyplot as plt
时间: 2023-09-23 17:10:46 浏览: 64
这段代码的作用是导入一些常用的机器学习和数据分析库,包括pandas、numpy、scipy、sklearn和matplotlib.pyplot。其中,pandas和numpy用于数据处理和操作,scipy用于科学计算,sklearn用于机器学习算法,matplotlib用于数据可视化。另外,这段代码还从sklearn中导入了KMeans类,为后续的聚类分析做准备,但本身并未执行任何聚类操作。
相关问题
import numpy as np import pandas as pd from sklearn.cluster import KMeans from sklearn.preprocessing import StandardScaler from scipy.spatial.distance import cdist import matplotlib.pyplot as plt from pandas import DataFrame from sklearn.decomposition import PCA plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签 plt.rcParams['axes.unicode_minus']=False #用来正常显示负号 pd.set_option('display.max_rows', None)#显示全部行 pd.set_option('display.max_columns', None)#显示全部列 np.set_printoptions(threshold=np.inf) pd.set_option('display.max_columns', 9000) pd.set_option('display.width', 9000) pd.set_option('display.max_colwidth', 9000) df = pd.read_csv(r'附件1.csv',encoding='gbk') X = np.array(df.iloc[:, 1:]) X=X[0:,1:] k=93 kmeans_model = KMeans(n_clusters=k, random_state=123) fit_kmeans = kmeans_model.fit(X) # 模型训练 #查看聚类结果 kmeans_cc = kmeans_model.cluster_centers_ # 聚类中心 print('各类聚类中心为:\n', kmeans_cc) kmeans_labels = kmeans_model.labels_ # 样本的类别标签 print('各样本的类别标签为:\n', kmeans_labels) r1 = pd.Series(kmeans_model.labels_).value_counts() # 统计不同类别样本的数目 print('最终每个类别的数目为:\n', r1) # 输出聚类分群的结果 # cluster_center = pd.DataFrame(kmeans_model.cluster_centers_, # columns=[ str(x) for x in range(1,94)]) # 将聚类中心放在数据框中 # cluster_center.index = pd.DataFrame(kmeans_model.labels_). \ # drop_duplicates().iloc[:, 0] # 将样本类别作为数据框索引 # print(cluster_center)代码解释
这段代码主要是使用KMeans算法对一个数据集进行聚类分析,并输出聚类结果。下面是代码的解释:
1. 导入所需的库
```python
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
from pandas import DataFrame
from sklearn.decomposition import PCA
```
2. 读取数据集
```python
# Load the GBK-encoded CSV and build the numeric feature matrix.
df = pd.read_csv(r'附件1.csv',encoding='gbk')
X = np.array(df.iloc[:, 1:])  # drop the first CSV column (presumably an ID — confirm)
X=X[0:,1:]  # drops another leading column, so CSV columns 0 and 1 are both excluded
```
3. 使用KMeans算法进行聚类分析
```python
k=93  # number of clusters is hard-coded; presumably chosen for this dataset — confirm
kmeans_model = KMeans(n_clusters=k, random_state=123)  # fixed seed keeps runs reproducible
fit_kmeans = kmeans_model.fit(X)  # fit() trains in place and returns the model object
```
4. 输出聚类结果
```python
# Inspect the fitted model: centers, per-sample labels, cluster sizes.
kmeans_cc = kmeans_model.cluster_centers_  # (k, n_features) array of cluster centers
print('各类聚类中心为:\n', kmeans_cc)
kmeans_labels = kmeans_model.labels_  # cluster index assigned to each sample
print('各样本的类别标签为:\n', kmeans_labels)
r1 = pd.Series(kmeans_model.labels_).value_counts()  # number of samples per cluster
print('最终每个类别的数目为:\n', r1)
```
上述代码中,kmeans_cc表示各个类别的聚类中心,kmeans_labels表示每个样本所属的类别,r1表示每个类别的样本数目。
5. 将聚类中心放在数据框中
```python
# cluster_center = pd.DataFrame(kmeans_model.cluster_centers_,
# columns=[ str(x) for x in range(1,94)])
# 将聚类中心放在数据框中
# cluster_center.index = pd.DataFrame(kmeans_model.labels_). \
# drop_duplicates().iloc[:, 0]
# 将样本类别作为数据框索引
# print(cluster_center)
```
这段代码是将聚类中心放在数据框中,并以样本类别作为索引。但是,这段代码被注释掉了,因此不会被执行。
代码改进:import numpy as np import pandas as pd import matplotlib as mpl import matplotlib.pyplot as plt from sklearn.datasets import make_blobs def distEclud(arrA,arrB): #欧氏距离 d = arrA - arrB dist = np.sum(np.power(d,2),axis=1) #差的平方的和 return dist def randCent(dataSet,k): #寻找质心 n = dataSet.shape[1] #列数 data_min = dataSet.min() data_max = dataSet.max() #生成k行n列处于data_min到data_max的质心 data_cent = np.random.uniform(data_min,data_max,(k,n)) return data_cent def kMeans(dataSet,k,distMeans = distEclud, createCent = randCent): x,y = make_blobs(centers=100)#生成k质心的数据 x = pd.DataFrame(x) m,n = dataSet.shape centroids = createCent(dataSet,k) #初始化质心,k即为初始化质心的总个数 clusterAssment = np.zeros((m,3)) #初始化容器 clusterAssment[:,0] = np.inf #第一列设置为无穷大 clusterAssment[:,1:3] = -1 #第二列放本次迭代点的簇编号,第三列存放上次迭代点的簇编号 result_set = pd.concat([pd.DataFrame(dataSet), pd.DataFrame(clusterAssment)],axis = 1,ignore_index = True) #将数据进行拼接,横向拼接,即将该容器放在数据集后面 clusterChanged = True while clusterChanged: clusterChanged = False for i in range(m): dist = distMeans(dataSet.iloc[i,:n].values,centroids) #计算点到质心的距离(即每个值到质心的差的平方和) result_set.iloc[i,n] = dist.min() #放入距离的最小值 result_set.iloc[i,n+1] = np.where(dist == dist.min())[0] #放入距离最小值的质心标号 clusterChanged = not (result_set.iloc[:,-1] == result_set.iloc[:,-2]).all() if clusterChanged: cent_df = result_set.groupby(n+1).mean() #按照当前迭代的数据集的分类,进行计算每一类中各个属性的平均值 centroids = cent_df.iloc[:,:n].values #当前质心 result_set.iloc[:,-1] = result_set.iloc[:,-2] #本次质心放到最后一列里 return centroids, result_set x = np.random.randint(0,100,size=100) y = np.random.randint(0,100,size=100) randintnum=pd.concat([pd.DataFrame(x), pd.DataFrame(y)],axis = 1,ignore_index = True) #randintnum_test, randintnum_test = kMeans(randintnum,3) #plt.scatter(randintnum_test.iloc[:,0],randintnum_test.iloc[:,1],c=randintnum_test.iloc[:,-1]) #result_test,cent_test = kMeans(data, 4) cent_test,result_test = kMeans(randintnum, 3) plt.scatter(result_test.iloc[:,0],result_test.iloc[:,1],c=result_test.iloc[:,-1]) 
plt.scatter(cent_test[:,0],cent_test[:,1],color = 'red',marker = 'x',s=100)
这段代码实现的是k-means聚类算法,但是有一些地方可以改进:
1. 函数kMeans中的createCent参数可以通过修改为直接传入质心的方式来更加灵活。
2. 函数kMeans中的distMeans参数可以修改为scipy库中的距离函数,如scipy.spatial.distance.cdist,来提高计算效率。
3. 函数kMeans中的clusterAssment容器可以完全省去:用np.argmin直接得到每个样本的簇编号,从而避免使用np.zeros和np.inf手工初始化距离容器。
改进后的代码如下:
```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from scipy.spatial.distance import cdist
def randCent(dataSet, k):
    """Draw k random centroids uniformly between the data's min and max.

    For a DataFrame, ``min()``/``max()`` are per-column Series, and
    ``np.random.uniform`` broadcasts them so each feature gets its own
    bounds; for an ndarray they are global scalars.
    """
    n_features = dataSet.shape[1]
    lower = dataSet.min()
    upper = dataSet.max()
    return np.random.uniform(lower, upper, (k, n_features))
def kMeans(dataSet, k, createCent=None, distMeans=cdist):
    """Lloyd's k-means clustering.

    Parameters
    ----------
    dataSet : array-like or DataFrame, shape (m, n)
        Samples to cluster.
    k : int
        Number of clusters.
    createCent : callable(dataSet, k) -> (k, n) array, optional
        Centroid initializer; ``None`` (the default) means the
        module-level ``randCent``, resolved lazily so the default
        behavior is unchanged.
    distMeans : callable, optional
        Pairwise-distance function with a ``cdist``-compatible
        signature ``(XA, XB, metric=...)``.

    Returns
    -------
    centroids : ndarray, shape (k, n)
        Final cluster centers.
    result_set : DataFrame
        The input data with one appended ``'cluster'`` column holding
        each sample's label.
    """
    if createCent is None:
        createCent = randCent  # original default, bound lazily
    data = pd.DataFrame(dataSet).reset_index(drop=True)
    points = data.to_numpy(dtype=float)
    centroids = np.asarray(createCent(dataSet, k), dtype=float)
    # BUG FIX 1: the original concatenated an *empty* DataFrame, so no label
    # column ever existed and `result_set.iloc[:, -1] = labels` silently
    # overwrote the last data column. Labels are kept separate and appended
    # as a real column at the end.
    labels = np.full(len(points), -1)
    while True:
        dist = distMeans(points, centroids, metric='euclidean')
        new_labels = np.argmin(dist, axis=1)  # nearest centroid per sample
        # Recompute each centroid as the mean of its members.
        # BUG FIX 2: the original tested `Series.empty`, which is never True
        # for a `.mean()` result, so empty clusters silently became NaN
        # centroids. Empty clusters now keep their previous centroid.
        for i in range(k):
            members = points[new_labels == i]
            if len(members):
                centroids[i] = members.mean(axis=0)
        # BUG FIX 3: the original set clusterChanged=True on every sweep in
        # which any centroid was updated — i.e. always — so the while-loop
        # never terminated. Converge when the assignments stop changing.
        if np.array_equal(new_labels, labels):
            break
        labels = new_labels
    result_set = data.copy()
    result_set['cluster'] = labels
    return centroids, result_set
# Demo: cluster 100 random 2-D integer points into 3 groups and plot them.
coords = [np.random.randint(0, 100, size=100) for _ in range(2)]
randintnum = pd.concat([pd.DataFrame(c) for c in coords], axis=1, ignore_index=True)

cent_test, result_test = kMeans(randintnum, 3)

# Points colored by cluster label; centroids drawn as large red crosses.
plt.scatter(result_test.iloc[:, 0], result_test.iloc[:, 1], c=result_test.iloc[:, -1])
plt.scatter(cent_test[:, 0], cent_test[:, 1], color='red', marker='x', s=100)
```