import pandas as pd import numpy as np from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler import matplotlib.pyplot as plt # 读取数据 data = pd.read_csv('D:\\pythonProject\\venv\\BostonHousing2.csv') # 提取前13个指标的数据 X = data.iloc[:, 5:18].values # 数据标准化 scaler = StandardScaler() X_scaled = scaler.fit_transform(X) # 主成分分析 pca = PCA() X_pca = pca.fit_transform(X_scaled) # 特征值和特征向量 eigenvalues = pca.explained_variance_ eigenvectors = pca.components_.T # 碎石图 # variance_explained我给你放到下一个cell里面了,这里用eigenvalues代替variance_explained plt.plot(range(1, 14), eigenvalues, marker='o') plt.xlabel('Number of Components') plt.ylabel('Cumulative Proportion of Variance Explained') plt.title('Scree Plot') plt.show() # 选择主成分个数 variance_explained = np.cumsum(eigenvalues / np.sum(eigenvalues)) n_components = np.sum(variance_explained <= 0.95) + 1 # 前2个主成分的载荷图 loadings = pd.DataFrame(eigenvectors[:, 0:2], columns=['PC1', 'PC2'], index=data.columns[0:13]) plt.figure(figsize=(10, 6)) plt.scatter(loadings['PC1'], loadings['PC2'], alpha=0.7) for i, feature in enumerate(loadings.index): plt.text(loadings['PC1'][i], loadings['PC2'][i], feature) plt.xlabel('PC1') plt.ylabel('PC2') plt.title('Loading Plot') plt.grid() plt.show() # 主成分得分图 scores = pd.DataFrame(X_pca[:, 0:n_components], columns=['PC{}'.format(i+1) for i in range(n_components)]) plt.figure(figsize=(10, 6)) plt.scatter(scores['PC1'], scores['PC2'], alpha=0.7) for i, label in enumerate(data['medv']): plt.text(scores['PC1'][i], scores['PC2'][i], label) plt.xlabel('PC1') plt.ylabel('PC2') plt.title('Scores Plot') plt.grid() plt.show() # 综合评估和排序 data['PC1_score'] = X_pca[:, 0] sorted_data = data.sort_values(by='PC1_score') # 主成分回归模型 from sklearn.linear_model import LinearRegression Y = data['medv'].values.reshape(-1, 1) X_pca_regression = X_pca[:, 0].reshape(-1, 1) regression_model = LinearRegression() regression_model.fit(X_pca_regression, Y) # 回归方程 intercept = regression_model.intercept_[0] slope = regression_model.coef_[0][0] equation = "medv = {:.2f} + {:.2f} * PC1".format(intercept, slope) print("Regression Equation:", equation) # 最小二乘估计结果 from statsmodels.api import OLS X_const = np.concatenate((np.ones((506, 1)), X_pca_regression), axis=1) ols_model = OLS(Y, X_const).fit() print("OLS Regression Summary:") print(ols_model.summary())
时间: 2024-02-05 19:03:42 浏览: 32
这段代码是做主成分分析(PCA)的,它的目的是将原始数据转换为更少的几个维度,以便于分析。具体来说,代码将Boston房价数据集中的前13个指标进行了标准化处理,然后使用PCA进行降维。在降维的过程中,代码画出了碎石图(Scree Plot)来帮助选择主成分的个数。然后,代码画出了主成分的载荷图和得分图,分别用于观察主成分和原始数据的关系。最后,代码使用主成分回归模型和最小二乘估计来评估主成分对房价的影响,并输出回归方程和估计结果。
相关问题
优化这段代码# import modules 导入模块 from sklearn.model_selection import train_test_split from sklearn.decomposition import PCA import pandas as pd from sklearn import svm import numpy as np import math import matplotlib.pyplot as plt import matplotlib as mpl from matplotlib import colors from sklearn.model_selection import train_test_split from sklearn import datasets from matplotlib.colors import ListedColormap from sklearn.svm import SVC from sklearn.preprocessing import StandardScaler from sklearn.model_selection import StratifiedShuffleSplit,StratifiedKFold from sklearn.model_selection import GridSearchCV from sklearn.model_selection import GridSearchCV, LeaveOneOut, cross_val_predict from sklearn.model_selection import KFold from sklearn.linear_model import LogisticRegression from sklearn.naive_bayes import GaussianNB from sklearn.neighbors import KNeighborsClassifier from sklearn import svm from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import roc_auc_score from sklearn.metrics import roc_auc_score import math import datetime import multiprocessing as mp from sklearn.ensemble import StackingClassifier from sklearn.pipeline import make_pipeline from sklearn.svm import LinearSVC import random
你可以对这段代码进行如下优化:
```python
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import colors
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold, GridSearchCV, LeaveOneOut, cross_val_predict, KFold
from sklearn.decomposition import PCA
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.metrics import roc_auc_score
# 导入模块
# 设置随机种子
random.seed(42)
np.random.seed(42)
# 其他代码...
```
这样做的优点是将导入的模块进行整理和分类,提高了代码的可读性。同时,设置了随机种子,保证了代码的可复现性。你可以根据需要添加其他的代码或模块。
import pandas as pd import numpy as np import scipy.stats as stats import seaborn as sns from sklearn.metrics import RocCurveDisplay from sklearn.model_selection import train_test_split from sklearn.metrics import confusion_matrix,accuracy_score from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegression from sklearn import tree from sklearn.decomposition import PCA import matplotlib.pyplot as plt from sklearn.tree import DecisionTreeClassifier请在此基础上续写代码块,要求是(1) 读入数据后,选取自变量"sysBP", "diaBP","age","totChol","BMI", "heartRate", "glucose"记为X,因变量"TenYearCHD"记为y,组成新的数据集。¶
import pandas as pd
import numpy as np
import scipy.stats as stats
import seaborn as sns
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
# 读入数据
data = pd.read_csv("heart.csv")
# 选取自变量
X = data[["sysBP", "diaBP", "age", "totChol", "BMI", "heartRate", "glucose"]]
# 因变量
y = data["TenYearCHD"]
# 组成新的数据集
new_data = pd.concat([X, y], axis=1)
# 打印新的数据集
print(new_data.head())