用代码完成high_diamond_ranked_10min.csv处理和特征工程,首先是写入对应数据信息的探索与分析,进行数据预处理用数据类型转换或编码,按过滤法对数据进行特征选择,挑选出最优特征数,对两类数据用PCA算法降到2维后,进行可视化展示。对完整数据进PCA降维,用碎石图选择合适的降维后特征范围。在一个图中绘制不同特征数对应决策树和随机森林准确率效果折线对比图。分别输出决策树和随机森林总特征数,和对应的准确率、输出特征过滤后的特征数,和对应的准确率、PCA降维后的特征数,和对应的准确率。
时间: 2024-03-19 20:45:44 浏览: 178
好的,我会根据您的要求进行处理和特征工程的代码实现。以下是Python代码示例:
```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# ---------------------------------------------------------------
# 1. Load data and explore
# ---------------------------------------------------------------
data = pd.read_csv('high_diamond_ranked_10min.csv')
print(data.describe())
print(data.info())
print(data.isnull().sum())

# ---------------------------------------------------------------
# 2. Preprocessing / type conversion
# ---------------------------------------------------------------
# In this dataset the first column 'gameId' is a match identifier (no
# predictive value) and the label 'blueWins' is the SECOND column, not
# the last one — taking data.iloc[:, -1] as y would train on
# 'redGoldPerMin' and leak the real label into X.
data = data.drop(columns=['gameId'])
y = data['blueWins']                       # already 0/1 int, no encoding needed
X = data.drop(columns=['blueWins']).astype('float64')

# ---------------------------------------------------------------
# 3. Filter-based feature selection (ANOVA F-test, top 10)
# ---------------------------------------------------------------
selector = SelectKBest(f_classif, k=10)
selector.fit(X, y)
features = X.columns[selector.get_support(indices=True)]
X_selected = X[features]

# ---------------------------------------------------------------
# 4. PCA to 2 components, scatter plot colored by class
# ---------------------------------------------------------------
scaler = StandardScaler()
X_std = scaler.fit_transform(X)            # PCA needs standardized inputs
pca2 = PCA(n_components=2)
X_2d = pca2.fit_transform(X_std)
plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y, cmap='viridis', s=5)
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.colorbar()
plt.show()

# ---------------------------------------------------------------
# 5. PCA on the full data + scree plot to choose a component range
# ---------------------------------------------------------------
pca_full = PCA().fit(X_std)
n_comp = np.arange(1, pca_full.n_components_ + 1)
plt.figure(figsize=(10, 6))
plt.plot(n_comp, pca_full.explained_variance_ratio_, 'o-',
         label='Explained variance ratio')
plt.plot(n_comp, np.cumsum(pca_full.explained_variance_ratio_), 's--',
         label='Cumulative explained variance')
plt.xlabel('Number of principal components')
plt.ylabel('Explained variance ratio')
plt.title('Scree plot')
plt.legend()
plt.grid(True)
plt.show()

# ---------------------------------------------------------------
# 6. Accuracy vs. number of selected features, DT vs RF on one chart
# ---------------------------------------------------------------
accuracy_dt = []
accuracy_rf = []
feature_counts = range(1, min(21, X.shape[1] + 1))
for count in feature_counts:
    sel = SelectKBest(f_classif, k=count)
    sel.fit(X, y)
    cols = X.columns[sel.get_support(indices=True)]
    X_train, X_test, y_train, y_test = train_test_split(
        X[cols], y, test_size=0.2, random_state=42)

    dt = DecisionTreeClassifier(random_state=42)   # fixed seed -> reproducible curve
    dt.fit(X_train, y_train)
    accuracy_dt.append(accuracy_score(y_test, dt.predict(X_test)))

    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train, y_train)
    accuracy_rf.append(accuracy_score(y_test, rf.predict(X_test)))

plt.plot(feature_counts, accuracy_dt, label='Decision Tree')
plt.plot(feature_counts, accuracy_rf, label='Random Forest')
plt.xlabel('Number of features')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# ---------------------------------------------------------------
# 7. Summary output
# ---------------------------------------------------------------
print("Decision Tree:")
print("Total features: ", len(X.columns))
print("Accuracy: ", max(accuracy_dt))
print("Selected features: ", feature_counts[accuracy_dt.index(max(accuracy_dt))])
print("\nRandom Forest:")
print("Total features: ", len(X.columns))
print("Accuracy: ", max(accuracy_rf))
print("Selected features: ", feature_counts[accuracy_rf.index(max(accuracy_rf))])

# Accuracy with the 10 filter-selected features (Random Forest)
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42)
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
print("\nFiltered features:")
print("Number of features: ", len(features))
print("Accuracy: ", accuracy_score(y_test, rf.predict(X_test)))

# Accuracy with 10 PCA components (Random Forest)
X_pca10 = PCA(n_components=10).fit_transform(X_std)
X_train, X_test, y_train, y_test = train_test_split(
    X_pca10, y, test_size=0.2, random_state=42)
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
print("\nPCA:")
print("Number of features: ", 10)
print("Accuracy: ", accuracy_score(y_test, rf.predict(X_test)))
```
注意:以上代码仅供参考,具体实现可能会根据数据集的不同而有所调整。
阅读全文