请严格按照上述要求,给出完整、正确的 Python 代码
时间: 2024-10-22 11:07:13 浏览: 16
python基本代码教程-Python基础.pdf
以下是按照 `Lab6-instructions.pdf` 中的要求编写的完整 Python 代码。代码分为几个部分:数据加载和探索、数据预处理、模型构建、模型评估、超参数调优和交叉验证。
```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import seaborn as sns
# 1. Data loading and exploration
# The raw file has no header row, so column names are supplied explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income',
]
# Fields in adult.data are separated by ", " (comma followed by a blank).
# Without skipinitialspace every string cell keeps that leading blank
# (' ?', ' Private', ' >50K'), so an exact comparison against '?' never
# matches. Stripping it at parse time and declaring '?' as the NA marker
# makes missing values show up as NaN from the start.
data = pd.read_csv(
    url,
    header=None,
    names=column_names,
    skipinitialspace=True,
    na_values='?',
)
print("前5行数据:")
print(data.head())
print("\n数据描述:")
print(data.describe())
print("\n数据信息:")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would emit a stray "None" line.
data.info()
# Visualise the distribution of key features.
plt.figure(figsize=(12, 6))
sns.histplot(data['age'], bins=30, kde=True)
plt.title('Age Distribution')
plt.show()
# Class balance of the target variable.
plt.figure(figsize=(12, 6))
sns.countplot(x='income', data=data)
plt.title('Income Distribution')
plt.show()
# 2. Data preprocessing
# Handle missing values.
# The dataset marks missing entries with '?'. The pattern tolerates
# surrounding whitespace so the marker is caught even when cells still carry
# the blank that follows each comma in the raw file (' ?'); an exact-match
# replace on '?' would silently leave those rows in place.
data.replace(r'^\s*\?\s*$', np.nan, regex=True, inplace=True)
data.dropna(inplace=True)
# Separate features from the target variable.
X = data.drop(columns=['income'])
y = data['income']
# Columns treated as numeric vs. categorical by the preprocessor below.
numeric_features = [
    'age', 'fnlwgt', 'education-num',
    'capital-gain', 'capital-loss', 'hours-per-week',
]
categorical_features = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]
# Build the preprocessor: standardise numeric columns, one-hot encode
# categorical ones. handle_unknown='ignore' keeps transform() from failing on
# categories that were not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ])
# 3. Model building
# Split into training (70%) and test (30%) sets with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Chain preprocessing and the classifier so the scaler/encoder are fitted on
# training data only.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42)),
])
# Train the decision tree classifier.
pipeline.fit(X_train, y_train)
# Visualise the decision tree.
tree_model = pipeline.named_steps['classifier']
encoded_feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
plt.figure(figsize=(20, 10))
plot_tree(
    tree_model,
    # The tree is grown without a depth limit; drawing every node produces an
    # unreadable figure and is very slow, so only the top levels are shown.
    # This limits the drawing only, not the fitted model.
    max_depth=3,
    filled=True,
    # Plain lists: some scikit-learn releases reject ndarray here.
    feature_names=list(encoded_feature_names),
    class_names=[str(label) for label in tree_model.classes_],
)
plt.show()
# 4. Model evaluation
# Predict on the test set.
y_pred = pipeline.predict(X_test)
# Accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
# Confusion matrix.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
# Precision / recall / F1 per class.
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)
# Plot the ROC curve and compute the AUC.
# The target holds string labels, so roc_curve cannot infer which class is
# "positive" and raises ValueError unless pos_label is given. Column 1 of
# predict_proba corresponds to classes_[1], so that class is used as positive.
positive_label = pipeline.classes_[1]
y_prob = pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob, pos_label=positive_label)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()
# 5. Hyperparameter tuning
# Search space; the "classifier__" prefix routes each setting to the
# pipeline step named 'classifier'.
param_grid = dict(
    classifier__criterion=['gini', 'entropy'],
    classifier__max_depth=[None, 10, 20, 30],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
)
# Exhaustive 5-fold grid search on the training split, using all CPU cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
# Report the winning parameter combination.
best_params = grid_search.best_params_
print("Best Parameters:", best_params)
# Take the pipeline fitted with the best parameters and predict on the
# held-out test set.
best_pipeline = grid_search.best_estimator_
y_pred_best = best_pipeline.predict(X_test)
# Evaluate the tuned model.
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f"Accuracy with Best Parameters: {accuracy_best:.4f}")
# 6. Cross-validation
# 5-fold accuracy of the tuned pipeline over the full cleaned dataset.
cv_scores = cross_val_score(
    best_pipeline,
    X,
    y,
    cv=5,
    scoring='accuracy',
)
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Average Accuracy: {np.mean(cv_scores):.4f}")
```
### 注意事项
1. **数据路径**:如果数据文件不在网络上,需要将 `url` 替换为本地文件路径。
2. **环境配置**:确保已安装所有必要的库,如 `pandas`、`numpy`、`scikit-learn`、`matplotlib` 和 `seaborn`。
3. **运行时间**:超参数调优可能需要较长时间,特别是在较大的数据集上。
希望这段代码能帮助你完成实验任务!如果有任何问题或需要进一步的帮助,请随时告诉我。
阅读全文