sklearn机器学习模板
时间: 2023-09-07 21:11:11 浏览: 36
# sklearn机器学习模板
## 1. 导入库和数据
```python
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Load the dataset from the CSV file.
data = pd.read_csv('data.csv')
# Separate the label column ('target') from the remaining feature columns.
y = data['target']
X = data.drop(columns='target')
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
```
## 2. 特征工程
```python
# Standardize the features. The scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits, so no statistics
# from the test split are used during fitting.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
```
## 3. 训练模型
```python
# Fit three classifiers on the same training split so they can be compared.

# Logistic regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
# Decision tree. Seeded so repeated runs build the same tree.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
# Random forest. Seeded so the bootstrap samples and feature sub-sampling
# are repeatable from run to run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
```
## 4. 模型评估
```python
# Predict on the held-out split with each fitted model (same order: lr, dt, rf).
y_pred_lr, y_pred_dt, y_pred_rf = (clf.predict(X_test) for clf in (lr, dt, rf))

# Pair every printed label with the predictions it describes.
predictions = (
    ('Logistic Regression:', y_pred_lr),
    ('Decision Tree:', y_pred_dt),
    ('Random Forest:', y_pred_rf),
)

# Accuracy
for title, predicted in predictions:
    print(title, accuracy_score(y_test, predicted))
# Confusion matrix
for title, predicted in predictions:
    print(title + '\n', confusion_matrix(y_test, predicted))
# Classification report
for title, predicted in predictions:
    print(title + '\n', classification_report(y_test, predicted))
```
## 5. 调参
```python
from sklearn.model_selection import GridSearchCV
# Hyper-parameter tuning for the decision tree: exhaustive search over the
# grid below, scored with 5-fold cross-validation on the training split.
param_grid = {
    'max_depth': [2, 4, 6, 8, 10],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5],
}
# The base estimator is passed inline rather than bound to `dt`:
# GridSearchCV fits clones of it, so binding it to `dt` would leave `dt`
# pointing at an unfitted tree. It is seeded so the search is repeatable.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train, y_train)
print('Best parameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)
# With the default refit=True, the best configuration is refitted on the whole
# training split; keep that fitted, tuned tree under the usual name.
dt = grid_search.best_estimator_
```
## 6. 模型保存
```python
import pickle
# Save the fitted random forest to disk.
# NOTE(review): only the classifier is stored here, not the scaler; data passed
# to the reloaded model must be standardized the same way as the training data.
with open('model.pkl', 'wb') as f:
    pickle.dump(rf, f)
# Load the model back.
# WARNING: unpickling can execute arbitrary code; only load files you created
# yourself or otherwise trust.
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)
# Use the restored model for prediction.
y_pred = model.predict(X_test)
```