用决策树回归算法对单车数据按以下基本流程进行处理:1、初始建模;2、调参(绘制学习曲线,调整树的深度);3、输出两种交叉验证方式得到的模型稳定性结果(包括 R² 和均方误差);4、从测试集中提取前 100 个样本绘制拟合曲线图,并对每个数据点做特别标记。
时间: 2024-06-05 11:06:53 浏览: 81
利用决策树求解回归问题
以下是用决策树回归算法对单车数据进行建模的代码示例:
# 导入所需模块
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.metrics import r2_score, mean_squared_error
# Load the bike-sharing data from the working directory.
data = pd.read_csv('bike.csv')
# Features are the first 11 columns and the label is the last column
# (assumes this matches the column layout of bike.csv -- TODO confirm).
X, y = data.iloc[:, 0:11], data.iloc[:, -1]
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
# Baseline model: a tree with default settings. fit() returns the estimator
# itself, so construction and training can be chained.
dt = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
# Learning curve: R2 on the training folds and on the validation folds as the
# amount of training data grows.
size_fractions = np.linspace(0.1, 1.0, 10)
# learning_curve returns (train_sizes_abs, train_scores, test_scores); the
# absolute sizes are discarded because the x-axis uses the requested fractions.
_, train_scores, test_scores = learning_curve(
    dt, X_train, y_train, cv=10, scoring='r2', train_sizes=size_fractions
)
# Score arrays have one row per training-set size and one column per CV fold,
# so aggregating over axis=1 summarises the folds.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(size_fractions, train_mean, label='Training score')
plt.plot(size_fractions, test_mean, label='Cross-validation score')
# Shaded bands show +/- one standard deviation across folds.
plt.fill_between(size_fractions, train_mean - train_std, train_mean + train_std, alpha=0.2)
plt.fill_between(size_fractions, test_mean - test_std, test_mean + test_std, alpha=0.2)
plt.xlabel('Training set size')
plt.ylabel('R2 score')
plt.legend(loc='best')
plt.show()
# Depth tuning: for each max_depth compare the training R2 with the mean
# 10-fold cross-validated R2. Candidates are scored on the training data only,
# so the test set stays untouched for the final evaluation.
depths = range(1, 21)
train_scores, valid_scores = [], []
for depth in depths:
    candidate = DecisionTreeRegressor(max_depth=depth, random_state=0)
    # cross_val_score fits clones of the estimator, leaving `candidate` unfitted.
    fold_r2 = cross_val_score(candidate, X_train, y_train, cv=10, scoring='r2')
    valid_scores.append(fold_r2.mean())
    candidate.fit(X_train, y_train)
    train_scores.append(candidate.score(X_train, y_train))
plt.plot(depths, train_scores, label='Training score')
plt.plot(depths, valid_scores, label='Cross-validation score')
plt.xlabel('Depth of tree')
plt.ylabel('R2 score')
plt.legend(loc='best')
plt.show()
# Keep the depth with the best validation score and refit, so that the code
# after this section works with the tuned model rather than the last
# (deepest) tree tried in the loop.
best_depth = depths[int(np.argmax(valid_scores))]
print('Best max_depth:', best_depth)
dt = DecisionTreeRegressor(max_depth=best_depth, random_state=0)
dt.fit(X_train, y_train)
# Cross-validation results (10 folds on the training data) to gauge how stable
# the model is: per-fold and mean values for both R2 and MSE.
cv_scores = cross_val_score(dt, X_train, y_train, cv=10, scoring='r2')
print('Cross-validation R2 scores:', cv_scores)
print('Mean R2 score:', np.mean(cv_scores))
# scikit-learn scorers follow "greater is better", so MSE is exposed as
# 'neg_mean_squared_error'; negate it to report ordinary (positive) MSE.
cv_mse = -cross_val_score(dt, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
print('Cross-validation MSE scores:', cv_mse)
print('Mean cross-validation MSE:', np.mean(cv_mse))
# Error of the already-fitted model on the held-out test set (a single split,
# not a cross-validated figure).
print('Mean squared error:', mean_squared_error(y_test, dt.predict(X_test)))
# Fitted-curve plot for the first 100 test samples: predictions and actual
# values are drawn as lines, with a marker on every individual point.
n_shown = 100
# .iloc makes the "first n rows" selection explicitly positional, since the
# shuffled split leaves the test set with a non-sequential index.
y_pred = dt.predict(X_test.iloc[:n_shown])
y_true = y_test.iloc[:n_shown]
sample_idx = range(len(y_pred))
plt.plot(sample_idx, y_pred, color='r', marker='o', label='Prediction')
plt.plot(sample_idx, y_true, color='b', marker='o', label='Actual')
plt.xlabel('Sample index')
plt.ylabel('Count')
plt.legend(loc='best')
plt.show()
阅读全文