编写Python代码,利用决策树方法进行最高天气预测 注意不可以使用sklearn训练模型 year,month,day,week分别表示的具体的时间 temp_2:前天的最高温度值 temp_1:昨天的最高温度值 average:在历史中,每年这一天的平均最高温度值 actual:标签值,当天的真实最高温度 其中,原始数据中在week列中并不是一些数值特征,而是表示周几的字符串,需要进行数据预处理,使用编码方式:One-Hot Encoding,one-hot 编码类似于虚拟变量,是一种将分类变量转换为几个二进制列的方法。其中 1 代表某个输入属于该类别。数据集为temps.csv。编写一个python代码命名为“decision_tree_base”
时间: 2024-03-10 11:45:21 浏览: 140
好的,我可以为您编写一个基于决策树的最高天气预测的Python代码,但是需要先安装pandas和numpy库,您是否已经安装了这些库?
如果没有,请先通过以下命令安装:
```
pip install pandas numpy
```
然后,您可以使用以下代码实现您的需求:
```python
import pandas as pd
import numpy as np
# 加载数据集
data = pd.read_csv('temps.csv')
# 将week列进行one-hot编码
data = pd.concat([data, pd.get_dummies(data['week'])], axis=1)
data.drop(['week'], axis=1, inplace=True)
# 分割数据集
train_data = data.iloc[:250, :]
test_data = data.iloc[250:, :]
# 定义特征列和标签列
feature_cols = ['year', 'month', 'day', 'temp_2', 'temp_1', 'average', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
label_cols = ['actual']
# 定义节点类
class Node:
def __init__(self, feature=None, value=None, true_branch=None, false_branch=None, prediction=None):
self.feature = feature
self.value = value
self.true_branch = true_branch
self.false_branch = false_branch
self.prediction = prediction
# 定义决策树类
class DecisionTree:
def __init__(self, min_samples_split=2, max_depth=5):
self.min_samples_split = min_samples_split
self.max_depth = max_depth
# 计算数据集的方差
def variance(self, data):
labels = data[label_cols].values
mean = np.mean(labels)
return np.mean((labels - mean) ** 2)
# 选择最好的分割特征和分割值
def get_best_split(self, data):
best_variance = float('inf')
best_feature = None
best_value = None
n_features = len(feature_cols)
for i in range(n_features):
feature_values = data[feature_cols[i]].values
unique_values = np.unique(feature_values)
for value in unique_values:
true_data = data[feature_values <= value]
false_data = data[feature_values > value]
if len(true_data) > 0 and len(false_data) > 0:
variance = (len(true_data) / len(data)) * self.variance(true_data) + \
(len(false_data) / len(data)) * self.variance(false_data)
if variance < best_variance:
best_variance = variance
best_feature = feature_cols[i]
best_value = value
return best_feature, best_value
# 创建决策树
def create_tree(self, data, depth=0):
n_samples, n_features = data.shape
prediction = np.mean(data[label_cols].values)
# 停止条件
if n_samples >= self.min_samples_split and depth <= self.max_depth:
best_feature, best_value = self.get_best_split(data)
if best_feature is not None and best_value is not None:
true_data = data[data[best_feature] <= best_value]
false_data = data[data[best_feature] > best_value]
# 递归创建决策树
true_branch = self.create_tree(true_data, depth + 1)
false_branch = self.create_tree(false_data, depth + 1)
return Node(feature=best_feature, value=best_value, true_branch=true_branch, false_branch=false_branch)
return Node(prediction=prediction)
# 预测单个样本
def predict_sample(self, sample, node):
if node.prediction is not None:
return node.prediction
if sample[node.feature] <= node.value:
return self.predict_sample(sample, node.true_branch)
else:
return self.predict_sample(sample, node.false_branch)
# 预测数据集
def predict(self, data, tree):
predictions = []
for i in range(len(data)):
predictions.append(self.predict_sample(data.iloc[i], tree))
return predictions
# 创建决策树模型
tree = DecisionTree(min_samples_split=3, max_depth=3)
tree_model = tree.create_tree(train_data)
# 预测测试集并计算准确率
predictions = tree.predict(test_data[feature_cols], tree_model)
actuals = test_data[label_cols].values
accuracy = np.mean(predictions == actuals)
print('Accuracy:', accuracy)
```
这里我们将最小分割样本数设置为3、最大深度设置为3(见代码中 `DecisionTree(min_samples_split=3, max_depth=3)`,类的默认参数为2和5),并使用方差作为分裂准则。您可以根据需要进行调整。
阅读全文