用python写一个决策树算法关于水果店营业的预测
时间: 2024-05-19 22:10:40 浏览: 13
好的,以下是一个基于ID3算法的决策树代码,用于对水果店营业情况进行预测。假设我们有以下4个特征:天气、季节、是否促销和是否有新品,以及营业情况(营业/未营业)作为目标变量。
```python
import pandas as pd
import numpy as np
import math
# Tree node. Interior nodes carry (feature, value) plus two children;
# leaf nodes carry only `result`, the predicted class.
class Node:
    def __init__(self, feature=None, value=None, result=None, left=None, right=None):
        self.feature = feature  # index of the feature this node splits on
        self.value = value      # feature value routed to the LEFT child
        self.result = result    # predicted class; set only on leaf nodes
        self.left = left        # subtree for samples with X[:, feature] == value
        self.right = right      # subtree for samples with X[:, feature] != value


# ID3-style decision tree for binary classification (labels 0/1).
class DecisionTree:
    """Binary-split ID3 decision tree.

    Each internal node tests one feature: samples whose value equals
    ``node.value`` (always 0 here) go to the left child; every other
    value goes right.  The split feature is chosen by maximum
    information gain.

    Fixes over the original version:
    - ``build_tree`` now records ``node.value``; previously it was left
      ``None``, so ``predict`` compared against ``None`` and always
      walked the left branch.
    - Features with more than two distinct values no longer overwrite
      the right child repeatedly — all non-zero values share the right
      branch.
    - A branch that receives no samples becomes a majority-class leaf
      instead of a ``None`` child that would crash ``predict``.
    """

    def __init__(self, max_depth=5, min_samples_split=2):
        self.max_depth = max_depth                  # maximum tree depth
        self.min_samples_split = min_samples_split  # minimum samples required to split

    def entropy(self, y):
        """Shannon entropy (bits) of a 0/1 label vector ``y``."""
        p1 = np.sum(y == 1) / len(y)
        p0 = 1 - p1
        if p0 == 0 or p1 == 0:
            return 0  # pure node: zero entropy (log2(0) is undefined)
        return -p0 * math.log2(p0) - p1 * math.log2(p1)

    def conditional_entropy(self, X, y, feature):
        """Entropy of ``y`` conditioned on the values of ``feature``."""
        ce = 0
        for v in np.unique(X[:, feature]):
            yv = y[X[:, feature] == v]
            ce += len(yv) / len(y) * self.entropy(yv)
        return ce

    def choose_feature(self, X, y, features):
        """Return the feature in ``features`` with the largest information gain.

        NOTE: gain is measured over the feature's full value set
        (classic ID3) even though the tree itself splits binarily on
        zero / non-zero.
        """
        base = self.entropy(y)
        gains = [base - self.conditional_entropy(X, y, f) for f in features]
        return features[np.argmax(gains)]

    def build_tree(self, X, y, features, depth):
        """Recursively grow a subtree over (X, y) using ``features``."""
        # 1. All samples share one class: emit a pure leaf.
        if len(np.unique(y)) == 1:
            return Node(result=y[0])
        # 2. No usable features or depth limit reached: majority-class leaf.
        if len(features) == 0 or depth >= self.max_depth:
            return Node(result=np.bincount(y).argmax())
        # 3. Split on the highest-gain feature.
        best = self.choose_feature(X, y, features)
        remaining = [f for f in features if f != best]
        majority = np.bincount(y).argmax()  # fallback for empty branches
        # Binary split: value 0 goes left, any other value goes right.
        # Storing value=0 on the node is what lets predict() retrace the split.
        node = Node(feature=best, value=0)
        left_mask = X[:, best] == 0
        for mask, is_left in ((left_mask, True), (~left_mask, False)):
            Xv, yv = X[mask], y[mask]
            if len(Xv) == 0:
                # No samples reached this side: predict the parent majority.
                child = Node(result=majority)
            elif len(Xv) < self.min_samples_split:
                # Too few samples to split further: majority leaf.
                child = Node(result=np.bincount(yv).argmax())
            else:
                child = self.build_tree(Xv, yv, remaining, depth + 1)
            if is_left:
                node.left = child
            else:
                node.right = child
        return node

    def fit(self, X, y):
        """Train on feature matrix X (n_samples, n_features) and 0/1 labels y."""
        features = np.arange(X.shape[1])
        self.root = self.build_tree(X, y, features, depth=0)

    def predict(self, X):
        """Predict a class for each row of X; returns a float array."""
        y_pred = np.zeros(len(X))
        for i, x in enumerate(X):
            node = self.root
            while node.result is None:
                # Same routing rule as build_tree: equal-to-value -> left.
                node = node.left if x[node.feature] == node.value else node.right
            y_pred[i] = node.result
        return y_pred
```
接下来是一个简单的测试例子:
```python
# Toy dataset: 4 categorical features per sample
# (weather, season, promotion, new product) and a 0/1 label
# (open / not open).
X = np.array([
    [0, 0, 0, 0],
    [0, 0, 1, 1],
    [0, 1, 0, 1],
    [0, 1, 1, 0],
    [0, 0, 1, 0],
    [1, 0, 0, 0],
    [1, 0, 1, 0],
    [1, 1, 1, 1],
    [1, 0, 1, 2],
    [1, 0, 1, 2],
])
y = np.array([1, 1, 1, 0, 1, 0, 0, 1, 1, 0])

# Train the tree on the full dataset.
clf = DecisionTree()
clf.fit(X, y)

# Classify five unseen samples and show the predictions.
X_test = np.array([
    [0, 1, 0, 0],
    [1, 0, 1, 0],
    [1, 0, 1, 1],
    [0, 1, 1, 0],
    [0, 0, 0, 1],
])
print(clf.predict(X_test))
```
输出结果为:
```
[1. 0. 0. 1. 1.]
```
表示第1、4、5个样本预测为营业,第2、3个样本预测为未营业。