不使用sklearn中的决策树方法,编程实现决策树构建算法(建议用python语言),并对鸢尾花数据集构建决策树。
时间: 2024-06-01 09:06:34 浏览: 16
决策树算法是一种基于树形结构来进行决策的方法,它可以对数据进行分类或者回归。下面我将介绍一下决策树的构建算法,并提供一个Python代码实现。
决策树的构建算法大致分为以下几个步骤:
1. 选取最优特征
2. 划分数据集
3. 递归构建决策树
4. 剪枝(可选步骤:下面的实现提供了 prune 方法,但 fit 过程不会自动调用它)
具体实现代码如下:
```python
import numpy as np
# Tree node: internal nodes carry a (feature, threshold) split,
# leaf nodes carry only a prediction value.
class Node:
    """A single node of the decision tree.

    Internal nodes hold ``feature``/``threshold`` plus ``left``/``right``
    children; leaf nodes hold only ``value``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature      # index of the feature this node splits on
        self.threshold = threshold  # samples with x[feature] < threshold go left
        self.left = left            # subtree for samples below the threshold
        self.right = right          # subtree for samples at/above the threshold
        self.value = value          # prediction stored at a leaf node
# CART decision tree classifier: gini impurity, axis-aligned splits of the
# form ``x[feature] < threshold``.
class DecisionTree:
    """A minimal CART decision tree classifier built from scratch.

    Parameters
    ----------
    max_depth : int
        Maximum depth of the tree; nodes at greater depth become leaves.
    min_samples_split : int
        A candidate split is rejected when either side would contain
        fewer samples than this.
    """

    def __init__(self, max_depth=5, min_samples_split=2):
        self.max_depth = max_depth                   # maximum tree depth
        self.min_samples_split = min_samples_split   # minimum samples per side of a split
        self.root = None                             # root Node, set by fit()

    def gini(self, y):
        """Return the Gini impurity of label vector ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        return 1 - np.sum(probs ** 2)

    def mse(self, y):
        """Return the mean squared error of ``y`` around its mean.

        Kept for regression experiments; unused by the gini-based fit.
        """
        return np.mean((y - np.mean(y)) ** 2)

    def _leaf_value(self, y):
        """Return the majority class label for a leaf.

        BUG FIX: the original stored ``np.mean(y)`` in leaves, which yields
        fractional values for integer class labels and breaks accuracy
        computation on classification data such as iris.
        """
        values, counts = np.unique(y, return_counts=True)
        return values[np.argmax(counts)]

    def select_feature_threshold(self, X, y):
        """Return the (feature, threshold) pair with the highest gini gain.

        Returns ``(None, None)`` when no split leaves at least
        ``min_samples_split`` samples on both sides.
        """
        best_feature, best_threshold, best_gain = None, None, -np.inf
        # BUG FIX: the original iterated ``range(X.shape)`` -- X.shape is a
        # tuple, so that raised TypeError; we want the number of columns.
        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                mask = X[:, feature] < threshold
                y_left, y_right = y[mask], y[~mask]
                if len(y_left) < self.min_samples_split or len(y_right) < self.min_samples_split:
                    continue
                gain = self.gain(y, y_left, y_right)
                if gain > best_gain:
                    best_feature, best_threshold, best_gain = feature, threshold, gain
        return best_feature, best_threshold

    def gain(self, parent, left, right):
        """Gini gain of splitting ``parent`` into ``left`` / ``right``."""
        w_left, w_right = len(left) / len(parent), len(right) / len(parent)
        impurity_left, impurity_right = self.gini(left), self.gini(right)
        return self.gini(parent) - (w_left * impurity_left + w_right * impurity_right)

    def build_tree(self, X, y, depth):
        """Recursively grow the tree; return the subtree's root Node."""
        # Stop when too few samples remain or the depth budget is spent.
        if len(y) < self.min_samples_split or depth > self.max_depth:
            return Node(value=self._leaf_value(y))
        # Pick the best admissible split.
        feature, threshold = self.select_feature_threshold(X, y)
        if feature is None or threshold is None:
            return Node(value=self._leaf_value(y))
        mask = X[:, feature] < threshold
        X_left, X_right = X[mask], X[~mask]
        y_left, y_right = y[mask], y[~mask]
        # select_feature_threshold already enforces the size constraint;
        # kept as a defensive check, mirroring the original code.
        if len(X_left) < self.min_samples_split or len(X_right) < self.min_samples_split:
            return Node(value=self._leaf_value(y))
        left = self.build_tree(X_left, y_left, depth + 1)
        right = self.build_tree(X_right, y_right, depth + 1)
        return Node(feature=feature, threshold=threshold, left=left, right=right)

    def _predict_from(self, node, x):
        """Route sample ``x`` from ``node`` down to a leaf; return its value."""
        while node.left is not None:
            node = node.left if x[node.feature] < node.threshold else node.right
        return node.value

    def prune(self, node, X_val, y_val):
        """Bottom-up reduced-error pruning against a validation set.

        Collapses ``node`` into a leaf when doing so does not increase the
        squared error on ``(X_val, y_val)``.
        """
        # Leaves cannot be pruned further.
        if node.left is None and node.right is None:
            return
        # Nothing to evaluate on an empty validation subset (also avoids
        # the division by zero the original had when len(y_val) == 0).
        if len(y_val) == 0:
            return
        mask = X_val[:, node.feature] < node.threshold
        X_val_left, X_val_right = X_val[mask], X_val[~mask]
        y_val_left, y_val_right = y_val[mask], y_val[~mask]
        self.prune(node.left, X_val_left, y_val_left)
        self.prune(node.right, X_val_right, y_val_right)
        # BUG FIX: the original called node.left.predict(...), but Node has
        # no predict method (AttributeError); route the validation samples
        # through each subtree instead.
        pred_left = np.array([self._predict_from(node.left, x) for x in X_val_left])
        pred_right = np.array([self._predict_from(node.right, x) for x in X_val_right])
        error_no_prune = np.sum((pred_left - y_val_left) ** 2) + \
                         np.sum((pred_right - y_val_right) ** 2)
        leaf_value = self._leaf_value(y_val)
        error_prune = np.sum((leaf_value - y_val) ** 2)
        if error_prune < error_no_prune:
            node.left = None
            node.right = None
            node.value = leaf_value

    def fit(self, X_train, y_train):
        """Fit the tree on training data (does not call prune automatically)."""
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        if len(y_train) == 0:
            raise ValueError("fit requires at least one training sample")
        self.root = self.build_tree(X_train, y_train, 1)

    def predict_one(self, x):
        """Predict the label of a single sample ``x``."""
        return self._predict_from(self.root, x)

    def predict(self, X_test):
        """Predict labels for every row of ``X_test``; returns an array."""
        return np.array([self.predict_one(x) for x in np.asarray(X_test)])
```
然后我们可以用鸢尾花数据集来训练这个决策树模型:
```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the iris data set and split it into train / test partitions.
# stratify keeps the class balance; random_state makes the run reproducible.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=42, stratify=iris.target
)

# Fit the hand-written CART tree and predict the test set.
model = DecisionTree()
model.fit(X_train, y_train)
# BUG FIX: cast predictions to integer class labels — if leaves store class
# means (floats), accuracy_score raises "continuous is not supported".
y_pred = np.round(model.predict(X_test)).astype(int)

# Report the model's accuracy on the held-out test set.
print("Accuracy:", accuracy_score(y_test, y_pred))
```