A Hand-Written Decision Tree
Sure, here is a simple example implementation of a decision tree written from scratch:
```python
# Import the required libraries
import numpy as np
from collections import Counter

class DecisionTree:
    def __init__(self, max_depth=5, min_samples_leaf=2):
        self.max_depth = max_depth                # maximum depth of the tree
        self.min_samples_leaf = min_samples_leaf  # minimum number of samples in a leaf
        self.tree = None                          # the fitted tree

    def fit(self, X, y):
        self.tree = self._build_tree(X, y)

    def predict(self, X):
        # Predict a single sample
        if len(X.shape) == 1:
            return self._predict_one(X, self.tree)
        # Predict a batch of samples
        else:
            return np.array([self._predict_one(x, self.tree) for x in X])

    def _build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        # At the maximum depth, or with too few samples, return a leaf
        # labeled with the majority class
        if depth == self.max_depth or n_samples < self.min_samples_leaf:
            return Counter(y).most_common(1)[0][0]
        # Choose the best feature and threshold to split on
        best_feature, best_threshold = self._choose_best_split(X, y)
        # If no split improves the impurity, return a majority-class leaf
        if best_feature is None:
            return Counter(y).most_common(1)[0][0]
        # Partition the data into left and right subsets at the threshold
        left_idxs = X[:, best_feature] < best_threshold
        right_idxs = X[:, best_feature] >= best_threshold
        # Recursively build the left and right subtrees
        left_tree = self._build_tree(X[left_idxs], y[left_idxs], depth + 1)
        right_tree = self._build_tree(X[right_idxs], y[right_idxs], depth + 1)
        # Return this internal node
        return {'feature': best_feature,
                'threshold': best_threshold,
                'left': left_tree,
                'right': right_tree}

    def _choose_best_split(self, X, y):
        best_feature = None
        best_threshold = None
        # A split is accepted only if it improves on the parent's Gini
        # impurity, so pure nodes are never split further
        best_gini = self._gini(y)
        n_samples, n_features = X.shape
        # Iterate over every feature
        for feature in range(n_features):
            # Every distinct value of the feature is a candidate threshold
            for threshold in np.unique(X[:, feature]):
                # Split the data into two parts at the threshold
                left_idxs = X[:, feature] < threshold
                right_idxs = X[:, feature] >= threshold
                # Skip splits that leave either side empty
                if len(y[left_idxs]) == 0 or len(y[right_idxs]) == 0:
                    continue
                # Gini impurity of each side
                left_gini = self._gini(y[left_idxs])
                right_gini = self._gini(y[right_idxs])
                # Weighted Gini impurity of the split
                weighted_gini = (len(y[left_idxs]) / n_samples * left_gini
                                 + len(y[right_idxs]) / n_samples * right_gini)
                # Keep the split if it beats the current best
                if weighted_gini < best_gini:
                    best_feature = feature
                    best_threshold = threshold
                    best_gini = weighted_gini
        # Return the best feature and threshold (None, None if no split helps)
        return best_feature, best_threshold

    def _gini(self, y):
        # Gini impurity: 1 minus the sum of squared class proportions
        _, counts = np.unique(y, return_counts=True)
        proportions = counts / len(y)
        return 1 - np.sum(proportions ** 2)

    def _predict_one(self, x, tree):
        # A leaf stores the predicted class label directly (checking for
        # dict is safer than checking for str/int, since labels coming out
        # of a NumPy array are NumPy scalars, not Python ints)
        if not isinstance(tree, dict):
            return tree
        # Otherwise follow this node's split rule into a subtree
        if x[tree['feature']] < tree['threshold']:
            return self._predict_one(x, tree['left'])
        else:
            return self._predict_one(x, tree['right'])
```
The code above implements a classification decision tree in the style of CART: every node makes a binary split on a single feature threshold, and the Gini index is the split criterion. The `max_depth` and `min_samples_leaf` parameters control the tree's complexity, and with it the trade-off between fitting the training data and generalizing to new samples.
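For intuition about the criterion: the Gini impurity of a node is `1 - sum(p_k ** 2)` over its class proportions `p_k`. A node holding labels `[0, 0, 1, 1]` scores `1 - (0.5**2 + 0.5**2) = 0.5`, the worst case for two classes, while a pure node scores 0. A quick sanity check against `_gini`:
```python
t = DecisionTree()
print(t._gini(np.array([0, 0, 1, 1])))  # 0.5: maximally impure for two classes
print(t._gini(np.array([1, 1, 1, 1])))  # 0.0: a pure node
```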
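Finally, a minimal end-to-end sketch, assuming the `DecisionTree` class above is in scope; the tiny two-class dataset here is made up purely for illustration:
```python
import numpy as np

# A made-up, linearly separable two-class dataset (illustrative only)
X = np.array([[2.0, 3.0], [1.0, 1.5], [3.5, 4.0], [1.2, 0.8],
              [6.0, 7.0], [7.5, 8.0], [8.0, 6.5], [6.8, 7.2]])
y = np.array([0, 0, 0, 0, 1, 1, 1, 1])

clf = DecisionTree(max_depth=3, min_samples_leaf=1)
clf.fit(X, y)

print(clf.predict(np.array([2.5, 2.0])))  # single sample -> 0
print(clf.predict(X))                     # batch -> [0 0 0 0 1 1 1 1]
```
Since the fitted model is stored as nested dictionaries in `clf.tree`, printing it is an easy way to inspect the learned splits.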