cart分类树的代码实现
时间: 2023-10-06 09:09:10 浏览: 76
下面是一个简单的 CART分类树的Python代码实现示例:
```python
import numpy as np
class CARTDecisionTree:
def __init__(self, max_depth=5, min_samples_split=2, min_samples_leaf=1):
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.min_samples_leaf = min_samples_leaf
def fit(self, X, y):
self.n_classes_ = len(np.unique(y))
self.tree_ = self._grow_tree(X, y)
def predict(self, X):
return [self._predict(inputs) for inputs in X]
def _grow_tree(self, X, y, depth=0):
n_samples, n_features = X.shape
n_labels = len(np.unique(y))
# 终止条件
if (depth >= self.max_depth or n_labels == 1 or
n_samples < self.min_samples_split):
return LeafNode(y)
# 找到最优切分特征和切分点
best_feat, best_val = self._best_split(X, y, n_samples, n_features)
# 如果没有合适的切分特征,则返回叶节点
if best_feat is None:
return LeafNode(y)
# 对数据集进行切分
left_idxs = X[:, best_feat] < best_val
right_idxs = X[:, best_feat] >= best_val
left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth+1)
right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth+1)
return DecisionNode(best_feat, best_val, left, right)
def _best_split(self, X, y, n_samples, n_features):
best_gain = -1
best_feat = None
best_val = None
# 计算基尼指数
impurity = self._gini(y)
# 遍历所有特征,找到最优切分特征和切分点
for feat in range(n_features):
thresholds = np.unique(X[:, feat])
for val in thresholds:
left_idxs = X[:, feat] < val
right_idxs = X[:, feat] >= val
n_left, n_right = sum(left_idxs), sum(right_idxs)
if n_left == 0 or n_right == 0:
continue
gain = impurity - (n_left/n_samples*self._gini(y[left_idxs]) +
n_right/n_samples*self._gini(y[right_idxs]))
if gain > best_gain:
best_gain = gain
best_feat = feat
best_val = val
# 如果最优增益小于阈值,则返回None
if best_gain < self.min_samples_leaf:
return None, None
return best_feat, best_val
def _gini(self, y):
n_samples = len(y)
gini = 1.0
for label in range(self.n_classes_):
p = sum(y == label) / n_samples
gini -= p ** 2
return gini
def _predict(self, inputs):
node = self.tree_
while isinstance(node, DecisionNode):
if inputs[node.feature] < node.threshold:
node = node.left
else:
node = node.right
return node.value
class DecisionNode:
def __init__(self, feature, threshold, left, right):
self.feature = feature
self.threshold = threshold
self.left = left
self.right = right
class LeafNode:
def __init__(self, y):
self.value = np.bincount(y).argmax()
```
这段代码实现了一个基于CART算法的分类树,其中使用基尼指数作为划分标准,实现了树的生长、预测等功能。
阅读全文