Implementing DecisionTreeClassifier in Python without sklearn
Sure, here is a simple decision tree classifier implementation. First, we define a node class to represent a node of the decision tree:
```python
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature      # index of the feature used to split
        self.threshold = threshold  # threshold value for the split
        self.left = left            # left subtree (feature < threshold)
        self.right = right          # right subtree (feature >= threshold)
        self.value = value          # predicted class for a leaf node
```
Next, we define the decision tree classifier class. It uses NumPy for array operations and `collections.Counter` for majority voting, so those imports are included at the top:
```python
from collections import Counter

import numpy as np


class DecisionTreeClassifier:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth                  # maximum tree depth (None = unlimited)
        self.min_samples_split = min_samples_split  # minimum samples required to split
        self.root = None                            # root node of the fitted tree

    def fit(self, X, y):
        self.root = self._build_tree(X, y)

    def predict(self, X):
        y_pred = []
        for sample in X:
            # Walk from the root down to a leaf (leaves have no children)
            node = self.root
            while node.left:
                if sample[node.feature] < node.threshold:
                    node = node.left
                else:
                    node = node.right
            y_pred.append(node.value)
        return y_pred

    def _build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(set(y))
        # Stop if there are too few samples to split, or the labels are already pure
        if n_samples < self.min_samples_split or n_labels == 1:
            return Node(value=self._most_common_label(y))
        # Stop if the maximum depth is reached, or there are no features to split on
        if (self.max_depth is not None and depth >= self.max_depth) or n_features == 0:
            return Node(value=self._most_common_label(y))
        # Find the best feature and threshold to split on
        best_feature, best_threshold = self._find_best_split(X, y)
        # If no valid split exists, return a leaf node
        if best_feature is None or best_threshold is None:
            return Node(value=self._most_common_label(y))
        # Partition the data on the chosen feature and threshold
        left_indices = X[:, best_feature] < best_threshold
        right_indices = X[:, best_feature] >= best_threshold
        left_X, left_y = X[left_indices], y[left_indices]
        right_X, right_y = X[right_indices], y[right_indices]
        # Recursively build the left and right subtrees
        left = self._build_tree(left_X, left_y, depth + 1)
        right = self._build_tree(right_X, right_y, depth + 1)
        # Return an internal node holding the split and its subtrees
        return Node(feature=best_feature, threshold=best_threshold, left=left, right=right)

    def _find_best_split(self, X, y):
        best_feature, best_threshold = None, None
        best_gini = 1
        # Iterate over every feature
        for feature in range(X.shape[1]):
            thresholds = sorted(set(X[:, feature]))
            # Try the midpoint between each pair of adjacent feature values
            for i in range(1, len(thresholds)):
                threshold = (thresholds[i - 1] + thresholds[i]) / 2
                # Compute the weighted Gini impurity of the split
                left_indices = X[:, feature] < threshold
                right_indices = X[:, feature] >= threshold
                n_left, n_right = sum(left_indices), sum(right_indices)
                gini_left = self._gini(y[left_indices]) if n_left > 0 else 0
                gini_right = self._gini(y[right_indices]) if n_right > 0 else 0
                gini = (n_left / len(y)) * gini_left + (n_right / len(y)) * gini_right
                # Keep the split with the lowest impurity seen so far
                if gini < best_gini:
                    best_feature, best_threshold = feature, threshold
                    best_gini = gini
        return best_feature, best_threshold

    def _gini(self, y):
        # Gini impurity: 1 minus the sum of squared class probabilities
        _, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        return 1 - sum(probs ** 2)

    def _most_common_label(self, y):
        # Majority vote: the most frequent label among the samples
        return Counter(y).most_common(1)[0][0]
```
This decision tree classifier uses the Gini index as its split criterion and supports limiting the maximum depth and the minimum number of samples required to split. The `fit` method trains the tree on the training data `X` and labels `y` (both expected as NumPy arrays, since the code relies on `X.shape` and boolean indexing), and the `predict` method uses the fitted tree to classify the samples in `X`.
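As a quick sanity check, here is a minimal usage sketch. The feature values and labels below are a made-up toy dataset for illustration only:

```python
import numpy as np

# Hypothetical toy dataset: two features, two well-separated classes
X_train = np.array([[2.0, 3.0], [1.0, 1.0], [3.5, 2.0],
                    [4.0, 5.0], [5.0, 4.5], [6.0, 6.0]])
y_train = np.array([0, 0, 0, 1, 1, 1])

clf = DecisionTreeClassifier(max_depth=3, min_samples_split=2)
clf.fit(X_train, y_train)

X_test = np.array([[1.5, 2.0], [5.5, 5.0]])
print(clf.predict(X_test))  # expected output: [0, 1]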