Implementing a Decision Tree Algorithm in Python
Sure, below is a Python implementation of the decision tree algorithm.
A decision tree is a classic algorithm for both classification and regression. It recursively partitions the data to build a tree, which can then classify new samples or predict values for them. Each internal node of the tree tests one feature, splitting the data on that feature's values, and the splitting continues until a stopping condition is met.
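To make the split criterion concrete, here is a small hand-computed entropy and information-gain example (a minimal sketch with made-up labels, matching the formulas used in the code further below):
```python
import numpy as np

def entropy(y):
    # H(y) = -sum(p * log2(p)) over the class proportions p
    ps = np.bincount(y) / len(y)
    return -np.sum([p * np.log2(p) for p in ps if p > 0])

# Toy labels: 4 samples of class 0, then 4 of class 1
y = np.array([0, 0, 0, 0, 1, 1, 1, 1])
print(entropy(y))  # 1.0 bit: a 50/50 class mix is maximally impure

# A perfect split yields two pure children with weighted entropy 0,
# so the information gain equals the parent entropy: 1.0
left, right = y[:4], y[4:]
gain = entropy(y) \
    - (len(left) / len(y)) * entropy(left) \
    - (len(right) / len(y)) * entropy(right)
print(gain)  # 1.0
```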
Here is example code for a CART-style binary decision tree in Python (note that it uses entropy-based information gain as the split criterion, rather than the Gini index that CART traditionally uses):
```python
import numpy as np


class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature      # index of the feature used for the split
        self.threshold = threshold  # threshold value for the split
        self.left = left            # left subtree (feature <= threshold)
        self.right = right          # right subtree (feature > threshold)
        self.value = value          # predicted class at a leaf node


class DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=2, min_impurity=1e-7):
        self.max_depth = max_depth                  # maximum tree depth
        self.min_samples_split = min_samples_split  # minimum samples needed to split a node
        self.min_impurity = min_impurity            # minimum information gain required to split

    def fit(self, X, y):
        self.n_classes = len(set(y))
        self.n_features = X.shape[1]
        self.tree = self._grow_tree(X, y)

    def predict(self, X):
        return [self._predict(inputs) for inputs in X]

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = [np.sum(y == c) for c in range(self.n_classes)]
        label = np.argmax(n_labels)  # majority class at this node
        # Stop if the maximum depth is reached, the node is too small, or the node is pure
        if (depth == self.max_depth
                or n_samples < self.min_samples_split
                or np.max(n_labels) == n_samples):
            return Node(value=label)
        # Choose the best feature and threshold to split on, scanning all features
        # (sampling a random subset here instead would give a random-forest-style tree)
        feat_idxs = np.arange(n_features)
        best_feat, best_thresh = self._best_split(X, y, feat_idxs)
        # If no split yields enough gain, make this node a leaf
        if best_feat is None:
            return Node(value=label)
        # Recursively grow the left and right subtrees
        left_idxs = np.argwhere(X[:, best_feat] <= best_thresh).flatten()
        right_idxs = np.argwhere(X[:, best_feat] > best_thresh).flatten()
        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth=depth + 1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth=depth + 1)
        return Node(best_feat, best_thresh, left, right)

    def _best_split(self, X, y, feat_idxs):
        # A candidate split must gain more than min_impurity to be accepted
        best_gain = self.min_impurity
        split_idx, split_thresh = None, None
        for i in feat_idxs:
            thresholds = np.unique(X[:, i])
            for thresh in thresholds:
                gain = self._information_gain(y, X[:, i], thresh)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = i
                    split_thresh = thresh
        return split_idx, split_thresh

    def _information_gain(self, y, X_feat, split_thresh):
        # Entropy of the parent node
        parent_entropy = self._entropy(y)
        left_idxs = np.argwhere(X_feat <= split_thresh).flatten()
        right_idxs = np.argwhere(X_feat > split_thresh).flatten()
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0
        # Weighted average entropy of the two children
        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        e_l, e_r = self._entropy(y[left_idxs]), self._entropy(y[right_idxs])
        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r
        return parent_entropy - child_entropy

    def _entropy(self, y):
        hist = np.bincount(y)
        ps = hist / np.sum(hist)
        return -np.sum([p * np.log2(p) for p in ps if p > 0])

    def _predict(self, inputs):
        # Walk from the root to a leaf, following the split tests
        node = self.tree
        while node.value is None:
            if inputs[node.feature] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value
```
The implementation uses numpy for array computations and supports both training and prediction. _grow_tree recursively builds the tree, _best_split selects the best feature and threshold for a split, _information_gain computes the information gain of a candidate split, and _entropy computes the entropy of a label set.
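If you want to inspect the learned splits, a small recursive printer over the Node structure above can help (this helper is not part of the original code; a minimal sketch):
```python
def print_tree(node, depth=0):
    indent = "  " * depth
    if node.value is not None:
        # Leaf: print the predicted class
        print(f"{indent}predict -> class {node.value}")
    else:
        # Internal node: print the split test, then both branches
        print(f"{indent}if x[{node.feature}] <= {node.threshold}:")
        print_tree(node.left, depth + 1)
        print(f"{indent}else:")
        print_tree(node.right, depth + 1)

# After fitting a DecisionTree clf, call: print_tree(clf.tree)
```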
Usage example:
```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = DecisionTree(max_depth=10)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)
```
This test loads the iris dataset from sklearn, splits it into training and test sets, trains the DecisionTree model implemented above to make predictions, and finally prints the accuracy on the test set.
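As a sanity check, you can compare against scikit-learn's built-in DecisionTreeClassifier on the same split (a minimal sketch; the exact accuracies need not match, since tie-breaking and threshold selection differ between implementations):
```python
from sklearn.tree import DecisionTreeClassifier

# Use the entropy criterion to mirror the information-gain splits above
sk_clf = DecisionTreeClassifier(criterion="entropy", max_depth=10, random_state=42)
sk_clf.fit(X_train, y_train)
sk_acc = accuracy_score(y_test, sk_clf.predict(X_test))
print("sklearn accuracy:", sk_acc)
```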