决策树算法实现智能数据学习详细代码
时间: 2023-10-26 19:20:10 浏览: 78
面向对象程序设计课程设计:利用决策树判断西瓜质量(源代码)
决策树算法可以用来进行分类或者回归任务,下面分别给出分类和回归任务的决策树算法实现。
分类决策树算法实现:
```python
import numpy as np
class Node:
def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, value=None):
self.feature_index = feature_index # 特征索引
self.threshold = threshold # 分割阈值
self.left = left # 左子树
self.right = right # 右子树
self.info_gain = info_gain # 信息增益
self.value = value # 叶子节点的类别值
class DecisionTreeClassifier:
def __init__(self, max_depth=None, min_samples_split=2):
self.max_depth = max_depth # 决策树最大深度
self.min_samples_split = min_samples_split # 分割所需最小样本数
self.root = None # 决策树根节点
def fit(self, X, y):
self.root = self._grow_tree(X, y)
def _grow_tree(self, X, y, depth=0):
n_samples, n_features = X.shape
n_labels = len(np.unique(y))
# 如果样本数小于所需最小样本数或者树达到最大深度,返回叶子节点
if (n_samples < self.min_samples_split) or (self.max_depth is not None and depth >= self.max_depth):
return Node(value=self._most_common_label(y))
# 计算每个特征的信息增益,并选择最好的特征进行分割
best_feature, best_threshold, best_info_gain = self._best_criteria(X, y)
# 如果信息增益小于等于0,则返回叶子节点
if best_info_gain == 0:
return Node(value=self._most_common_label(y))
# 递归构建左右子树
left_indices, right_indices = self._split(X[:, best_feature], best_threshold)
left = self._grow_tree(X[left_indices, :], y[left_indices], depth+1)
right = self._grow_tree(X[right_indices, :], y[right_indices], depth+1)
return Node(best_feature, best_threshold, left, right, best_info_gain)
def _best_criteria(self, X, y):
best_info_gain = -1
best_feature = None
best_threshold = None
# 在所有特征中选择最好的分割点
for feature_index in range(X.shape[1]):
# 计算当前特征的信息增益和最佳分割点
feature_values = X[:, feature_index]
thresholds = np.unique(feature_values)
for threshold in thresholds:
info_gain = self._information_gain(y, feature_values, threshold)
if info_gain > best_info_gain:
best_info_gain = info_gain
best_feature = feature_index
best_threshold = threshold
return best_feature, best_threshold, best_info_gain
def _information_gain(self, y, X_feature, threshold):
parent_entropy = self._entropy(y)
left_indices, right_indices = self._split(X_feature, threshold)
if len(left_indices) == 0 or len(right_indices) == 0:
return 0
n = len(y)
n_l, n_r = len(left_indices), len(right_indices)
e_l, e_r = self._entropy(y[left_indices]), self._entropy(y[right_indices])
child_entropy = (n_l / n) * e_l + (n_r / n) * e_r
return parent_entropy - child_entropy
def _entropy(self, y):
n_labels = len(np.unique(y))
if n_labels <= 1:
return 0
counts = np.bincount(y)
probabilities = counts / len(y)
entropy = -np.sum([p * np.log2(p) for p in probabilities if p > 0])
return entropy
def _split(self, X_feature, threshold):
left_indices = np.argwhere(X_feature <= threshold).flatten()
right_indices = np.argwhere(X_feature > threshold).flatten()
return left_indices, right_indices
def _most_common_label(self, y):
return np.bincount(y).argmax()
def predict(self, X):
return np.array([self._traverse_tree(x, self.root) for x in X])
def _traverse_tree(self, x, node):
if node.value is not None:
return node.value
if x[node.feature_index] <= node.threshold:
return self._traverse_tree(x, node.left)
else:
return self._traverse_tree(x, node.right)
```
回归决策树算法实现:
```python
import numpy as np
class Node:
def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
self.feature_index = feature_index # 特征索引
self.threshold = threshold # 分割阈值
self.left = left # 左子树
self.right = right # 右子树
self.value = value # 叶子节点的预测值
class DecisionTreeRegressor:
def __init__(self, max_depth=None, min_samples_split=2):
self.max_depth = max_depth # 决策树最大深度
self.min_samples_split = min_samples_split # 分割所需最小样本数
self.root = None # 决策树根节点
def fit(self, X, y):
self.root = self._grow_tree(X, y)
def _grow_tree(self, X, y, depth=0):
n_samples, n_features = X.shape
# 如果样本数小于所需最小样本数或者树达到最大深度,返回叶子节点
if (n_samples < self.min_samples_split) or (self.max_depth is not None and depth >= self.max_depth):
return Node(value=np.mean(y))
# 计算每个特征的最佳分割点,并选择最好的特征进行分割
best_feature, best_threshold = self._best_criteria(X, y)
# 递归构建左右子树
left_indices, right_indices = self._split(X[:, best_feature], best_threshold)
left = self._grow_tree(X[left_indices, :], y[left_indices], depth+1)
right = self._grow_tree(X[right_indices, :], y[right_indices], depth+1)
return Node(best_feature, best_threshold, left, right)
def _best_criteria(self, X, y):
best_mse = np.inf
best_feature = None
best_threshold = None
# 在所有特征中选择最好的分割点
for feature_index in range(X.shape[1]):
# 计算当前特征的最佳分割点
feature_values = X[:, feature_index]
thresholds = np.unique(feature_values)
for threshold in thresholds:
mse = self._mse(y, feature_values, threshold)
if mse < best_mse:
best_mse = mse
best_feature = feature_index
best_threshold = threshold
return best_feature, best_threshold
def _mse(self, y, X_feature, threshold):
left_indices, right_indices = self._split(X_feature, threshold)
if len(left_indices) == 0 or len(right_indices) == 0:
return np.inf
y_left, y_right = y[left_indices], y[right_indices]
mse = (len(y_left) * np.var(y_left) + len(y_right) * np.var(y_right)) / len(y)
return mse
def _split(self, X_feature, threshold):
left_indices = np.argwhere(X_feature <= threshold).flatten()
right_indices = np.argwhere(X_feature > threshold).flatten()
return left_indices, right_indices
def predict(self, X):
return np.array([self._traverse_tree(x, self.root) for x in X])
def _traverse_tree(self, x, node):
if node.value is not None:
return node.value
if x[node.feature_index] <= node.threshold:
return self._traverse_tree(x, node.left)
else:
return self._traverse_tree(x, node.right)
```
阅读全文