用python实现的决策树算法
时间: 2023-07-20 17:32:31 浏览: 44
好的,下面是用Python实现的决策树算法,以ID3算法为例:
```python
import numpy as np
from collections import Counter
class ID3DecisionTree:
    """Decision-tree classifier grown top-down by maximizing information gain.

    Splits are binary threshold tests (``x[feature] <= threshold``) on
    numeric features, scored by entropy-based information gain.
    Labels are assumed to be non-negative integers ``0..k-1``
    (the original code made the same assumption).

    Parameters
    ----------
    max_depth : int or None
        Maximum tree depth; ``None`` (default) grows until nodes are pure
        or no split yields a positive gain.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        """Build the tree from samples ``X`` (n_samples, n_features) and labels ``y``."""
        X = np.asarray(X)
        y = np.asarray(y)
        self.n_features_ = X.shape[1]
        # Total number of classes, fixed once here so recursion subsets that
        # are missing some classes are still counted correctly.
        self.n_classes_ = int(y.max()) + 1
        self.tree_ = self._grow_tree(X, y)
        return self

    def predict(self, X):
        """Return a list with the predicted class label for each row of ``X``."""
        return [self._predict(inputs) for inputs in np.asarray(X)]

    def _predict(self, inputs):
        # Walk from the root, following the threshold test at each split node.
        node = self.tree_
        while not node.is_leaf_node():
            if inputs[node.feature_] <= node.threshold_:
                node = node.left_
            else:
                node = node.right_
        return node.value_

    def _grow_tree(self, X, y, depth=0):
        """Recursively grow the subtree for (X, y); returns its root Node."""
        # Majority class at this node. bincount with a fixed minlength is
        # correct even when this subset lacks low-numbered classes, unlike
        # the original count over range(len(set(y))).
        counts = np.bincount(y, minlength=self.n_classes_)
        leaf = Node(predicted_class=int(np.argmax(counts)))
        # Stop when the node is pure (only one class present)...
        if np.count_nonzero(counts) <= 1:
            return leaf
        # ...or the depth limit is reached. max_depth=None means unlimited
        # (the original `depth < self.max_depth` raised TypeError on None).
        if self.max_depth is not None and depth >= self.max_depth:
            return leaf
        feature, threshold = self._best_split(X, y)
        if feature is None:  # no split with positive gain -> leaf
            return leaf
        indices_left = X[:, feature] <= threshold
        node = Node(feature=feature, threshold=threshold)
        node.left_ = self._grow_tree(X[indices_left], y[indices_left], depth + 1)
        node.right_ = self._grow_tree(X[~indices_left], y[~indices_left], depth + 1)
        return node

    def _best_split(self, X, y):
        """Return the (feature, threshold) pair with the highest positive
        information gain, or (None, None) if no split improves the parent.
        """
        # Require strictly positive gain; initializing both return values
        # also fixes the original NameError on an unassigned split_threshold.
        best_gain = 0.0
        split_feature, split_threshold = None, None
        entropy_parent = self._entropy(y)
        for feature in range(X.shape[1]):
            # Drop the largest unique value: splitting there would send
            # every sample left, a degenerate split with an empty child.
            for threshold in np.unique(X[:, feature])[:-1]:
                gain = self._information_gain(X, y, feature, threshold, entropy_parent)
                if gain > best_gain:
                    best_gain = gain
                    split_feature, split_threshold = feature, threshold
        return split_feature, split_threshold

    def _information_gain(self, X, y, split_feature, split_threshold, entropy_parent):
        """Entropy reduction obtained by splitting at (feature, threshold)."""
        indices_left = X[:, split_feature] <= split_threshold
        y_left, y_right = y[indices_left], y[~indices_left]
        if len(y_left) == 0 or len(y_right) == 0:
            return 0.0  # degenerate split carries no information
        n_total = len(y)
        weighted_child_entropy = (
            len(y_left) * self._entropy(y_left)
            + len(y_right) * self._entropy(y_right)
        ) / n_total
        return entropy_parent - weighted_child_entropy

    def _entropy(self, y):
        """Shannon entropy (base 2) of the class distribution in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / counts.sum()
        return float(-np.sum(probabilities * np.log2(probabilities)))
class Node:
    """One node of the decision tree.

    A node is either an internal split (``feature_``/``threshold_`` set,
    ``predicted_class_`` is None) or a leaf (``predicted_class_`` set).
    """

    def __init__(self, feature=None, threshold=None, predicted_class=None):
        self.feature_ = feature          # index of the feature tested at this split
        self.threshold_ = threshold      # go left iff x[feature_] <= threshold_
        self.predicted_class_ = predicted_class  # majority class (leaf nodes only)
        self.left_ = None                # subtree for samples <= threshold_
        self.right_ = None               # subtree for samples > threshold_

    def is_leaf_node(self):
        """True iff this node carries a prediction rather than a split."""
        return self.predicted_class_ is not None

    @property
    def value_(self):
        """The class label predicted at this leaf."""
        return self.predicted_class_
```
以上代码中,首先定义了一个ID3DecisionTree类,初始化时可以传入最大深度。fit方法用于训练模型,传入训练数据集X和标签y。predict方法用于预测,传入测试数据集X,返回预测结果。_grow_tree方法用于生长决策树,传入当前节点的数据集X和标签y,以及当前树的深度depth。_predict方法用于对单个样本进行预测。_best_split方法用于找到最佳分裂特征和阈值。_information_gain方法用于计算信息增益。_entropy方法用于计算熵。Node类用于表示决策树的节点,其中包含属性feature_、threshold_、predicted_class_、left_和right_,分别表示特征、阈值、预测类别、左子树和右子树。