C5.0 python代码实现
时间: 2023-10-13 10:24:26 浏览: 333
以下是一个简化版 C5.0 风格决策树的 Python 代码实现(注意:它使用普通信息增益而非 C5.0 的增益率,也不包含剪枝与 boosting):
```python
import pandas as pd
import numpy as np
import math
class C50Tree:
    """Binary decision-tree classifier in the spirit of C5.0.

    Numeric features are split at thresholds chosen to maximize
    information gain; at each node only a random subset of
    ``max_features`` features is searched (random-forest style feature
    subsampling).

    NOTE: this is a simplified approximation of C5.0 — it uses plain
    information gain (not gain ratio) and performs no pruning or boosting.
    """

    def __init__(self, max_depth=10, min_samples_split=2, min_samples_leaf=1,
                 max_features=None, sample_size=None, random_state=None):
        """Configure the tree.

        Parameters
        ----------
        max_depth : int
            Maximum depth of the tree.
        min_samples_split : int
            Minimum number of samples required to attempt a split.
        min_samples_leaf : int
            Minimum number of samples required in each child.
        max_features : int or None
            Number of features tried per split; defaults to
            sqrt(n_features) when None.
        sample_size : accepted for backward compatibility; currently unused.
        random_state : int or None
            Seed for the per-node feature subsampling.
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.sample_size = sample_size
        self.random_state = random_state
        self.tree = None
        # Seed ONCE here.  The original reseeded np.random inside every
        # _choose_split call, which made every node draw the exact same
        # feature subset.
        self._rng = np.random.default_rng(random_state)

    def fit(self, X, y):
        """Build the tree from X (n_samples, n_features) and labels y (n_samples,)."""
        X = np.asarray(X)
        y = np.asarray(y)
        self.tree = self._build_tree(X, y, depth=0)

    def predict(self, X):
        """Return the predicted class label for every row of X."""
        return np.array([self._predict_one(x, self.tree) for x in np.asarray(X)])

    def _build_tree(self, X, y, depth):
        """Recursively build a subtree; returns an internal-node dict or a leaf dict."""
        # Stop when the depth budget is spent or the node is too small to split.
        if depth == self.max_depth or len(X) < self.min_samples_split:
            return self._make_leaf(y)
        best_feature, best_threshold = self._choose_split(X, y)
        if best_feature is None:  # no split produced positive information gain
            return self._make_leaf(y)
        left_idxs = X[:, best_feature] < best_threshold
        right_idxs = ~left_idxs
        # Respect the minimum leaf size; otherwise fall back to a leaf.
        if left_idxs.sum() < self.min_samples_leaf or right_idxs.sum() < self.min_samples_leaf:
            return self._make_leaf(y)
        return {'feature': best_feature,
                'threshold': best_threshold,
                'left': self._build_tree(X[left_idxs], y[left_idxs], depth + 1),
                'right': self._build_tree(X[right_idxs], y[right_idxs], depth + 1)}

    def _choose_split(self, X, y):
        """Return (feature_idx, threshold) maximizing information gain over a
        random feature subset, or (None, None) if no split has positive gain."""
        num_features = X.shape[1]
        # Work on a LOCAL copy: the original assigned back to
        # self.max_features / self.sample_size here, silently overwriting
        # the user's constructor arguments on the first fit().
        max_features = self.max_features
        if max_features is None:
            max_features = int(math.sqrt(num_features))
        # Clamp so np.random.choice(..., replace=False) cannot fail.
        max_features = max(1, min(max_features, num_features))
        feature_idxs = self._rng.choice(num_features, size=max_features, replace=False)
        # Start at 0.0, not -1: a zero-gain "split" (e.g. at the column
        # minimum, which leaves the left child empty) must not be chosen.
        best_feature, best_threshold, best_gain = None, None, 0.0
        for feature_idx in feature_idxs:
            col = X[:, feature_idx]
            for threshold in np.unique(col):
                left = col < threshold
                if not left.any():  # empty left child => gain is 0; skip
                    continue
                gain = self._information_gain(y, y[left], y[~left])
                if gain > best_gain:
                    best_feature, best_threshold, best_gain = feature_idx, threshold, gain
        return best_feature, best_threshold

    def _information_gain(self, y, y_left, y_right):
        """Entropy reduction achieved by splitting y into y_left / y_right."""
        n = len(y)
        return (self._entropy(y)
                - (len(y_left) / n) * self._entropy(y_left)
                - (len(y_right) / n) * self._entropy(y_right))

    def _entropy(self, y):
        """Shannon entropy (in bits) of the label vector y; 0.0 for empty input."""
        if len(y) == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        p = counts / len(y)
        return float(-np.sum(p * np.log2(p)))

    def _make_leaf(self, y):
        """Build a leaf node holding the majority class and per-class counts."""
        classes, counts = np.unique(y, return_counts=True)
        return {'leaf': True,
                'class_counts': dict(zip(classes, counts)),
                'class': classes[np.argmax(counts)]}

    def _predict_one(self, x, tree):
        """Walk the tree for a single sample x and return its predicted class."""
        # Internal nodes carry no 'leaf' key, so use .get(): the original
        # did tree['leaf'] and raised KeyError at the first internal node.
        while not tree.get('leaf'):
            branch = 'left' if x[tree['feature']] < tree['threshold'] else 'right'
            tree = tree[branch]
        return tree['class']
```
其中的`_build_tree`方法是核心方法,它递归地构建决策树。每次递归时,它首先判断是否达到最大深度或样本数不足最小分裂数,若是则返回一个叶节点,叶节点的类别取样本中出现次数最多的类别;否则,它通过`_choose_split`选出最佳分裂特征和阈值,据此把数据集划分为左右两部分,再递归构建左、右子树并组装成内部节点返回。`_choose_split`方法先随机抽取一个特征子集,再在其中搜索信息增益最大的特征与阈值。`_information_gain`方法计算信息增益,`_entropy`方法计算熵,`_make_leaf`方法创建叶节点,`_predict_one`方法沿树向下对单个样本进行预测。
阅读全文