c4.5Python实现
时间: 2023-11-19 09:57:08 浏览: 35
以下是C4.5算法的Python实现：
```python
import math

import numpy as np
import pandas as pd
class Node:
    """A single node of the decision tree.

    A leaf node carries a ``label``; an internal node carries the split
    ``feature`` and ``threshold`` plus ``left``/``right`` subtrees.
    """

    def __init__(self, feature=None, threshold=None, label=None, left=None, right=None):
        self.feature = feature      # feature (column name) this node splits on
        self.threshold = threshold  # split value (midpoint for continuous, category for discrete)
        self.label = label          # class label if this is a leaf, else None
        self.left = left            # subtree for rows that satisfy the split condition
        self.right = right          # subtree for the remaining rows
class C45DecisionTree:
    """C4.5 decision tree classifier.

    Continuous (numeric) features are split at the best midpoint between
    consecutive sorted values (``value <= threshold`` goes left); discrete
    (object-dtype) features are split by equality with the best single
    category (``value == threshold`` goes left).  Splitting stops when the
    best gain ratio falls below ``epsilon`` or a node becomes pure.
    """

    def __init__(self, epsilon=0.1):
        self.epsilon = epsilon     # minimum gain ratio required to keep splitting
        self.tree = None           # root Node, set by fit()
        self._categorical = set()  # object-dtype column names, recorded by fit()

    def fit(self, X, y):
        """Grow the tree from DataFrame ``X`` and label sequence ``y``.

        ``y`` may be a list, array, or Series; it is aligned to ``X.index``
        so boolean-mask indexing works during recursive splitting.
        """
        y = pd.Series(np.asarray(y), index=X.index)
        self._categorical = {c for c in X.columns if X[c].dtype == object}
        self.tree = self._build_tree(X, y)

    def predict(self, X):
        """Return a list with one predicted label per row of DataFrame ``X``."""
        # Iterate rows explicitly — iterating a DataFrame yields column names.
        return [self._predict_one(row) for _, row in X.iterrows()]

    def _build_tree(self, X, y):
        """Recursively build and return the subtree for samples (X, y)."""
        # Pure node -> leaf with that class.
        if y.nunique() == 1:
            return Node(label=y.iloc[0])
        # No features left -> majority leaf.
        if len(X.columns) == 0:
            return Node(label=self._majority_class(y))
        best_feature, best_split = self._choose_best_feature(X, y)
        # No split clears the epsilon gain-ratio bar -> majority leaf.
        if best_feature is None:
            return Node(label=self._majority_class(y))
        left_idx = self._split_mask(X[best_feature], best_feature, best_split)
        right_idx = ~left_idx
        # Degenerate split (all rows on one side) -> majority leaf.
        if not left_idx.any() or not right_idx.any():
            return Node(label=self._majority_class(y))
        return Node(
            feature=best_feature,
            threshold=best_split,
            left=self._build_tree(X[left_idx], y[left_idx]),
            right=self._build_tree(X[right_idx], y[right_idx]),
        )

    def _split_mask(self, col, feature, split_value):
        """Boolean mask of rows routed LEFT by this feature/value split."""
        if feature in self._categorical:
            return col == split_value
        return col <= split_value

    def _choose_best_feature(self, X, y):
        """Return the (feature, split_value) pair with the highest gain ratio.

        Returns (None, None) when no candidate split reaches ``epsilon``.
        """
        best_feature, best_split, best_ratio = None, None, -1.0
        base_entropy = self._entropy(y)
        for feature in X.columns:
            col = X[feature]
            if feature in self._categorical:
                # Candidate splits: equality with each observed category.
                candidates = [(v, col == v) for v in pd.unique(col)]
            else:
                # Candidate thresholds: midpoints between consecutive
                # distinct sorted feature values (classic C4.5 choice).
                vals = np.unique(col.to_numpy())
                candidates = [(t, col <= t) for t in (vals[:-1] + vals[1:]) / 2]
            for split_value, left_mask in candidates:
                ratio = self._gain_ratio(y, left_mask, base_entropy)
                if ratio > best_ratio:
                    best_feature, best_split, best_ratio = feature, split_value, ratio
        if best_ratio < self.epsilon:
            return None, None
        return best_feature, best_split

    def _gain_ratio(self, y, left_mask, base_entropy):
        """Gain ratio of the binary split ``left_mask``; -1.0 if degenerate."""
        n_left = int(left_mask.sum())
        n = len(y)
        if n_left == 0 or n_left == n:
            return -1.0
        p = n_left / n
        gain = (
            base_entropy
            - p * self._entropy(y[left_mask])
            - (1.0 - p) * self._entropy(y[~left_mask])
        )
        # Split information of a binary partition; nonzero since 0 < p < 1.
        split_info = -(p * math.log2(p) + (1.0 - p) * math.log2(1.0 - p))
        return gain / split_info

    def _entropy(self, y):
        """Shannon entropy (bits) of the label distribution in ``y``."""
        counts = pd.Series(np.asarray(y)).value_counts().to_numpy()
        probs = counts / counts.sum()
        return float(-(probs * np.log2(probs)).sum())

    def _majority_class(self, y):
        # Most frequent label; mode()[0] gives a deterministic tie-break.
        return pd.Series(np.asarray(y)).mode()[0]

    def _predict_one(self, x):
        """Route one row (a Series) down the tree and return the leaf label."""
        node = self.tree
        while node.label is None:
            value = x[node.feature]
            # Mirror the training-time split: equality for discrete
            # features, <= threshold for continuous ones.
            if node.feature in self._categorical:
                go_left = value == node.threshold
            else:
                go_left = value <= node.threshold
            node = node.left if go_left else node.right
        return node.label
```