python实现C4.5算法的代码
时间: 2023-10-12 10:15:10 浏览: 97
以下是一个使用Python实现C4.5算法的简单示例代码:
```python
import math
from collections import Counter
class Node:
    """A node of the decision tree.

    A leaf carries ``results`` (the predicted label); an internal node
    carries ``feature`` and ``value`` (the split test) plus two children.
    """

    def __init__(self, feature=None, value=None, results=None, left=None, right=None):
        self.feature = feature  # index/key of the splitting feature
        self.value = value      # split value: threshold (continuous) or category (discrete)
        self.results = results  # predicted label if this node is a leaf, else None
        self.left = left        # subtree for samples that pass the split test
        self.right = right      # subtree for the remaining samples


class DecisionTree:
    """A simple C4.5-style binary decision tree.

    Features whose values are ``float`` are treated as continuous and split
    by a threshold (``<=`` goes left); all other feature values are treated
    as discrete and split one-vs-rest by equality (``==`` goes left).
    Splitting stops when the best gain ratio falls below ``epsilon``.
    """

    def __init__(self, epsilon=0.1):
        self.epsilon = epsilon  # minimum gain ratio required to keep splitting
        self.tree = None        # root Node, populated by fit()

    def fit(self, X, y, features):
        """Build the tree from rows ``X`` (indexable by feature), labels ``y``
        and a list of feature indices/keys."""
        self.tree = self.build_tree(X, y, features)

    def build_tree(self, X, y, features):
        """Recursively build a (sub)tree and return its root Node."""
        # Empty sample set: return an empty node.
        if not X:
            return Node()
        # All samples share one label: pure leaf.
        if len(set(y)) == 1:
            return Node(results=y[0])
        # No candidate features left: majority-vote leaf.
        if not features:
            return Node(results=Counter(y).most_common(1)[0][0])
        best_feature, best_value, best_gain_ratio = self.choose_best_feature(X, y, features)
        # No usable split, or gain ratio below the threshold: majority leaf.
        if best_feature is None or best_gain_ratio < self.epsilon:
            return Node(results=Counter(y).most_common(1)[0][0])
        # Record the split value on the node so predict_one() can test against
        # it (the original code never set it, crashing prediction).
        tree = Node(feature=best_feature, value=best_value)
        left_X, left_y, right_X, right_y = self.split_data_set(X, y, best_feature, best_value)
        remaining = [f for f in features if f != best_feature]
        tree.left = self.build_tree(left_X, left_y, remaining)
        tree.right = self.build_tree(right_X, right_y, remaining)
        return tree

    def choose_best_feature(self, X, y, features):
        """Return ``(feature, split_value, gain_ratio)`` of the best binary
        split, or ``(None, None, 0.0)`` when no split improves entropy."""
        base_entropy = self.calc_entropy(y)
        best_feature, best_value, best_gain_ratio = None, None, 0.0
        for feature in features:
            if isinstance(X[0][feature], float):
                # Continuous feature: candidate thresholds are the midpoints
                # between consecutive distinct values.
                values = sorted({x[feature] for x in X})
                split_points = [(a + b) / 2 for a, b in zip(values, values[1:])]
                gain_ratio, value = self.calc_continuous_attribute_gain_ratio(
                    X, y, feature, split_points, base_entropy)
            else:
                gain_ratio, value = self.calc_discrete_attribute_gain_ratio(
                    X, y, feature, base_entropy)
            if gain_ratio > best_gain_ratio:
                best_feature, best_value, best_gain_ratio = feature, value, gain_ratio
        return best_feature, best_value, best_gain_ratio

    def calc_discrete_attribute_gain_ratio(self, X, y, feature, base_entropy):
        """Best one-vs-rest split on a discrete feature.

        Returns ``(gain_ratio, value)``: samples equal to ``value`` go left,
        the rest go right. ``(0.0, None)`` when the feature has one value.
        """
        best_gain_ratio, best_value = 0.0, None
        for value in {x[feature] for x in X}:
            left_y = [yi for xi, yi in zip(X, y) if xi[feature] == value]
            right_y = [yi for xi, yi in zip(X, y) if xi[feature] != value]
            # A one-sided partition carries no information; skip it.
            if not left_y or not right_y:
                continue
            gain_ratio = self._binary_gain_ratio(y, left_y, right_y, base_entropy)
            if gain_ratio > best_gain_ratio:
                best_gain_ratio, best_value = gain_ratio, value
        return best_gain_ratio, best_value

    def calc_continuous_attribute_gain_ratio(self, X, y, feature, split_points, base_entropy):
        """Best threshold split on a continuous feature.

        Returns ``(gain_ratio, split_point)``; samples ``<= split_point`` go
        left. ``(0.0, None)`` when no candidate point separates the samples.
        """
        best_gain_ratio, best_split_point = 0.0, None
        for split_point in split_points:
            left_y = [yi for xi, yi in zip(X, y) if xi[feature] <= split_point]
            right_y = [yi for xi, yi in zip(X, y) if xi[feature] > split_point]
            if not left_y or not right_y:
                continue
            gain_ratio = self._binary_gain_ratio(y, left_y, right_y, base_entropy)
            if gain_ratio > best_gain_ratio:
                best_gain_ratio, best_split_point = gain_ratio, split_point
        return best_gain_ratio, best_split_point

    def _binary_gain_ratio(self, y, left_y, right_y, base_entropy):
        """C4.5 gain ratio of a binary partition of ``y`` into left/right."""
        n = float(len(y))
        entropy = (len(left_y) / n * self.calc_entropy(left_y)
                   + len(right_y) / n * self.calc_entropy(right_y))
        gain = base_entropy - entropy
        # Intrinsic value (split information) of the partition itself.
        iv = sum(-len(part) / n * math.log(len(part) / n, 2)
                 for part in (left_y, right_y))
        return gain / iv if iv != 0 else 0.0

    def split_data_set(self, X, y, feature, value=None):
        """Binary-partition ``(X, y)`` on ``feature``.

        Continuous (float) feature: ``<= value`` goes left, ``>`` goes right.
        Discrete feature: ``== value`` goes left, the rest go right — the same
        orientation predict_one() uses (the original sent equal values right
        while prediction sent them left).
        """
        left_X, left_y, right_X, right_y = [], [], [], []
        continuous = bool(X) and isinstance(X[0][feature], float)
        for xi, yi in zip(X, y):
            goes_left = (xi[feature] <= value) if continuous else (xi[feature] == value)
            if goes_left:
                left_X.append(xi)
                left_y.append(yi)
            else:
                right_X.append(xi)
                right_y.append(yi)
        return left_X, left_y, right_X, right_y

    def predict(self, X):
        """Predict a label for every row of ``X``."""
        return [self.predict_one(xi) for xi in X]

    def predict_one(self, xi):
        """Walk from the root to a leaf for a single sample and return its label."""
        node = self.tree
        while node.results is None:
            if isinstance(xi[node.feature], float):
                node = node.left if xi[node.feature] <= node.value else node.right
            else:
                node = node.left if xi[node.feature] == node.value else node.right
        return node.results

    def calc_entropy(self, y):
        """Shannon entropy (in bits) of the label list ``y``."""
        entropy = 0.0
        total = float(len(y))
        for cnt in Counter(y).values():
            p = cnt / total
            entropy -= p * math.log(p, 2)
        return entropy
```
请注意,这只是一个简单的示例代码,实际应用中可能需要进行更多的优化和改进。
阅读全文