用python写含注释的c4.5决策树代码
时间: 2024-05-05 10:21:14 浏览: 16
这里提供一个简单的Python实现C4.5决策树的示例代码,包含注释,希望对您有帮助。代码如下:
```python
import numpy as np
import pandas as pd
import math
class C45DecisionTree:
    """A simple C4.5 decision tree for categorical features.

    Training data is a list of rows, each row a list of feature values
    (aligned with ``feature_names``). Prediction samples are dicts keyed
    by feature name, matching the nested-dict tree produced by training.
    """

    def __init__(self, epsilon=0.1):
        # Pre-pruning threshold: stop splitting when the best info-gain
        # ratio falls below this value and return the majority label.
        self.epsilon = epsilon
        self.tree = {}

    def calc_entropy(self, y):
        """Return the empirical entropy H(y) of a list of labels."""
        n = len(y)
        label_count = {}
        for label in y:
            if label not in label_count:
                label_count[label] = 0
            label_count[label] += 1
        entropy = 0.0
        for label in label_count:
            prob = float(label_count[label]) / n
            entropy -= prob * math.log2(prob)
        return entropy

    def calc_cond_entropy(self, x, y):
        """Return the conditional entropy H(y | x) for one feature column.

        ``x`` is the list of values this feature takes per sample, ``y``
        the corresponding labels.
        """
        n = len(y)
        feature_count = {}
        for i in range(n):
            feature = x[i]
            label = y[i]
            if feature not in feature_count:
                feature_count[feature] = {}
            if label not in feature_count[feature]:
                feature_count[feature][label] = 0
            feature_count[feature][label] += 1
        cond_entropy = 0.0
        for feature in feature_count:
            subset_size = sum(feature_count[feature].values())
            feature_prob = subset_size / n
            feature_cond_entropy = 0.0
            for label in feature_count[feature]:
                # BUG FIX: the within-subset probability is count / subset_size.
                # The original divided by feature_prob (a fraction of n),
                # producing values > 1 and negative "entropies".
                prob = float(feature_count[feature][label]) / subset_size
                feature_cond_entropy -= prob * math.log2(prob)
            cond_entropy += feature_prob * feature_cond_entropy
        return cond_entropy

    def calc_info_gain_ratio(self, x, y):
        """Return the C4.5 gain ratio: (H(y) - H(y|x)) / SplitInfo(x).

        Returns 0 when the feature has a single value (SplitInfo == 0).
        """
        entropy = self.calc_entropy(y)
        cond_entropy = self.calc_cond_entropy(x, y)
        info_gain = entropy - cond_entropy
        # Split information: entropy of the feature-value distribution,
        # used to penalize many-valued features.
        split_info = 0.0
        n = len(y)
        feature_count = {}
        for feature in x:
            if feature not in feature_count:
                feature_count[feature] = 0
            feature_count[feature] += 1
        for feature in feature_count:
            prob = float(feature_count[feature]) / n
            split_info -= prob * math.log2(prob)
        if split_info == 0:
            return 0
        return info_gain / split_info

    def choose_best_feature(self, X, y):
        """Return the column index with the highest gain ratio.

        Returns -1 when no feature yields a positive gain ratio; callers
        must treat -1 as "do not split".
        """
        m = len(X[0])
        best_feature = -1
        best_info_gain_ratio = 0.0
        for i in range(m):
            x_i = [X[j][i] for j in range(len(X))]
            info_gain_ratio = self.calc_info_gain_ratio(x_i, y)
            if info_gain_ratio > best_info_gain_ratio:
                best_info_gain_ratio = info_gain_ratio
                best_feature = i
        return best_feature

    def get_majority_label(self, y):
        """Return the most frequent label in ``y``."""
        label_count = {}
        for label in y:
            if label not in label_count:
                label_count[label] = 0
            label_count[label] += 1
        sorted_label_count = sorted(label_count.items(), key=lambda x: x[1], reverse=True)
        return sorted_label_count[0][0]

    def create_tree(self, X, y, feature_names):
        """Recursively build the tree as nested dicts.

        Leaves are labels; internal nodes are
        ``{feature_name: {feature_value: subtree, ...}}``.
        Does NOT mutate ``feature_names`` (the original deleted from the
        caller's list in place).
        """
        # All samples share one label: return it.
        if len(set(y)) == 1:
            return y[0]
        # No features left to split on: return the majority label.
        if len(X[0]) == 0:
            return self.get_majority_label(y)
        # All rows identical but labels differ: majority label is the best
        # we can do (the original arbitrarily returned y[0]).
        if len(set(tuple(row) for row in X)) == 1:
            return self.get_majority_label(y)
        best_feature_index = self.choose_best_feature(X, y)
        # BUG FIX: -1 means "no useful split"; the original used it as a
        # valid (negative) index and split on the last feature.
        if best_feature_index == -1:
            return self.get_majority_label(y)
        best_column = [row[best_feature_index] for row in X]
        # Pre-pruning: honor the epsilon threshold (previously unused).
        if self.calc_info_gain_ratio(best_column, y) < self.epsilon:
            return self.get_majority_label(y)
        best_feature_name = feature_names[best_feature_index]
        tree = {best_feature_name: {}}
        # Child nodes see the remaining features only.
        sub_feature_names = (feature_names[:best_feature_index]
                             + feature_names[best_feature_index + 1:])
        for feature_value in set(best_column):
            sub_X = []
            sub_y = []
            for row, label in zip(X, y):
                if row[best_feature_index] == feature_value:
                    sub_X.append(row[:best_feature_index] + row[best_feature_index + 1:])
                    sub_y.append(label)
            tree[best_feature_name][feature_value] = self.create_tree(
                sub_X, sub_y, sub_feature_names)
        return tree

    def fit(self, X, y, feature_names):
        """Train the tree. ``X`` is a list of rows (lists of values)."""
        self.tree = self.create_tree(X, y, feature_names)

    def predict_single(self, X_sample):
        """Predict one sample given as ``{feature_name: value}``.

        Raises KeyError for a feature value unseen during training.
        """
        tree = self.tree
        while type(tree) == dict:
            feature_name = list(tree.keys())[0]
            feature_value = X_sample[feature_name]
            tree = tree[feature_name][feature_value]
        return tree

    def predict(self, X):
        """Predict a list of dict samples; returns a list of labels."""
        y_pred = []
        for sample in X:
            y_pred.append(self.predict_single(sample))
        return y_pred
```
使用示例:
```python
# 读取数据
# Load the dataset: every column except the last is a feature,
# the last column is the class label.
data = pd.read_csv('data.csv')
# BUG FIX: fit()/create_tree() index rows by integer position (X[j][i]),
# so rows must be lists, not dicts. to_dict(orient='records') produced
# dicts keyed by column name and training raised KeyError.
X = data.iloc[:, :-1].values.tolist()
y = data.iloc[:, -1].tolist()
feature_names = list(data.columns[:-1])
# Build the decision tree.
tree = C45DecisionTree(epsilon=0.1)
tree.fit(X, y, feature_names)
# Predict a single sample: prediction samples ARE dicts keyed by
# feature name, matching the nested-dict tree.
X_sample = {'age': 'middle_aged', 'income': 'high', 'student': 'no', 'credit_rating': 'fair'}
y_pred = tree.predict_single(X_sample)
print(y_pred)
# Predict several samples at once.
X_test = [{'age': 'youth', 'income': 'low', 'student': 'no', 'credit_rating': 'fair'},
          {'age': 'middle_aged', 'income': 'high', 'student': 'no', 'credit_rating': 'fair'},
          {'age': 'senior', 'income': 'low', 'student': 'no', 'credit_rating': 'fair'},
          {'age': 'youth', 'income': 'low', 'student': 'yes', 'credit_rating': 'fair'},
          {'age': 'youth', 'income': 'low', 'student': 'yes', 'credit_rating': 'excellent'}]
y_pred = tree.predict(X_test)
print(y_pred)
```