Please provide code for the C4.5 algorithm, train it on a dataset, and visualize the result
C4.5 is a classic decision tree algorithm. A full implementation is fairly involved, covering data preprocessing, feature selection, tree construction, and more. Below is a simple Python implementation, for reference only. Unlike ID3, C4.5 selects the split attribute by information gain ratio rather than raw information gain.
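For reference, the gain ratio is the information gain of an attribute $A$ normalized by its split information:

$$
\mathrm{GainRatio}(D, A) = \frac{\mathrm{Gain}(D, A)}{\mathrm{SplitInfo}_A(D)}, \qquad
\mathrm{SplitInfo}_A(D) = -\sum_{v} \frac{|D_v|}{|D|} \log_2 \frac{|D_v|}{|D|},
$$

where $\mathrm{Gain}(D, A) = H(D) - \sum_v \frac{|D_v|}{|D|} H(D_v)$ and $D_v$ is the subset of $D$ taking value $v$ on attribute $A$. This is exactly what `choose_best_feature` below computes.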
First, we import the necessary libraries:
```python
import numpy as np
import pandas as pd
import math
from collections import Counter
import matplotlib.pyplot as plt
```
Next, we define a `C4_5` class that implements each step of building the decision tree:
```python
class C4_5:
    def __init__(self, epsilon=0.1):
        self.epsilon = epsilon    # threshold for stopping splits (reserved for pre-pruning)
        self.tree = {}            # the learned decision tree
        self.feat_labels = []     # feature names, saved at fit time for prediction

    def calc_shannon_entropy(self, labels):
        # Shannon entropy: H = -sum(p_k * log2(p_k)) over the class proportions p_k
        n = len(labels)
        label_counts = Counter(labels)
        shannon_ent = 0.0
        for key in label_counts:
            prob = float(label_counts[key]) / n
            shannon_ent -= prob * math.log(prob, 2)
        return shannon_ent

    def split_dataset(self, dataset, axis, value):
        # Keep the rows whose feature `axis` equals `value`, dropping that column
        ret_dataset = []
        for feat_vec in dataset:
            if feat_vec[axis] == value:
                reduced_feat_vec = feat_vec[:axis]
                reduced_feat_vec.extend(feat_vec[axis + 1:])
                ret_dataset.append(reduced_feat_vec)
        return ret_dataset

    def choose_best_feature(self, dataset, labels):
        n_features = len(dataset[0]) - 1    # the last column is the class label
        base_entropy = self.calc_shannon_entropy(labels)
        best_info_gain_ratio = 0.0
        best_feature = -1
        for i in range(n_features):
            feat_list = [example[i] for example in dataset]
            unique_vals = set(feat_list)
            new_entropy = 0.0
            split_info = 0.0
            for value in unique_vals:
                sub_dataset = self.split_dataset(dataset, i, value)
                prob = len(sub_dataset) / float(len(dataset))
                new_entropy += prob * self.calc_shannon_entropy(
                    [example[-1] for example in sub_dataset])
                split_info -= prob * math.log(prob, 2)
            info_gain = base_entropy - new_entropy
            if split_info == 0:    # feature takes a single value; avoid division by zero
                continue
            info_gain_ratio = info_gain / split_info    # the C4.5 gain-ratio criterion
            if info_gain_ratio > best_info_gain_ratio:
                best_info_gain_ratio = info_gain_ratio
                best_feature = i
        return best_feature

    def majority_cnt(self, labels):
        # Return the most frequent class label
        label_counts = Counter(labels)
        sorted_label_counts = sorted(label_counts.items(), key=lambda x: x[1], reverse=True)
        return sorted_label_counts[0][0]

    def create_tree(self, dataset, labels, feat_labels):
        class_list = [example[-1] for example in dataset]
        if class_list.count(class_list[0]) == len(class_list):
            return class_list[0]    # all samples share one class: stop splitting
        if len(dataset[0]) == 1:    # all features used up: take the majority class
            return self.majority_cnt(class_list)
        best_feat = self.choose_best_feature(dataset, labels)
        best_feat_label = feat_labels[best_feat]
        my_tree = {best_feat_label: {}}
        del feat_labels[best_feat]
        feat_vals = [example[best_feat] for example in dataset]
        for value in set(feat_vals):
            sub_dataset = self.split_dataset(dataset, best_feat, value)
            my_tree[best_feat_label][value] = self.create_tree(
                sub_dataset,
                [example[-1] for example in sub_dataset],
                feat_labels[:])
        return my_tree

    def fit(self, X, y, feat_labels):
        # Append the labels as the last column and work on plain Python lists
        dataset = np.concatenate(
            (np.asarray(X), np.asarray(y).reshape(-1, 1)), axis=1).tolist()
        self.feat_labels = list(feat_labels)    # keep a copy; create_tree consumes its list
        self.tree = self.create_tree(dataset, list(y), list(feat_labels))

    def predict(self, x):
        def classify(input_tree, feat_labels, test_vec):
            first_str = list(input_tree.keys())[0]
            second_dict = input_tree[first_str]
            feat_index = feat_labels.index(first_str)
            for key in second_dict.keys():
                if test_vec[feat_index] == key:
                    if isinstance(second_dict[key], dict):
                        return classify(second_dict[key], feat_labels, test_vec)
                    return second_dict[key]
            return None    # feature value unseen during training
        return classify(self.tree, self.feat_labels, list(x))

    def get_num_leaves(self, my_tree):
        num_leaves = 0
        first_str = list(my_tree.keys())[0]
        second_dict = my_tree[first_str]
        for key in second_dict.keys():
            if isinstance(second_dict[key], dict):
                num_leaves += self.get_num_leaves(second_dict[key])
            else:
                num_leaves += 1
        return num_leaves

    def get_tree_depth(self, my_tree):
        max_depth = 0
        first_str = list(my_tree.keys())[0]
        second_dict = my_tree[first_str]
        for key in second_dict.keys():
            if isinstance(second_dict[key], dict):
                this_depth = 1 + self.get_tree_depth(second_dict[key])
            else:
                this_depth = 1
            if this_depth > max_depth:
                max_depth = this_depth
        return max_depth

    def _plot_node(self, node_txt, center_pt, parent_pt, node_type):
        # Draw one annotated node with an arrow back to its parent
        self._ax.annotate(node_txt, xy=parent_pt, xycoords='axes fraction',
                          xytext=center_pt, textcoords='axes fraction',
                          va='center', ha='center', bbox=node_type,
                          arrowprops=dict(arrowstyle='<-'))

    def _plot_mid_text(self, cntr_pt, parent_pt, txt):
        # Label the branch midway between parent and child
        x_mid = (parent_pt[0] - cntr_pt[0]) / 2.0 + cntr_pt[0]
        y_mid = (parent_pt[1] - cntr_pt[1]) / 2.0 + cntr_pt[1]
        self._ax.text(x_mid, y_mid, txt)

    def _plot_tree(self, my_tree, parent_pt, node_txt):
        num_leaves = self.get_num_leaves(my_tree)
        first_str = list(my_tree.keys())[0]
        cntr_pt = (self._x_off + (1.0 + float(num_leaves)) / 2.0 / self._total_w,
                   self._y_off)
        self._plot_mid_text(cntr_pt, parent_pt, node_txt)
        self._plot_node(first_str, cntr_pt, parent_pt, dict(boxstyle='sawtooth', fc='0.8'))
        second_dict = my_tree[first_str]
        self._y_off -= 1.0 / self._total_d
        for key in second_dict.keys():
            if isinstance(second_dict[key], dict):
                self._plot_tree(second_dict[key], cntr_pt, str(key))
            else:
                self._x_off += 1.0 / self._total_w
                self._plot_node(str(second_dict[key]), (self._x_off, self._y_off),
                                cntr_pt, dict(boxstyle='round4', fc='0.8'))
                self._plot_mid_text((self._x_off, self._y_off), cntr_pt, str(key))
        self._y_off += 1.0 / self._total_d

    def plot_tree(self):
        fig = plt.figure(1, facecolor='white')
        fig.clf()
        axprops = dict(xticks=[], yticks=[])
        self._ax = plt.subplot(111, frameon=False, **axprops)
        # Layout: width is driven by the leaf count, height by the tree depth
        self._total_w = float(self.get_num_leaves(self.tree))
        self._total_d = float(self.get_tree_depth(self.tree))
        self._x_off = -0.5 / self._total_w
        self._y_off = 1.0
        self._plot_tree(self.tree, (0.5, 1.0), '')
        plt.show()
```
In the code above, the `C4_5` class provides the following methods:
- `__init__`: initializes the split-stopping threshold and an empty tree.
- `calc_shannon_entropy`: computes the Shannon entropy of a list of class labels.
- `split_dataset`: extracts the subset of rows that take a given value on a given feature.
- `choose_best_feature`: selects the split feature with the highest information gain ratio (see the worked example after this list).
- `majority_cnt`: returns the most frequent class label.
- `create_tree`: recursively builds the decision tree.
- `fit`: trains the model.
- `predict`: classifies a single sample.
- `plot_tree`: visualizes the decision tree, via the `_plot_node`, `_plot_mid_text`, and `_plot_tree` helpers.
- `get_num_leaves`: counts the tree's leaf nodes.
- `get_tree_depth`: computes the tree's depth.

`fit` takes the training features, the labels, and the feature names, and stores the learned tree in `self.tree`; `predict` takes a single sample and returns its predicted class; `plot_tree` draws the tree with matplotlib.
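As a quick sanity check of the splitting criterion, here is a toy run (the `toy_data` values are made up for illustration) that exercises `calc_shannon_entropy` and `choose_best_feature` directly:
```python
# Hypothetical toy dataset: each row is [outlook, windy, class label].
toy_data = [
    ['sunny',    'yes', 'no'],
    ['sunny',    'no',  'no'],
    ['rainy',    'yes', 'no'],
    ['overcast', 'no',  'yes'],
    ['overcast', 'yes', 'yes'],
]
toy_labels = [row[-1] for row in toy_data]
clf = C4_5()
# 3/5 'no' and 2/5 'yes': H = -(0.6*log2(0.6) + 0.4*log2(0.4)) ≈ 0.971 bits
print(clf.calc_shannon_entropy(toy_labels))
# 'outlook' separates the classes perfectly, so it wins on gain ratio -> prints 0
print(clf.choose_best_feature(toy_data, toy_labels))
```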
Next, we train on the iris dataset and visualize the result:
```python
from sklearn.datasets import load_iris

iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target)

c45 = C4_5()
c45.fit(X, y, X.columns.tolist())  # feature names label the tree's internal nodes
c45.plot_tree()
```
Running the code above trains the tree and displays the visualization. Note that this implementation splits on exact feature values, treating every attribute as categorical; on the continuous iris measurements the resulting tree is therefore very wide. A full C4.5 would additionally handle continuous attributes with binary threshold splits.
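To classify a single sample, pass one row to `predict`. A minimal sketch (the tree matches feature values by exact equality, so this only resolves for values seen during training):
```python
# Classify the first training sample; prints 0.0 (setosa), since the labels
# were cast to float when stacked with the feature columns in fit().
sample = X.iloc[0]
print(c45.predict(sample))
```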