基于C4.5算法的Python实现示例
时间: 2023-10-12 16:09:53 浏览: 155
好的,以下是基于C4.5算法使用Python实现的示例代码:
```python
import numpy as np
import pandas as pd
class C45:
    """Decision-tree learner using the C4.5 gain-ratio criterion.

    Works on categorical data given as a sequence of rows, where every row
    holds the feature values followed by the class label in the last column.
    The resulting tree is a nested ``dict`` of the form
    ``{feature_name: {feature_value: subtree_or_class_label}}``.
    """

    def __init__(self, epsilon=0.1):
        """Create a learner.

        :param epsilon: minimum gain ratio a feature must reach for a node
            to be split; below it the node becomes a majority-class leaf.
        """
        self.epsilon = epsilon

    @staticmethod
    def _entropy(counts, total):
        """Return the Shannon entropy (in bits) of a distribution of counts."""
        entropy = 0.0
        for count in counts:
            prob = float(count) / total
            entropy -= prob * np.log2(prob)
        return entropy

    def calc_shannon_ent(self, data):
        """Compute the Shannon entropy of the class labels of a data set.

        :param data: rows whose last element is the class label
        :return: entropy in bits (``0.0`` for an empty data set)
        """
        label_counts = {}
        for row in data:
            label = row[-1]
            label_counts[label] = label_counts.get(label, 0) + 1
        return self._entropy(label_counts.values(), len(data))

    def split_data(self, data, axis, value):
        """Select the rows matching a feature value and drop that feature.

        :param data: rows to filter
        :param axis: index of the feature column to test and remove
        :param value: feature value a row must have to be kept
        :return: new list of rows (as lists) without column ``axis``
        """
        # Build fresh lists so the input rows are never mutated and tuple
        # rows are accepted as well as list rows.
        return [
            list(row[:axis]) + list(row[axis + 1:])
            for row in data
            if row[axis] == value
        ]

    def _best_split(self, data):
        """Return ``(feature_index, gain_ratio)`` of the best split.

        The index is ``-1`` (with ratio ``0.0``) when no feature yields a
        positive gain ratio.
        """
        total = len(data)
        base_entropy = self.calc_shannon_ent(data)
        best_feature, best_ratio = -1, 0.0
        for i in range(len(data[0]) - 1):
            # One pass over the rows: feature value -> {class label: count}.
            groups = {}
            for row in data:
                counts = groups.setdefault(row[i], {})
                counts[row[-1]] = counts.get(row[-1], 0) + 1
            sizes = [sum(counts.values()) for counts in groups.values()]
            cond_entropy = sum(
                float(size) / total * self._entropy(counts.values(), size)
                for counts, size in zip(groups.values(), sizes)
            )
            # Split information penalises features with many distinct
            # values; it is zero when the feature is constant.
            split_info = self._entropy(sizes, total)
            if split_info > 0:
                ratio = (base_entropy - cond_entropy) / split_info
            else:
                ratio = 0.0
            if ratio > best_ratio:
                best_feature, best_ratio = i, ratio
        return best_feature, best_ratio

    def choose_best_feature(self, data):
        """Pick the feature with the highest information gain ratio.

        :param data: non-empty data set (last column is the class label)
        :return: index of the best feature, or ``-1`` if no feature has a
            positive gain ratio
        """
        return self._best_split(data)[0]

    def majority_cnt(self, label_list):
        """Return the most frequent class label.

        Ties are resolved in favour of the label that appears first.

        :param label_list: class labels
        :return: the label with the highest count
        """
        label_counts = {}
        for vote in label_list:
            label_counts[vote] = label_counts.get(vote, 0) + 1
        return max(label_counts, key=label_counts.get)

    def create_tree(self, data, labels):
        """Recursively build the decision tree.

        :param data: non-empty data set (last column is the class label)
        :param labels: feature names, one per feature column; not modified
        :return: a class label (leaf) or a nested ``dict`` (inner node)
        :raises ValueError: if ``data`` is empty
        """
        if len(data) == 0:
            raise ValueError("data must contain at least one row")
        class_list = [row[-1] for row in data]
        # All samples share one class: nothing left to split.
        if class_list.count(class_list[0]) == len(class_list):
            return class_list[0]
        # No features left: fall back to the majority class.
        if len(data[0]) == 1:
            return self.majority_cnt(class_list)
        best_feat, best_ratio = self._best_split(data)
        # No informative feature, or gain ratio below the threshold.
        if best_feat == -1 or best_ratio < self.epsilon:
            return self.majority_cnt(class_list)
        best_feat_label = labels[best_feat]
        # Copy instead of deleting in place so the caller's list survives.
        sub_labels = list(labels[:best_feat]) + list(labels[best_feat + 1:])
        branches = {}
        # dict.fromkeys de-duplicates while keeping first-seen order.
        for value in dict.fromkeys(row[best_feat] for row in data):
            branches[value] = self.create_tree(
                self.split_data(data, best_feat, value), sub_labels
            )
        return {best_feat_label: branches}
```
以上是一个基于C4.5算法的决策树实现示例。调用该类的`create_tree(data, labels)`方法即可创建决策树，其中`data`的每一行依次为各特征的取值，最后一列为类别标签，`labels`为各特征的名称列表。
阅读全文