实现C4.5算法,验证算法的正确性,并将算法应用于C:/Users/Administrator/Desktop/Bank-data
时间: 2023-12-31 16:04:27 浏览: 28
C4.5算法是一种常用的决策树算法,用于分类和预测。具体实现步骤如下:
1. 数据预处理:将数据集按照一定比例分为训练集和测试集,并将数据离散化。
2. 特征选择:计算每个特征的信息增益或信息增益比,选择信息增益或信息增益比最大的特征作为划分属性。
3. 决策树生成:以选定的划分属性作为根节点,将数据集分为多个子集,并递归生成子树。
4. 决策树剪枝:通过预测错误率进行剪枝,得到最终的决策树。
下面是Python实现C4.5算法的代码:
```python
from collections import Counter, defaultdict
from math import log

import numpy as np
import pandas as pd
# Decision-tree node: internal nodes carry a splitting feature,
# leaves carry a class label.
class Node:
    def __init__(self, feature, label):
        """Create a tree node.

        feature -- index of the splitting attribute (None for a leaf)
        label   -- class label (None for an internal node)
        """
        self.feature = feature
        self.label = label
        # Maps an attribute value to the corresponding child Node.
        self.child = {}
# Compute the Shannon entropy of a data set's class labels.
def cal_entropy(data):
    """Return the base-2 Shannon entropy of the labels in *data*.

    data -- list of examples; each example's last element is its label.
    Returns 0.0 for an empty data set (the original divided by zero).
    """
    n = len(data)
    if n == 0:  # guard: empty subsets can reach here during recursion
        return 0.0
    label_count = Counter(example[-1] for example in data)
    entropy = 0.0
    for count in label_count.values():
        p = count / n
        entropy -= p * log(p, 2)
    return entropy
# Compute the information gain of splitting on one attribute (ID3 criterion).
def cal_info_gain(data, feature):
    """Return the information gain of splitting *data* on column *feature*.

    data    -- list of examples, label in the last column
    feature -- column index of the candidate split attribute
    Returns 0.0 for an empty data set.

    The original rescanned the whole data set once per distinct feature
    value (O(n*k)); grouping in a single pass makes it O(n).
    """
    n = len(data)
    if n == 0:
        return 0.0
    groups = defaultdict(list)
    for example in data:
        groups[example[feature]].append(example)
    feature_entropy = 0.0
    for sub_data in groups.values():
        p = len(sub_data) / n
        feature_entropy += p * cal_entropy(sub_data)
    return cal_entropy(data) - feature_entropy
# Compute the information gain ratio (C4.5 criterion) of one attribute.
def cal_info_gain_ratio(data, feature):
    """Return the information gain ratio of splitting *data* on *feature*.

    data    -- list of examples, label in the last column
    feature -- column index of the candidate split attribute
    Returns 0 when the split information is zero (all examples share one
    feature value), avoiding division by zero, and 0 for an empty data set.

    The original rescanned the whole data set once per distinct feature
    value (O(n*k)); grouping in a single pass makes it O(n).
    """
    n = len(data)
    if n == 0:
        return 0
    groups = defaultdict(list)
    for example in data:
        groups[example[feature]].append(example)
    feature_entropy = 0.0
    split_info = 0.0
    for sub_data in groups.values():
        p = len(sub_data) / n
        feature_entropy += p * cal_entropy(sub_data)
        split_info -= p * log(p, 2)
    if split_info == 0:
        return 0
    return (cal_entropy(data) - feature_entropy) / split_info
# Pick the attribute with the highest gain (ID3) or gain ratio (C4.5).
def choose_best_feature(data, feature_list, algorithm='ID3'):
    """Return the feature index with the best score, or -1 when no
    feature yields a strictly positive score.

    data         -- list of examples, label in the last column
    feature_list -- candidate feature indices
    algorithm    -- 'ID3' (information gain) or 'C4.5' (gain ratio)
    Raises ValueError for an unknown *algorithm*; the original left
    ``info_gain`` unbound and crashed with a NameError instead.
    """
    if algorithm == 'ID3':
        score = cal_info_gain
    elif algorithm == 'C4.5':
        score = cal_info_gain_ratio
    else:
        raise ValueError('unknown algorithm: {!r}'.format(algorithm))
    best_feature = -1
    best_gain = 0.0
    for feature in feature_list:
        gain = score(data, feature)
        if gain > best_gain:  # strict '>' keeps the earliest feature on ties
            best_gain = gain
            best_feature = feature
    return best_feature
# Select the rows of the data set matching one value of one attribute.
def split_data(data, feature, value):
    """Return the examples in *data* whose column *feature* equals *value*.

    Bug fixes vs. the original:
    * it referenced an undefined name ``key`` and took only two
      parameters, while create_tree calls it with three — *value* is now
      an explicit parameter;
    * it deleted the matched column from each row, which shifted the
      indices of later features out of sync with ``feature_list`` (which
      keeps original column indices).  Rows are now kept whole; re-use of
      a feature is already prevented by removing it from the candidate
      list in create_tree.
    """
    return [example for example in data if example[feature] == value]
# Most frequent label in a label list (earliest-seen label wins ties,
# matching the original strict '>' scan over insertion-ordered counts).
def _majority_label(label_list):
    """Return the most common label in *label_list*."""
    return Counter(label_list).most_common(1)[0][0]

# Recursively build a decision tree.
def create_tree(data, feature_list, algorithm='ID3'):
    """Build and return the root Node of a decision tree over *data*.

    data         -- non-empty list of examples, label in the last column
    feature_list -- indices of attributes still available for splitting;
                    unlike the original, the caller's list is NOT mutated
    algorithm    -- 'ID3' (information gain) or 'C4.5' (gain ratio)

    The original duplicated the majority-vote code in two branches and
    removed the chosen feature from the caller's list as a side effect;
    both are fixed here.
    """
    label_list = [example[-1] for example in data]
    # All examples share one class: return a leaf.
    if label_list.count(label_list[0]) == len(label_list):
        return Node(None, label_list[0])
    # No attributes left: majority-vote leaf.
    if len(feature_list) == 0:
        return Node(None, _majority_label(label_list))
    best_feature = choose_best_feature(data, feature_list, algorithm)
    # No attribute gives a positive score: majority-vote leaf.
    if best_feature == -1:
        return Node(None, _majority_label(label_list))
    # Children may not re-use the chosen attribute; build a fresh list so
    # the caller's feature_list stays untouched.
    remaining = [f for f in feature_list if f != best_feature]
    node = Node(best_feature, None)
    for value in set(example[best_feature] for example in data):
        sub_data = split_data(data, best_feature, value)
        node.child[value] = create_tree(sub_data, remaining, algorithm)
    return node
# Predict the class of one example by walking the tree from the root.
def classify(tree, feature_labels, test_data):
    """Return the predicted label for *test_data*, or None when the tree
    has no branch for one of the example's attribute values.

    tree           -- root Node produced by create_tree
    feature_labels -- kept for interface compatibility; unused, because
                      Node.feature already stores a column index
    test_data      -- one example (list of attribute values)

    Bug fixes vs. the original:
    * ``feature_labels.index(tree.feature)`` raised ValueError, since
      tree.feature is an int index, not a column name — the index is now
      used directly;
    * children are Node objects, so the walk follows ``node.child``
      instead of calling ``.keys()`` on a Node (AttributeError);
    * an unseen attribute value returns None instead of raising
      UnboundLocalError.
    """
    node = tree
    while node.feature is not None:   # internal node: descend one level
        value = test_data[node.feature]
        if value not in node.child:
            return None               # value never seen during training
        node = node.child[value]
    return node.label                 # leaf reached
# Fraction of test examples the tree misclassifies.
def cal_error_rate(tree, feature_labels, test_data_set):
    """Return the error rate of *tree* on *test_data_set*.

    tree           -- root Node produced by create_tree
    feature_labels -- passed through to classify (unused there)
    test_data_set  -- list of examples, true label in the last column
    Returns 0.0 for an empty test set (the original divided by zero).
    """
    if not test_data_set:
        return 0.0
    error_count = sum(
        1 for example in test_data_set
        if classify(tree, feature_labels, example) != example[-1]
    )
    return error_count / len(test_data_set)
# Entry point: train a C4.5 decision tree on the bank data set and
# report the test-set error rate.
if __name__ == '__main__':
    # Load the raw data.
    data = pd.read_csv('C:/Users/Administrator/Desktop/Bank-data.csv')
    # Preprocessing: drop rows with missing values, then shuffle.
    data = data.dropna()
    data = data.sample(frac=1).reset_index(drop=True)
    n = len(data)
    m = len(data.columns) - 1
    # 70/30 train/test split, converted to plain lists of rows.
    train_data = data[:int(0.7 * n)]
    test_data = data[int(0.7 * n):]
    train_data_set = train_data.values.tolist()
    test_data_set = test_data.values.tolist()
    # Every column except the last (label) is a candidate split attribute.
    feature_list = list(range(m))
    # Build the tree with the C4.5 criterion (gain ratio).
    algorithm = 'C4.5'
    tree = create_tree(train_data_set, feature_list, algorithm)
    # Show the root node object.
    print(tree)
    # Evaluate on the held-out test set.
    feature_labels = list(data.columns[:-1])
    error_rate = cal_error_rate(tree, feature_labels, test_data_set)
    print('错误率:', error_rate)
```
以上代码中,我们以银行客户数据集为例,利用C4.5算法构建决策树,并对模型进行测试并计算错误率。