c4.5算法代码python
时间: 2023-10-05 20:08:01 浏览: 187
C4.5算法是一种经典的决策树算法,下面是一个简单的用Python实现的C4.5算法代码示例:
```python
import numpy as np
import pandas as pd
from math import log2
class Node:
    """One node of the decision tree.

    Internal nodes carry the column index they split on; leaves carry a
    predicted class label. Children are keyed by attribute value.
    """

    def __init__(self, attribute=None, label=None):
        # Column index to split on (internal nodes) — None for leaves.
        self.attribute = attribute
        # Predicted class (leaves) — None for internal nodes.
        self.label = label
        # attribute value -> child Node
        self.children = {}
def entropy(data):
    """Return the Shannon entropy (base 2) of the class-label column.

    The class label is assumed to be the LAST column of *data*.
    """
    class_column = data[:, -1]
    _, frequencies = np.unique(class_column, return_counts=True)
    proportions = frequencies / class_column.size
    # log2 is safe here: np.unique only reports values that occur,
    # so every proportion is strictly positive.
    return -np.sum(proportions * np.log2(proportions))
def split_data(data, attribute_index):
    """Partition *data* by the distinct values of column *attribute_index*.

    Returns a dict mapping each distinct value to the sub-array of rows
    that hold that value in the given column. Every group is non-empty.
    """
    column = data[:, attribute_index]
    return {value: data[column == value] for value in np.unique(column)}
def choose_best_attribute(data, attributes):
    """Select the best attribute to split on using C4.5's gain ratio.

    The original version ranked attributes by plain information gain,
    which is ID3's criterion, not C4.5's. C4.5 normalizes the gain by
    the split information to penalize many-valued attributes.

    Parameters
    ----------
    data : ndarray whose last column holds the class labels.
    attributes : sequence/array of candidate column indices.

    Returns
    -------
    The column index (an element of *attributes*) with the highest
    gain ratio.
    """
    entropy_before_split = entropy(data)
    gain_ratios = []
    for attribute_index in attributes:
        splits = split_data(data, attribute_index)
        entropy_after_split = 0.0
        split_info = 0.0
        for split in splits.values():
            proportion = len(split) / len(data)
            entropy_after_split += proportion * entropy(split)
            split_info -= proportion * np.log2(proportion)
        information_gain = entropy_before_split - entropy_after_split
        # split_info == 0 means the attribute has a single distinct
        # value: it carries no information, so its ratio is defined as 0
        # (also avoids a division by zero).
        gain_ratios.append(information_gain / split_info if split_info > 0 else 0.0)
    return attributes[np.argmax(gain_ratios)]
def majority_label(labels):
    """Return the most frequent class label in *labels* (ties: smallest value)."""
    distinct, frequencies = np.unique(labels, return_counts=True)
    return distinct[np.argmax(frequencies)]
def create_decision_tree(data, attributes):
    """Recursively build a decision tree over *data*.

    Parameters
    ----------
    data : ndarray whose last column holds the class labels.
    attributes : array of candidate column indices still available.

    Returns
    -------
    The root Node of the (sub)tree.
    """
    labels = data[:, -1]
    # All samples share one class: emit a leaf.
    if len(np.unique(labels)) == 1:
        return Node(label=labels[0])
    # No attributes left to split on: emit a majority-vote leaf.
    if len(attributes) == 0:
        return Node(label=majority_label(labels))
    # BUG FIX: choose_best_attribute returns the COLUMN index itself
    # (it already does attributes[argmax]). The original code then
    # indexed *attributes* with it a second time and passed it to
    # np.delete as a *position* — both wrong whenever attributes is not
    # the identity [0, 1, 2, ...]. Use the column index directly and
    # locate its position in *attributes* only for the deletion.
    best_attribute = choose_best_attribute(data, attributes)
    decision_tree = Node(attribute=best_attribute)
    splits = split_data(data, best_attribute)
    position = int(np.where(np.asarray(attributes) == best_attribute)[0][0])
    new_attributes = np.delete(attributes, position)
    for value, split in splits.items():
        if len(split) == 0:
            # Defensive only: split_data yields non-empty groups, but keep
            # the majority fallback for safety.
            decision_tree.children[value] = Node(label=majority_label(labels))
        else:
            decision_tree.children[value] = create_decision_tree(split, new_attributes)
    return decision_tree
def predict(node, sample):
    """Classify *sample* by walking the tree from *node*.

    Parameters
    ----------
    node : a (sub)tree root produced by create_decision_tree.
    sample : indexable feature vector; node.attribute is used as an index.

    Returns the predicted class label. For an attribute value never seen
    during training, falls back to the majority label among the leaves of
    the current subtree. (The original code accessed a non-existent
    `.labels` attribute on Node here and raised AttributeError.)
    """
    if node.label is not None:  # leaf node
        return node.label
    attribute_value = sample[node.attribute]
    if attribute_value not in node.children:  # unseen attribute value
        return _subtree_majority_label(node)
    return predict(node.children[attribute_value], sample)

def _subtree_majority_label(node):
    """Most common label among all leaves reachable from *node*."""
    leaf_labels = []
    stack = [node]
    while stack:
        current = stack.pop()
        if current.label is not None:
            leaf_labels.append(current.label)
        else:
            stack.extend(current.children.values())
    distinct, frequencies = np.unique(np.array(leaf_labels), return_counts=True)
    return distinct[np.argmax(frequencies)]
# --- Demo ---
# Toy training set: two binary features; the last column is the class label.
training_rows = [
    [1, 1, 1],
    [1, 1, 0],
    [0, 1, 1],
    [0, 0, 0],
]
data = np.array(training_rows)
attributes = np.array([0, 1])  # candidate feature columns
decision_tree = create_decision_tree(data, attributes)
sample = [1, 0]  # feature vector to classify
prediction = predict(decision_tree, sample)
print("预测结果:", prediction)
```
这是一个简单的实现示例,你可以根据自己的需求进行修改和扩展。希望对你有帮助!
阅读全文