c4.5算法代码python鸢尾花
时间: 2023-09-25 08:15:40 浏览: 168
以下是使用Python实现C4.5算法对鸢尾花数据集进行分类的示例代码:
```python
import pandas as pd
import numpy as np
import math
# Load the iris dataset from a local CSV file (features + class label column).
data = pd.read_csv('iris.csv')
# C4.5 decision-tree node (categorical splits, gain-ratio criterion).
class Node:
    """One node of a C4.5 decision tree.

    Each internal node splits on a single categorical feature, creating one
    child per feature value observed in the training subset.  Leaf nodes
    carry a class label.  The class column is assumed to be the LAST column
    of `data`.
    """

    def __init__(self, data, feature_names, target_name):
        self.data = data                    # training subset reaching this node
        self.feature_names = feature_names  # names of the remaining feature columns
        self.target_name = target_name      # name of the class/label column
        self.feature = None                 # name of the split feature (internal nodes)
        self.threshold = None               # unused for categorical splits; kept for API compatibility
        self.children = []                  # child Nodes, one per observed feature value
        self.split_value = None             # feature value by which this node is reached from its parent
        self.label = None                   # predicted class (leaf nodes only)
        self.default_label = None           # majority class at this node; fallback for unseen values

    def set_label(self, label):
        """Mark this node as a leaf predicting `label`."""
        self.label = label

    def set_feature(self, feature, threshold):
        """Record the feature (and optional threshold) this node splits on."""
        self.feature = feature
        self.threshold = threshold

    def add_child(self, node):
        """Attach `node` as a child of this node."""
        self.children.append(node)

    @staticmethod
    def entropy(data):
        """Shannon entropy (bits) of the class column (last column) of `data`."""
        num_samples = len(data)
        if num_samples == 0:
            return 0
        class_counts = data.iloc[:, -1].value_counts()
        if len(class_counts) <= 1:  # pure (or empty) node
            return 0
        class_probs = class_counts / num_samples
        return -np.sum(class_probs * np.log2(class_probs))

    def gain_ratio(self, data, feature):
        """Information gain ratio for splitting `data` on column index `feature`."""
        if len(data) == 0:
            return 0
        intrinsic_value = self.intrinsic_value(data, feature)
        # A feature with a single value has IV == 0; the split is useless,
        # so return 0 instead of raising ZeroDivisionError.
        if intrinsic_value == 0:
            return 0
        return self.info_gain(data, feature) / intrinsic_value

    def info_gain(self, data, feature):
        """Information gain of splitting `data` on column index `feature`."""
        num_samples = len(data)
        if num_samples == 0:
            return 0
        entropy_before = self.entropy(data)
        entropy_after = 0
        for value in set(data.iloc[:, feature]):
            subset = data[data.iloc[:, feature] == value]
            entropy_after += len(subset) / num_samples * self.entropy(subset)
        return entropy_before - entropy_after

    def intrinsic_value(self, data, feature):
        """Intrinsic value (split entropy) of column index `feature`."""
        num_samples = len(data)
        if num_samples == 0:
            return 0
        iv = 0
        for value in set(data.iloc[:, feature]):
            prob = len(data[data.iloc[:, feature] == value]) / num_samples
            iv -= prob * math.log2(prob)
        return iv

    def choose_feature(self, data):
        """Return (best feature column index, its gain ratio); (None, 0) when no split helps."""
        best_feature = None
        best_gain_ratio = 0
        for feature in range(len(data.columns) - 1):
            gr = self.gain_ratio(data, feature)
            if gr > best_gain_ratio:
                best_feature = feature
                best_gain_ratio = gr
        return best_feature, best_gain_ratio

    def build_tree(self):
        """Recursively grow the subtree rooted at this node."""
        # Pure node -> leaf.
        if len(set(self.data.iloc[:, -1])) == 1:
            self.set_label(self.data.iloc[0, -1])
            return
        best_feature, _best_gain_ratio = self.choose_feature(self.data)
        # No informative split (or no features left) -> majority-class leaf.
        if best_feature is None:
            self.set_label(self.data.iloc[:, -1].mode()[0])
            return
        feature_name = self.feature_names[best_feature]
        # Categorical splits have no threshold (the original code incorrectly
        # stored the gain ratio here and later compared against it in predict).
        self.set_feature(feature_name, None)
        # Majority class at this node: fallback for feature values unseen in training.
        self.default_label = self.data.iloc[:, -1].mode()[0]
        # The child's DataFrame drops the used column, so the child's feature
        # list must drop it too — otherwise column indices desynchronize.
        child_feature_names = [n for n in self.feature_names if n != feature_name]
        for value in set(self.data.iloc[:, best_feature]):
            subset = self.data[self.data.iloc[:, best_feature] == value].drop(feature_name, axis=1)
            child = Node(subset, child_feature_names, self.target_name)
            child.split_value = value  # remember which value leads to this child
            child.build_tree()
            self.add_child(child)

    def predict(self, sample):
        """Predict the class label for `sample` (a pandas Series of feature values)."""
        if self.label is not None:
            return self.label
        value = sample[self.feature]
        # Follow the child whose split value matches the sample's feature value.
        for child in self.children:
            if child.split_value == value:
                return child.predict(sample.drop(self.feature))
        # Feature value never seen during training: fall back to the majority class.
        return self.default_label
# Split the data into training (80%) and test (20%) sets with a fixed seed
# for reproducibility.
train_data = data.sample(frac=0.8, random_state=1)
test_data = data.drop(train_data.index)

# Build the decision tree on the training set; the last column is the label.
root = Node(train_data, train_data.columns[:-1], train_data.columns[-1])
root.build_tree()

# Evaluate accuracy on the held-out test set.
num_correct = 0
for _, row in test_data.iterrows():
    # Use .iloc for positional slicing: plain row[-1] on a Series with a
    # string index is label-based and deprecated/broken in modern pandas.
    prediction = root.predict(row.iloc[:-1])
    if prediction == row.iloc[-1]:
        num_correct += 1
print('Accuracy:', num_correct / len(test_data))
```
注:此代码是一个基础实现,可能存在性能瓶颈和优化空间。
阅读全文