决策树c4.5 python代码 txt
时间: 2023-07-31 10:01:29 浏览: 102
决策树是一种常用的机器学习算法,可用于分类和回归问题的预测。C4.5 是 ID3 决策树算法的改进版,它使用信息增益比(而不是信息增益)来选择最优的划分属性,从而减轻对取值较多的属性的偏好。
以下是一个用Python实现C4.5算法的代码示例:
```
import numpy as np
import pandas as pd
from math import log2
# 计算信息熵
def calculate_entropy(data):
    """Return the base-2 Shannon entropy of the label column of *data*.

    The class label is read from the last column of the DataFrame.

    Args:
        data: pandas DataFrame whose last column holds the class labels.

    Returns:
        The entropy in bits. It is 0 when all rows share one label or when
        there are no rows.
    """
    labels = data.iloc[:, -1]
    label_counts = labels.value_counts()
    # A categorical label column reports unobserved categories with a count
    # of 0; 0 * log2(0) would turn the whole sum into NaN, so drop them.
    label_counts = label_counts[label_counts > 0]
    probs = label_counts / label_counts.sum()
    entropy = (-probs * np.log2(probs)).sum()
    return entropy
# 计算信息增益
def calculate_information_gain(data, feature):
    """Return the information gain obtained by splitting *data* on *feature*.

    The gain is the entropy of the whole data set minus the size-weighted
    average entropy of the subsets that share one value of *feature*.

    Args:
        data: pandas DataFrame whose last column holds the class labels.
        feature: name of the column to split on.

    Returns:
        The information gain in bits.
    """
    total_entropy = calculate_entropy(data)
    total_rows = data.shape[0]
    weighted_entropy = 0
    for value in data[feature].unique():
        subset = data[data[feature] == value]
        # Each subset contributes in proportion to the share of rows it holds.
        weighted_entropy += (subset.shape[0] / total_rows) * calculate_entropy(subset)
    return total_entropy - weighted_entropy
# 计算信息增益比
def calculate_information_gain_ratio(data, feature):
    """Return the C4.5 gain ratio of splitting *data* on *feature*.

    The gain ratio is the information gain divided by the split information,
    i.e. the entropy of the distribution of *feature*'s own values.

    Args:
        data: pandas DataFrame whose last column holds the class labels.
        feature: name of the column to split on.

    Returns:
        The gain ratio, or 0.0 when the split information is zero (the
        feature takes a single value, or *data* has no rows), because such a
        split does not partition the data at all.
    """
    information_gain = calculate_information_gain(data, feature)
    total_rows = data.shape[0]
    split_information = 0.0
    for value in data[feature].unique():
        prob = data[data[feature] == value].shape[0] / total_rows
        # A NaN value never equals itself, so its subset is empty; skip it
        # rather than evaluating log2(0).
        if prob > 0:
            split_information -= prob * log2(prob)
    if split_information == 0:
        # Avoid dividing by zero; a ratio of 0.0 keeps this feature from being
        # picked by any caller that requires a strictly positive ratio.
        return 0.0
    return information_gain / split_information
# 选择最优的划分属性
def select_best_feature(data):
    """Return the feature column with the highest gain ratio.

    Every column except the last (the label column) is a candidate.

    Args:
        data: pandas DataFrame whose last column holds the class labels.

    Returns:
        The name of the first feature with the largest strictly positive gain
        ratio, or None when no feature has a positive gain ratio (including
        the case where there are no feature columns).
    """
    best_feature = None
    best_information_gain_ratio = 0
    for feature in data.columns[:-1]:
        information_gain_ratio = calculate_information_gain_ratio(data, feature)
        # Strict comparison: ties keep the earlier column, and NaN or zero
        # ratios are never selected.
        if information_gain_ratio > best_information_gain_ratio:
            best_information_gain_ratio = information_gain_ratio
            best_feature = feature
    return best_feature
# 构建决策树
def build_decision_tree(data):
    """Recursively build a decision tree from *data*.

    Args:
        data: pandas DataFrame whose last column holds the class labels and
            whose other columns are discrete features.

    Returns:
        Either a leaf (a class label) or a nested dict of the form
        ``{feature: {feature_value: subtree, ...}}``.

    Raises:
        ValueError: if *data* has no rows.
    """
    if data.shape[0] == 0:
        raise ValueError("cannot build a decision tree from an empty data set")

    labels = data.iloc[:, -1]
    # All samples agree: this node is a pure leaf.
    if len(set(labels)) == 1:
        return labels.iloc[0]
    # Only the label column is left: fall back to the majority class.
    if data.shape[1] == 1:
        return labels.value_counts().idxmax()

    best_feature = select_best_feature(data)
    if best_feature is None:
        # No remaining feature separates the classes (every gain ratio is
        # zero), so splitting further is pointless; use the majority class
        # instead of indexing the frame with None.
        return labels.value_counts().idxmax()

    branches = {}
    for value in data[best_feature].unique():
        # The chosen feature is dropped so it is not reused further down.
        subset = data[data[best_feature] == value].drop(best_feature, axis=1)
        branches[value] = build_decision_tree(subset)
    return {best_feature: branches}
# 预测新样本
def predict(decision_tree, sample):
    """Classify *sample* by walking *decision_tree* from the root to a leaf.

    Args:
        decision_tree: a leaf label, or a nested dict of the form
            ``{feature: {feature_value: subtree, ...}}``.
        sample: mapping from feature name to the sample's value for it.

    Returns:
        The label stored at the leaf that is reached.

    Raises:
        KeyError: if *sample* lacks a feature the tree tests, or holds a
            value for which the tree has no branch.
    """
    node = decision_tree
    # Anything that is not a dict is a leaf. Checking for ``str`` alone would
    # break on integer, boolean or numpy-scalar class labels.
    while isinstance(node, dict):
        feature = next(iter(node))
        node = node[feature][sample[feature]]
    return node
# 示例数据
# Toy weather data set: four discrete features plus the 'Play' label, which
# must stay the last column because the tree code reads labels from there.
data = pd.DataFrame({
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rainy', 'Rainy', 'Rainy', 'Overcast', 'Sunny', 'Sunny', 'Rainy'],
    'Temperature': ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool', 'Mild', 'Cool', 'Mild'],
    'Humidity': ['High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'Normal'],
    'Windy': ['False', 'True', 'False', 'False', 'False', 'True', 'True', 'False', 'False', 'False'],
    'Play': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes']
})
decision_tree = build_decision_tree(data)
# Classify a new sample
sample = {'Outlook': 'Sunny', 'Temperature': 'Hot', 'Humidity': 'Normal', 'Windy': 'False'}
prediction = predict(decision_tree, sample)
# This line prints the whole tree, so its label now says so instead of
# calling it a leaf-node prediction.
print("构建的决策树:", decision_tree)
print("新样本的预测结果:", prediction)
```
以上代码是 C4.5 算法的一个简化实现,通过构建决策树来进行分类预测;它只处理离散属性,不包含连续属性的离散化、缺失值处理和剪枝。代码使用 Pandas 库来处理和分析数据,使用 NumPy 库来进行数学运算。首先定义了计算信息熵、信息增益和信息增益比的函数,然后定义了选择最优划分属性的函数,接着定义了构建决策树和预测新样本的函数。最后,用一份示例数据构建决策树,并对一个新样本进行预测。
阅读全文