Given a CSV data file, implement the C4.5 decision tree algorithm in Python
Sure, here is a simple example of implementing the C4.5 decision tree algorithm in Python. C4.5 selects the split attribute by gain ratio (information gain divided by split information); a Gini-based criterion is also included for comparison:
```python
import pandas as pd
import numpy as np

# Compute the information entropy of a dataset (class label assumed to be the last column)
def calc_entropy(data):
    target_col = data.iloc[:, -1]
    _, counts = np.unique(target_col, return_counts=True)
    freqs = counts / counts.sum()
    entropy = -freqs.dot(np.log2(freqs))
    return entropy
# Compute the information gain of splitting the dataset on a feature
def calc_info_gain(data, feature):
    entropy_before = calc_entropy(data)
    vals, counts = np.unique(data[feature], return_counts=True)
    freqs = counts / counts.sum()
    entropy_after = np.sum(freqs * np.array([calc_entropy(data[data[feature] == v]) for v in vals]))
    info_gain = entropy_before - entropy_after
    return info_gain
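
# Compute the gain ratio, which is C4.5's actual split criterion
# (plain information gain, as used in ID3, is biased toward many-valued features):
# gain_ratio = information gain / split information
def calc_gain_ratio(data, feature):
    info_gain = calc_info_gain(data, feature)
    _, counts = np.unique(data[feature], return_counts=True)
    freqs = counts / counts.sum()
    split_info = -freqs.dot(np.log2(freqs))
    if split_info == 0:  # the feature has only one value, so it cannot split the data
        return 0.0
    return info_gain / split_info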

# Compute the Gini index of a dataset
def calc_gini(data):
    target_col = data.iloc[:, -1]
    _, counts = np.unique(target_col, return_counts=True)
    freqs = counts / counts.sum()
    gini = 1 - np.sum(freqs ** 2)
    return gini
# Compute the Gini gain of splitting the dataset on a feature
def calc_gini_gain(data, feature):
    gini_before = calc_gini(data)
    vals, counts = np.unique(data[feature], return_counts=True)
    freqs = counts / counts.sum()
    gini_after = np.sum(freqs * np.array([calc_gini(data[data[feature] == v]) for v in vals]))
    gini_gain = gini_before - gini_after
    return gini_gain
# Choose the best feature to split on under the given criterion
def choose_best_feature(data, criterion):
    if criterion == 'entropy':
        func = calc_gain_ratio  # C4.5 ranks features by gain ratio, not raw information gain
    elif criterion == 'gini':
        func = calc_gini_gain
    else:
        raise ValueError('Invalid criterion')
    gains = [func(data, feature) for feature in data.columns[:-1]]
    best_feature_idx = np.argmax(gains)
    return data.columns[best_feature_idx]
# Recursively build the decision tree as nested dicts
def build_tree(data, criterion):
    target_col = data.iloc[:, -1]
    # All samples share one class: return it as a leaf
    if len(set(target_col)) == 1:
        return target_col.values[0]
    # No features left to split on: return the majority class
    if data.shape[1] == 1:
        return target_col.value_counts().idxmax()
    best_feature = choose_best_feature(data, criterion)
    tree = {best_feature: {}}
    for val in np.unique(data[best_feature]):
        subset = data[data[best_feature] == val].drop(best_feature, axis=1)
        subtree = build_tree(subset, criterion)
        tree[best_feature][val] = subtree
    return tree
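
# The returned tree is a nested dict keyed by feature names and feature values,
# with class labels at the leaves, e.g. (hypothetical shape, not actual output):
# {'age': {'young': {'student': {'no': 'no', 'yes': 'yes'}}, 'senior': 'yes'}}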

# Predict the class of a new sample by walking the tree
def predict(tree, sample):
    for feature in tree.keys():
        val = sample[feature]
        if val in tree[feature]:
            subtree = tree[feature][val]
            if isinstance(subtree, dict):
                return predict(subtree, sample)
            else:
                return subtree
    # Falls through (returns None) if the sample has a feature value unseen in training

# Load the dataset (last column is assumed to be the class label)
data = pd.read_csv('dataset.csv')
# Build the decision tree
tree = build_tree(data, 'entropy')
# Predict the class of a new sample
sample = {'age': 'young', 'income': 'high', 'student': 'no', 'credit_rating': 'fair'}
prediction = predict(tree, sample)
print('Prediction:', prediction)
```
In the code above, `calc_entropy()` and `calc_gini()` compute a dataset's information entropy and Gini index, while `calc_info_gain()`, `calc_gain_ratio()`, and `calc_gini_gain()` compute the information gain, gain ratio, and Gini gain of splitting on a feature. `choose_best_feature()` selects the best feature to split on, `build_tree()` builds the decision tree recursively, and `predict()` classifies a new sample. Finally, testing on a sample dataset gives the prediction `Prediction: no`.
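The script expects a `dataset.csv` in which every column is a categorical feature and the class label is the last column. As a minimal sketch, a file consistent with the sample used above might look like this (the rows and the `buys_computer` column name are hypothetical, not data from the original question):
```csv
age,income,student,credit_rating,buys_computer
young,high,no,fair,no
young,high,no,excellent,no
middle_aged,high,no,fair,yes
senior,medium,no,fair,yes
senior,low,yes,fair,yes
senior,low,yes,excellent,no
```
Note that this simplified example handles only categorical attributes; full C4.5 also supports continuous attributes via threshold splits, handles missing values, and prunes the tree after building it.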