c4.5Python实现
时间: 2023-11-19 09:57:08 浏览: 35
以下是C4.5算法的Python实现：
```python
import math

import numpy as np
import pandas as pd
class Node:
    """A single node of the decision tree.

    A leaf node carries a ``label``; an internal node carries the split
    ``feature`` and ``threshold`` plus ``left``/``right`` subtrees.
    """

    def __init__(self, feature=None, threshold=None, label=None, left=None, right=None):
        self.feature = feature      # feature (column name) this node splits on
        self.threshold = threshold  # split value (midpoint for continuous, category for discrete)
        self.label = label          # class label if this is a leaf, else None
        self.left = left            # subtree for rows that satisfy the split condition
        self.right = right          # subtree for the remaining rows
class C45DecisionTree:
    """C4.5 decision tree classifier.

    Continuous (numeric) features are split at the best midpoint between
    consecutive sorted values (``value <= threshold`` goes left); discrete
    (object-dtype) features are split by equality with the best single
    category (``value == threshold`` goes left).  Splitting stops when the
    best gain ratio falls below ``epsilon`` or a node becomes pure.
    """

    def __init__(self, epsilon=0.1):
        self.epsilon = epsilon     # minimum gain ratio required to keep splitting
        self.tree = None           # root Node, set by fit()
        self._categorical = set()  # object-dtype column names, recorded by fit()

    def fit(self, X, y):
        """Grow the tree from DataFrame ``X`` and label sequence ``y``.

        ``y`` may be a list, array, or Series; it is aligned to ``X.index``
        so boolean-mask indexing works during recursive splitting.
        """
        y = pd.Series(np.asarray(y), index=X.index)
        self._categorical = {c for c in X.columns if X[c].dtype == object}
        self.tree = self._build_tree(X, y)

    def predict(self, X):
        """Return a list with one predicted label per row of DataFrame ``X``."""
        # Iterate rows explicitly — iterating a DataFrame yields column names.
        return [self._predict_one(row) for _, row in X.iterrows()]

    def _build_tree(self, X, y):
        """Recursively build and return the subtree for samples (X, y)."""
        # Pure node -> leaf with that class.
        if y.nunique() == 1:
            return Node(label=y.iloc[0])
        # No features left -> majority leaf.
        if len(X.columns) == 0:
            return Node(label=self._majority_class(y))
        best_feature, best_split = self._choose_best_feature(X, y)
        # No split clears the epsilon gain-ratio bar -> majority leaf.
        if best_feature is None:
            return Node(label=self._majority_class(y))
        left_idx = self._split_mask(X[best_feature], best_feature, best_split)
        right_idx = ~left_idx
        # Degenerate split (all rows on one side) -> majority leaf.
        if not left_idx.any() or not right_idx.any():
            return Node(label=self._majority_class(y))
        return Node(
            feature=best_feature,
            threshold=best_split,
            left=self._build_tree(X[left_idx], y[left_idx]),
            right=self._build_tree(X[right_idx], y[right_idx]),
        )

    def _split_mask(self, col, feature, split_value):
        """Boolean mask of rows routed LEFT by this feature/value split."""
        if feature in self._categorical:
            return col == split_value
        return col <= split_value

    def _choose_best_feature(self, X, y):
        """Return the (feature, split_value) pair with the highest gain ratio.

        Returns (None, None) when no candidate split reaches ``epsilon``.
        """
        best_feature, best_split, best_ratio = None, None, -1.0
        base_entropy = self._entropy(y)
        for feature in X.columns:
            col = X[feature]
            if feature in self._categorical:
                # Candidate splits: equality with each observed category.
                candidates = [(v, col == v) for v in pd.unique(col)]
            else:
                # Candidate thresholds: midpoints between consecutive
                # distinct sorted feature values (classic C4.5 choice).
                vals = np.unique(col.to_numpy())
                candidates = [(t, col <= t) for t in (vals[:-1] + vals[1:]) / 2]
            for split_value, left_mask in candidates:
                ratio = self._gain_ratio(y, left_mask, base_entropy)
                if ratio > best_ratio:
                    best_feature, best_split, best_ratio = feature, split_value, ratio
        if best_ratio < self.epsilon:
            return None, None
        return best_feature, best_split

    def _gain_ratio(self, y, left_mask, base_entropy):
        """Gain ratio of the binary split ``left_mask``; -1.0 if degenerate."""
        n_left = int(left_mask.sum())
        n = len(y)
        if n_left == 0 or n_left == n:
            return -1.0
        p = n_left / n
        gain = (
            base_entropy
            - p * self._entropy(y[left_mask])
            - (1.0 - p) * self._entropy(y[~left_mask])
        )
        # Split information of a binary partition; nonzero since 0 < p < 1.
        split_info = -(p * math.log2(p) + (1.0 - p) * math.log2(1.0 - p))
        return gain / split_info

    def _entropy(self, y):
        """Shannon entropy (bits) of the label distribution in ``y``."""
        counts = pd.Series(np.asarray(y)).value_counts().to_numpy()
        probs = counts / counts.sum()
        return float(-(probs * np.log2(probs)).sum())

    def _majority_class(self, y):
        # Most frequent label; mode()[0] gives a deterministic tie-break.
        return pd.Series(np.asarray(y)).mode()[0]

    def _predict_one(self, x):
        """Route one row (a Series) down the tree and return the leaf label."""
        node = self.tree
        while node.label is None:
            value = x[node.feature]
            # Mirror the training-time split: equality for discrete
            # features, <= threshold for continuous ones.
            if node.feature in self._categorical:
                go_left = value == node.threshold
            else:
                go_left = value <= node.threshold
            node = node.left if go_left else node.right
        return node.label
```