请给出C4.5算法的代码,并实现可视化
时间: 2023-06-13 08:07:25 浏览: 134
C4.5算法是一种经典的决策树算法,其代码实现涉及到数据处理和决策树构建等多个方面。这里给出一个简单的Python实现,并提供可视化代码。
首先,我们需要定义一个节点类,用于表示决策树中的每个节点:
```python
class Node:
    """One node of the decision tree.

    A node is either an internal split node (``feature`` and ``threshold``
    set, ``label`` is None) or a leaf (``label`` set, both children None).
    """

    def __init__(self, feature=None, threshold=None, label=None, left=None, right=None):
        self.feature = feature      # column index used for the split (internal nodes)
        self.threshold = threshold  # samples with value <= threshold go to ``left``
        self.label = label          # predicted class label (leaf nodes only)
        self.left = left            # subtree for x[feature] <= threshold
        self.right = right          # subtree for x[feature] > threshold
```
接下来,我们需要实现C4.5算法中的关键步骤,包括计算信息熵、计算条件熵,并据此选择最优划分特征。需要注意的是,C4.5 与 ID3 的关键区别在于它用增益率(gain ratio,即信息增益除以划分信息)而非单纯的信息增益来选择划分特征,从而消除对取值较多特征的偏好。下面是这些步骤的代码实现:
```python
import numpy as np
from collections import Counter
def calc_entropy(y):
    """Return the Shannon entropy (in bits) of the label sequence ``y``.

    An empty or single-class sequence has entropy 0.
    """
    total = len(y)
    h = 0.0
    for count in Counter(y).values():
        p = count / total
        h -= p * np.log2(p)
    return h
def calc_cond_entropy(x, y, threshold):
    """Weighted entropy of ``y`` after the binary split ``x <= threshold``.

    Each side's entropy is weighted by the fraction of samples it receives;
    an empty side contributes 0.
    """
    def _entropy(labels):
        # Shannon entropy of one side of the split (0 for an empty side).
        counts = Counter(labels).values()
        return -sum(c / len(labels) * np.log2(c / len(labels)) for c in counts)

    left_idx = np.where(x <= threshold)[0]
    right_idx = np.where(x > threshold)[0]
    w_left = len(left_idx) / len(x)
    w_right = len(right_idx) / len(x)
    return w_left * _entropy(y[left_idx]) + w_right * _entropy(y[right_idx])
def calc_info_gain(x, y):
    """Choose the best binary split for C4.5.

    Scans every feature and every observed value as a candidate threshold,
    scoring each split by the *gain ratio* (information gain divided by the
    split information). Using the gain ratio — rather than the plain
    information gain, which is ID3's criterion — is what characterises C4.5
    and removes the bias toward many-valued splits.

    Parameters
    ----------
    x : ndarray of shape (n_samples, n_features)
    y : array-like of class labels

    Returns
    -------
    (best_score, best_feature, best_threshold) : the gain ratio of the
    chosen split plus its feature index and threshold, or (0, None, None)
    when no split yields any improvement.
    """
    base_entropy = calc_entropy(y)
    n = len(y)
    best_score, best_feature, best_threshold = 0, None, None
    for feature_idx in range(x.shape[1]):
        values = x[:, feature_idx]
        for threshold in np.unique(values):
            n_left = int(np.sum(values <= threshold))
            n_right = n - n_left
            if n_left == 0 or n_right == 0:
                # Degenerate split (e.g. threshold == max value): carries no
                # information, skip it.
                continue
            gain = base_entropy - calc_cond_entropy(values, y, threshold)
            # Split information: entropy of the left/right partition sizes.
            p_left, p_right = n_left / n, n_right / n
            split_info = -(p_left * np.log2(p_left) + p_right * np.log2(p_right))
            score = gain / split_info
            if score > best_score:
                best_score, best_feature, best_threshold = score, feature_idx, threshold
    return best_score, best_feature, best_threshold
```
接下来是C4.5算法的核心代码,即递归构建决策树的函数:
```python
def build_tree(x, y, max_depth=None, min_samples_split=2, min_impurity_decrease=0):
    """Recursively grow a decision tree over the samples ``(x, y)``.

    Parameters
    ----------
    x : ndarray of shape (n_samples, n_features)
    y : array-like of class labels (must support numpy fancy indexing)
    max_depth : remaining depth budget, or None for unlimited
    min_samples_split : smallest node size that may still be split
    min_impurity_decrease : entropy below which a node becomes a leaf

    Returns
    -------
    Node : root of the constructed (sub)tree.
    """
    def leaf():
        # Majority-vote leaf over the labels reaching this node.
        return Node(label=Counter(y).most_common(1)[0][0])

    out_of_depth = max_depth is not None and max_depth == 0
    if out_of_depth or len(y) < min_samples_split or calc_entropy(y) < min_impurity_decrease:
        return leaf()

    gain, feature_idx, threshold = calc_info_gain(x, y)
    if gain == 0:
        # No candidate split improves purity: stop growing.
        return leaf()

    remaining = None if max_depth is None else max_depth - 1
    go_left = x[:, feature_idx] <= threshold
    children = []
    for idx in (np.where(go_left)[0], np.where(~go_left)[0]):
        children.append(build_tree(x[idx], y[idx], max_depth=remaining,
                                   min_samples_split=min_samples_split,
                                   min_impurity_decrease=min_impurity_decrease))
    return Node(feature=feature_idx, threshold=threshold,
                left=children[0], right=children[1])
```
最后,我们可以使用graphviz库来可视化决策树。下面是可视化代码的实现:
```python
from graphviz import Digraph
def visualize_tree(tree, dot=None):
    """Render a decision tree built by ``build_tree`` as a graphviz Digraph.

    Leaves are drawn as boxes labelled with their class; internal nodes show
    the split feature and threshold, with 'True'/'False' edges for the
    <=-threshold and >-threshold branches. Node identity uses ``id(tree)``
    so distinct tree nodes never collide.
    """
    if dot is None:
        dot = Digraph()
    this = str(id(tree))
    if tree.label is not None:
        # Leaf node: just the predicted class, boxed.
        dot.node(this, label=str(tree.label), shape='box')
    else:
        dot.node(this, label=f'feature {tree.feature}\n<= {tree.threshold:.2f}')
        # Draw both subtrees first, then connect them to this node.
        visualize_tree(tree.left, dot)
        visualize_tree(tree.right, dot)
        dot.edge(this, str(id(tree.left)), label='True')
        dot.edge(this, str(id(tree.right)), label='False')
    return dot
```
这样,我们就完成了C4.5算法的实现和可视化。完整代码如下:
```python
import numpy as np
from collections import Counter
from graphviz import Digraph
class Node:
    """One node of the decision tree.

    A node is either an internal split node (``feature`` and ``threshold``
    set, ``label`` is None) or a leaf (``label`` set, both children None).
    """

    def __init__(self, feature=None, threshold=None, label=None, left=None, right=None):
        self.feature = feature      # column index used for the split (internal nodes)
        self.threshold = threshold  # samples with value <= threshold go to ``left``
        self.label = label          # predicted class label (leaf nodes only)
        self.left = left            # subtree for x[feature] <= threshold
        self.right = right          # subtree for x[feature] > threshold
def calc_entropy(y):
    """Return the Shannon entropy (in bits) of the label sequence ``y``.

    An empty or single-class sequence has entropy 0.
    """
    total = len(y)
    h = 0.0
    for count in Counter(y).values():
        p = count / total
        h -= p * np.log2(p)
    return h
def calc_cond_entropy(x, y, threshold):
    """Weighted entropy of ``y`` after the binary split ``x <= threshold``.

    Each side's entropy is weighted by the fraction of samples it receives;
    an empty side contributes 0.
    """
    def _entropy(labels):
        # Shannon entropy of one side of the split (0 for an empty side).
        counts = Counter(labels).values()
        return -sum(c / len(labels) * np.log2(c / len(labels)) for c in counts)

    left_idx = np.where(x <= threshold)[0]
    right_idx = np.where(x > threshold)[0]
    w_left = len(left_idx) / len(x)
    w_right = len(right_idx) / len(x)
    return w_left * _entropy(y[left_idx]) + w_right * _entropy(y[right_idx])
def calc_info_gain(x, y):
    """Choose the best binary split for C4.5.

    Scans every feature and every observed value as a candidate threshold,
    scoring each split by the *gain ratio* (information gain divided by the
    split information). Using the gain ratio — rather than the plain
    information gain, which is ID3's criterion — is what characterises C4.5
    and removes the bias toward many-valued splits.

    Parameters
    ----------
    x : ndarray of shape (n_samples, n_features)
    y : array-like of class labels

    Returns
    -------
    (best_score, best_feature, best_threshold) : the gain ratio of the
    chosen split plus its feature index and threshold, or (0, None, None)
    when no split yields any improvement.
    """
    base_entropy = calc_entropy(y)
    n = len(y)
    best_score, best_feature, best_threshold = 0, None, None
    for feature_idx in range(x.shape[1]):
        values = x[:, feature_idx]
        for threshold in np.unique(values):
            n_left = int(np.sum(values <= threshold))
            n_right = n - n_left
            if n_left == 0 or n_right == 0:
                # Degenerate split (e.g. threshold == max value): carries no
                # information, skip it.
                continue
            gain = base_entropy - calc_cond_entropy(values, y, threshold)
            # Split information: entropy of the left/right partition sizes.
            p_left, p_right = n_left / n, n_right / n
            split_info = -(p_left * np.log2(p_left) + p_right * np.log2(p_right))
            score = gain / split_info
            if score > best_score:
                best_score, best_feature, best_threshold = score, feature_idx, threshold
    return best_score, best_feature, best_threshold
def build_tree(x, y, max_depth=None, min_samples_split=2, min_impurity_decrease=0):
    """Recursively grow a decision tree over the samples ``(x, y)``.

    Parameters
    ----------
    x : ndarray of shape (n_samples, n_features)
    y : array-like of class labels (must support numpy fancy indexing)
    max_depth : remaining depth budget, or None for unlimited
    min_samples_split : smallest node size that may still be split
    min_impurity_decrease : entropy below which a node becomes a leaf

    Returns
    -------
    Node : root of the constructed (sub)tree.
    """
    def leaf():
        # Majority-vote leaf over the labels reaching this node.
        return Node(label=Counter(y).most_common(1)[0][0])

    out_of_depth = max_depth is not None and max_depth == 0
    if out_of_depth or len(y) < min_samples_split or calc_entropy(y) < min_impurity_decrease:
        return leaf()

    gain, feature_idx, threshold = calc_info_gain(x, y)
    if gain == 0:
        # No candidate split improves purity: stop growing.
        return leaf()

    remaining = None if max_depth is None else max_depth - 1
    go_left = x[:, feature_idx] <= threshold
    children = []
    for idx in (np.where(go_left)[0], np.where(~go_left)[0]):
        children.append(build_tree(x[idx], y[idx], max_depth=remaining,
                                   min_samples_split=min_samples_split,
                                   min_impurity_decrease=min_impurity_decrease))
    return Node(feature=feature_idx, threshold=threshold,
                left=children[0], right=children[1])
def visualize_tree(tree, dot=None):
    """Render a decision tree built by ``build_tree`` as a graphviz Digraph.

    Leaves are drawn as boxes labelled with their class; internal nodes show
    the split feature and threshold, with 'True'/'False' edges for the
    <=-threshold and >-threshold branches. Node identity uses ``id(tree)``
    so distinct tree nodes never collide.
    """
    if dot is None:
        dot = Digraph()
    this = str(id(tree))
    if tree.label is not None:
        # Leaf node: just the predicted class, boxed.
        dot.node(this, label=str(tree.label), shape='box')
    else:
        dot.node(this, label=f'feature {tree.feature}\n<= {tree.threshold:.2f}')
        # Draw both subtrees first, then connect them to this node.
        visualize_tree(tree.left, dot)
        visualize_tree(tree.right, dot)
        dot.edge(this, str(id(tree.left)), label='True')
        dot.edge(this, str(id(tree.right)), label='False')
    return dot
```
阅读全文