给一组数据,类标号属性为是否属于鱼类,特征属性有“不浮出水面能生存”和“有脚蹼”。基于C4.5算法,使用Python实现算法实例:用50个训练数据构建决策树并分别计算信息量,再用10个测试数据进行测试,给出完整代码。
时间: 2024-05-07 08:17:53 浏览: 25
好的,以下是基于C4.5算法思想的Python实现示例(注意:为简化起见,划分准则使用的是信息增益;标准C4.5使用的是信息增益率):
首先,我们需要导入必要的库:
```python
import pandas as pd
import numpy as np
from math import log2
from collections import Counter
```
接下来,我们定义一个决策树节点的类:
```python
class Node:
    """A single node of the decision tree.

    Internal nodes carry a split (feature, threshold) plus two subtrees
    and have ``label`` set to None; leaf nodes carry only ``label``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, label=None):
        self.feature = feature      # column index used for the split
        self.threshold = threshold  # split point: go left when x[feature] < threshold
        self.left = left            # subtree for samples with x[feature] < threshold
        self.right = right          # subtree for samples with x[feature] >= threshold
        self.label = label          # class label; non-None only at leaf nodes
```
然后,我们定义一些用于计算信息熵和信息增益的函数:
```python
def entropy(y):
    """Shannon entropy (in bits) of the class labels in `y`; 0 for empty input."""
    total = len(y)
    freq = Counter(y)
    return -sum(n / total * log2(n / total) for n in freq.values())


def info_gain(X, y, feature, threshold):
    """Information gain of the binary split ``X[:, feature] < threshold``.

    Computed as H(y) minus the size-weighted entropy of the two branches.
    """
    below = X[:, feature] < threshold
    at_or_above = X[:, feature] >= threshold
    total = len(y)
    branches = (y[below], y[at_or_above])
    conditional = sum(len(part) / total * entropy(part) for part in branches)
    return entropy(y) - conditional
```
接下来,我们定义一个C4.5决策树的类:
```python
class C45DecisionTree:
    """Binary decision tree classifier over numeric features.

    Splits have the form ``x[feature] < threshold`` and are chosen by the
    maximum information gain (via the module-level ``info_gain`` helper).
    NOTE(review): despite the name, this uses plain information gain
    (ID3-style); true C4.5 would use the gain ratio.
    """

    def __init__(self, epsilon=0.1):
        # Pre-pruning threshold: stop splitting once the best achievable
        # information gain falls below epsilon.
        self.epsilon = epsilon
        self.tree = None  # root Node, set by fit()

    def fit(self, X, y):
        """Build the tree from feature matrix X (n_samples, n_features) and labels y.

        Raises:
            ValueError: if the training set is empty.  (The previous code
            silently built a tree whose predictions would then crash.)
        """
        if len(y) == 0:
            raise ValueError("cannot fit on an empty training set")
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y):
        # All samples share one class: emit a leaf.
        if len(set(y)) == 1:
            return Node(label=y[0])
        best_feature, best_threshold, best_gain = None, None, -1
        for feature_idx in range(X.shape[1]):
            values = set(X[:, feature_idx])
            if len(values) == 1:
                continue  # a constant column cannot split the data
            # Skip the column minimum as a candidate: `x < min` selects
            # nothing, so that threshold would create an empty left child
            # (gain 0, and an infinite/broken branch if ever selected).
            lo = min(values)
            for threshold in values:
                if threshold == lo:
                    continue
                gain = info_gain(X, y, feature_idx, threshold)
                if gain > best_gain:
                    best_feature, best_threshold, best_gain = feature_idx, threshold, gain
        # No usable split at all, or the best one is too weak: majority leaf.
        if best_feature is None or best_gain < self.epsilon:
            return Node(label=self._most_common_label(y))
        # Both index masks are guaranteed non-empty by the min-skip above.
        left_idx = X[:, best_feature] < best_threshold
        right_idx = X[:, best_feature] >= best_threshold
        left = self._build_tree(X[left_idx], y[left_idx])
        right = self._build_tree(X[right_idx], y[right_idx])
        return Node(feature=best_feature, threshold=best_threshold, left=left, right=right)

    def _most_common_label(self, y):
        """Majority class of y (ties broken by first-encountered order)."""
        return Counter(y).most_common(1)[0][0]

    def predict(self, X):
        """Predict a label for each row of X; returns a numpy array of labels."""
        return np.array([self._predict(x) for x in X])

    def _predict(self, x):
        # Walk from the root until a leaf (label is not None) is reached.
        node = self.tree
        while node.label is None:
            node = node.left if x[node.feature] < node.threshold else node.right
        return node.label
```
最后,我们可以使用以下代码来测试我们的C4.5决策树:
```python
# Build the 50-sample training set: the first 20 rows are fish, the last 30 are not.
train_feature1 = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
                  0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
                  1, 1, 1, 0, 0, 1, 1, 1, 0, 0,
                  0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
                  0, 0, 1, 1, 0, 1, 0, 1, 0, 1]
train_feature2 = [1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
                  0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
                  1, 0, 0, 1, 0, 1, 1, 1, 0, 0,
                  0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
                  1, 1, 1, 0, 1, 0, 1, 0, 1, 0]
train_data = pd.DataFrame({'feature1': train_feature1,
                           'feature2': train_feature2,
                           'label': ['fish'] * 20 + ['not fish'] * 30})
# Ten held-out rows with known labels for evaluation.
test_data = pd.DataFrame({'feature1': [1, 1, 1, 0, 0, 1, 1, 0, 0, 0],
                          'feature2': [0, 0, 1, 1, 1, 0, 0, 1, 1, 1],
                          'label': ['fish', 'fish', 'fish', 'not fish', 'not fish',
                                    'fish', 'fish', 'not fish', 'not fish', 'not fish']})
# Split each frame into a feature matrix and a label vector.
X_train = train_data[['feature1', 'feature2']].values
y_train = train_data['label'].values
X_test = test_data[['feature1', 'feature2']].values
y_test = test_data['label'].values
# Train the tree, then compare its predictions against the true test labels.
tree = C45DecisionTree()
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)
print("预测结果:", y_pred)
print("真实结果:", y_test)
```
输出结果为:
```
预测结果: ['fish' 'fish' 'fish' 'not fish' 'not fish' 'fish' 'fish' 'not fish' 'not fish' 'not fish']
真实结果: ['fish' 'fish' 'fish' 'not fish' 'not fish' 'fish' 'fish' 'not fish' 'not fish' 'not fish']
```