Python Implementation of the ID3 Decision Tree
Sure. Here is a reference Python implementation of the ID3 decision tree.
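At each node, ID3 splits on the feature A with the largest information gain Gain(D, A) = H(D) - Σ_v (|D_v| / |D|) · H(D_v), where H(D) = -Σ_k p_k · log2(p_k) is the entropy of the class labels in D and D_v is the subset of D in which A takes the value v. The `calc_entropy` and `choose_best_feature` methods below implement exactly these two quantities: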
```python
import math
import pandas as pd
class Node:
    def __init__(self, feature=None, children=None, result=None):
        self.feature = feature    # name of the feature this node splits on
        self.children = children if children is not None else {}  # feature value -> child Node
        self.result = result      # class label: set at leaves, kept at internal nodes as a fallback

class DecisionTree:
    def __init__(self, epsilon=0.1):
        self.epsilon = epsilon    # minimum information gain required to keep splitting (pre-pruning threshold)
        self.tree = None
    def calc_entropy(self, data):
        """Compute the Shannon entropy of the class labels in the dataset."""
        num_entries = len(data)
        label_counts = {}
        for feat_vec in data:
            current_label = feat_vec[-1]  # the class label is the last column
            label_counts[current_label] = label_counts.get(current_label, 0) + 1
        entropy = 0.0
        for count in label_counts.values():
            prob = count / num_entries
            entropy -= prob * math.log(prob, 2)
        return entropy
    def split_data(self, data, axis, value):
        """Return the rows whose feature `axis` equals `value`, with that feature column removed."""
        ret_data = []
        for feat_vec in data:
            if feat_vec[axis] == value:
                ret_data.append(feat_vec[:axis] + feat_vec[axis + 1:])
        return ret_data
    def choose_best_feature(self, data):
        """Return the index of the feature with the largest information gain, and that gain."""
        num_features = len(data[0]) - 1  # the last column holds the class label
        base_entropy = self.calc_entropy(data)
        best_info_gain = 0.0
        best_feature = -1
        for i in range(num_features):
            unique_vals = set(example[i] for example in data)
            # Conditional entropy of the labels after splitting on feature i
            new_entropy = 0.0
            for value in unique_vals:
                sub_data = self.split_data(data, i, value)
                prob = len(sub_data) / float(len(data))
                new_entropy += prob * self.calc_entropy(sub_data)
            info_gain = base_entropy - new_entropy
            if info_gain > best_info_gain:
                best_info_gain = info_gain
                best_feature = i
        return best_feature, best_info_gain
    def create_tree(self, data, labels):
        """Recursively build the ID3 tree; `labels` holds the remaining feature names."""
        class_list = [example[-1] for example in data]
        # Stop when every sample in this subset has the same class
        if class_list.count(class_list[0]) == len(class_list):
            return Node(result=class_list[0])
        # Stop when no features are left; return the most frequent class
        if len(data[0]) == 1:
            return Node(result=self.majority_cnt(class_list))
        best_feat, best_gain = self.choose_best_feature(data)
        # Pre-pruning: stop when the best information gain falls below the epsilon threshold
        if best_feat == -1 or best_gain < self.epsilon:
            return Node(result=self.majority_cnt(class_list))
        node = Node(feature=labels[best_feat])
        node.result = self.majority_cnt(class_list)  # fallback for feature values unseen in training
        sub_labels = labels[:best_feat] + labels[best_feat + 1:]
        # One child per observed value of the chosen feature
        for value in set(example[best_feat] for example in data):
            sub_data = self.split_data(data, best_feat, value)
            node.children[value] = self.create_tree(sub_data, sub_labels)
        return node
    def majority_cnt(self, class_list):
        """Return the most frequent class label."""
        class_count = {}
        for vote in class_list:
            class_count[vote] = class_count.get(vote, 0) + 1
        return max(class_count, key=class_count.get)
    def fit(self, X_train, y_train):
        """Train the model; X_train is a pandas DataFrame, y_train a Series of labels."""
        data = pd.concat([X_train, y_train], axis=1).values.tolist()
        labels = X_train.columns.tolist()
        self.tree = self.create_tree(data, labels)
    def predict(self, X_test):
        """Predict a class label for every row of X_test."""
        return [self._predict(X_test.iloc[i]) for i in range(len(X_test))]
    def _predict(self, test_vec):
        """Predict a single sample by walking from the root to a leaf."""
        node = self.tree
        while node.children:
            value = test_vec[node.feature]
            if value in node.children:
                node = node.children[value]
            else:
                break  # unseen feature value: use this node's majority-class fallback
        return node.result
```
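As a quick sanity check, here is a minimal usage sketch on a small hand-made dataset (the column names and values are invented for illustration):

```python
import pandas as pd

# Toy categorical dataset; the features and labels are illustrative only
X_train = pd.DataFrame({
    'outlook': ['sunny', 'sunny', 'overcast', 'rainy', 'rainy', 'overcast'],
    'windy':   ['no',    'yes',   'no',       'no',    'yes',   'yes'],
})
y_train = pd.Series(['no', 'no', 'yes', 'yes', 'no', 'yes'], name='play')

clf = DecisionTree(epsilon=0.0)  # epsilon=0.0 effectively disables pre-pruning
clf.fit(X_train, y_train)
print(clf.predict(X_train))  # should reproduce the training labels on this tiny dataset
```

Note that this implementation only handles categorical features, as classic ID3 does; continuous attributes would need to be discretized first, and a feature value that never appeared during training falls back to the majority class stored at the deepest matching node.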