Please write code that implements the ID3 algorithm.
Sure, here is a simple Python implementation of the ID3 algorithm:
```python
import math


class Node:
    def __init__(self, attribute=None, value=None, label=None):
        self.attribute = attribute  # index of the attribute this node splits on
        self.value = value          # attribute value that led to this node
        self.label = label          # class label (set for leaf nodes only)
        self.children = {}          # children: key = attribute value, value = Node

    def is_leaf(self):
        return self.label is not None


class DecisionTree:
    def __init__(self, epsilon=0.1):
        self.epsilon = epsilon  # minimum information gain required to keep splitting

    def calc_entropy(self, data):
        # empirical entropy of the dataset; the class label is the last column
        num_samples = len(data)
        label_counts = {}
        for sample in data:
            label = sample[-1]
            if label not in label_counts:
                label_counts[label] = 0
            label_counts[label] += 1
        entropy = 0.0
        for label in label_counts:
            prob = float(label_counts[label]) / num_samples
            entropy -= prob * math.log(prob, 2)
        return entropy

    def split_data(self, data, axis, value):
        # keep the samples whose attribute `axis` equals `value`, dropping that column
        subset = []
        for sample in data:
            if sample[axis] == value:
                reduced_sample = sample[:axis]
                reduced_sample.extend(sample[axis + 1:])
                subset.append(reduced_sample)
        return subset

    def choose_best_feature(self, data):
        num_features = len(data[0]) - 1
        base_entropy = self.calc_entropy(data)
        best_info_gain = 0.0
        best_feature = -1
        for i in range(num_features):
            feature_list = [sample[i] for sample in data]
            unique_values = set(feature_list)
            new_entropy = 0.0
            for value in unique_values:
                subset = self.split_data(data, i, value)
                prob = len(subset) / float(len(data))
                new_entropy += prob * self.calc_entropy(subset)
            info_gain = base_entropy - new_entropy
            if info_gain > best_info_gain:
                best_info_gain = info_gain
                best_feature = i
        # pre-pruning: stop splitting when the best gain is below the threshold
        if best_info_gain < self.epsilon:
            return -1
        return best_feature

    def create_tree(self, data, attributes):
        class_list = [sample[-1] for sample in data]
        # all samples share one class: return a pure leaf
        if class_list.count(class_list[0]) == len(class_list):
            return Node(label=class_list[0])
        # no attributes left: return a majority-vote leaf
        if len(data[0]) == 1:
            return Node(label=self.majority_vote(class_list))
        best_feature = self.choose_best_feature(data)
        # no split clears the information-gain threshold: majority-vote leaf
        if best_feature == -1:
            return Node(label=self.majority_vote(class_list))
        best_attr = attributes[best_feature]  # original column index of the feature
        tree = Node(attribute=best_attr)
        del attributes[best_feature]
        feature_list = [sample[best_feature] for sample in data]
        unique_values = set(feature_list)
        for value in unique_values:
            subset = self.split_data(data, best_feature, value)
            subtree = self.create_tree(subset, attributes[:])
            tree.children[value] = subtree
        return tree

    def majority_vote(self, class_list):
        class_counts = {}
        for vote in class_list:
            if vote not in class_counts:
                class_counts[vote] = 0
            class_counts[vote] += 1
        sorted_class_counts = sorted(class_counts.items(), key=lambda x: x[1], reverse=True)
        return sorted_class_counts[0][0]

    def train(self, data):
        # attributes are tracked by their original column indices so that
        # `predict` can index into a full, unreduced sample
        attributes = [i for i in range(len(data[0]) - 1)]
        self.tree = self.create_tree(data, attributes)

    def predict(self, sample):
        node = self.tree
        while not node.is_leaf():
            if sample[node.attribute] in node.children:
                node = node.children[sample[node.attribute]]
            else:
                # attribute value never seen during training
                return None
        return node.label
```
Here, the `Node` class represents a single node of the decision tree, and the `DecisionTree` class represents the tree itself.
`calc_entropy` computes the entropy of a dataset; `split_data` partitions the dataset on a given attribute value; `choose_best_feature` selects the attribute with the highest information gain, returning -1 when that gain falls below the `epsilon` threshold so the branch becomes a majority-vote leaf; `create_tree` builds the tree recursively.
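For reference, the two quantities the code computes are the empirical entropy of the dataset and the information gain of each attribute:

$$H(D) = -\sum_{k} p_k \log_2 p_k, \qquad \text{Gain}(D, A) = H(D) - \sum_{v \in \text{Values}(A)} \frac{|D_v|}{|D|}\, H(D_v)$$

where $p_k$ is the fraction of samples in class $k$ and $D_v$ is the subset of $D$ whose attribute $A$ takes value $v$. ID3 splits on the attribute with the largest gain.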
`train` builds the tree from a dataset whose last column is the class label, and `predict` classifies a new sample, returning `None` if it reaches an attribute value never seen during training.
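As a quick sanity check, here is a minimal usage sketch on a toy, made-up weather dataset; the data values and expected outputs below are illustrative assumptions, not part of the original answer:

```python
# Toy dataset (illustrative): columns = [outlook, windy, play?]
data = [
    ['sunny',    'true',  'no'],
    ['sunny',    'false', 'no'],
    ['overcast', 'false', 'yes'],
    ['rainy',    'false', 'yes'],
    ['rainy',    'true',  'no'],
    ['overcast', 'true',  'yes'],
]

tree = DecisionTree()  # default epsilon=0.1 gain threshold
tree.train(data)

# Predict the class for new samples (attribute values only, no label)
print(tree.predict(['sunny', 'false']))    # expected: 'no'
print(tree.predict(['overcast', 'true']))  # expected: 'yes'
print(tree.predict(['rainy', 'true']))     # expected: 'no'
```

Note that this ID3 variant handles categorical attributes only; numeric attributes would need discretization first (as C4.5 does), and `predict` returns `None` for attribute values absent from the training data.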