ID3算法的Python实现
时间: 2023-11-19 18:52:14 浏览: 28
ID3算法是一种决策树算法,用于分类和预测。下面是一个简单的Python实现:
```python
import math
from collections import Counter
def entropy(data):
"""计算数据集的熵"""
n = len(data)
label_counts = Counter([label for _, label in data])
probs = [count / n for count in label_counts.values()]
return -sum(p * math.log2(p) for p in probs)
def split_data(data, feature):
"""按照某个特征划分数据集"""
groups = {}
for sample in data:
key = sample[0][feature]
if key not in groups:
groups[key] = []
groups[key].append(sample)
return groups
def choose_best_feature(data):
"""选择最佳划分特征"""
base_entropy = entropy(data)
best_info_gain, best_feature = 0, -1
for i in range(len(data[0][0])):
sub_data = [(x, y) for x, y in data if x[i] is not None]
entropy_i = sum(len(subset) / len(data) * entropy(subset) for subset in split_data(sub_data, i).values())
info_gain = base_entropy - entropy_i
if info_gain > best_info_gain:
best_info_gain, best_feature = info_gain, i
return best_feature
def majority_label(labels):
"""多数表决决定叶节点的分类"""
label_counts = Counter(labels)
return label_counts.most_common(1)[0][0]
def create_tree(data, features):
"""递归构建决策树"""
labels = [label for _, label in data]
if len(set(labels)) == 1:
return labels[0]
if not features:
return majority_label(labels)
best_feature = choose_best_feature(data)
tree = {features[best_feature]: {}}
for key, subset in split_data(data, best_feature).items():
tree[features[best_feature]][key] = create_tree([(x[:best_feature] + x[best_feature+1:], y) for x, y in subset], features[:best_feature] + features[best_feature+1:])
return tree
```