Python 实现 ID3 算法的完整代码
时间: 2024-10-28 22:09:59 浏览: 34
ID3(Iterative Dichotomiser 3)是一种用于分类任务的决策树算法,它在每个节点上选择信息增益最大的属性来划分数据。以下是用 Python 实现的一个简化版 ID3 示例代码:
```python
import math
from collections import Counter, defaultdict
class Node:
    """A single node of a decision tree.

    Attributes:
        attribute: Key used to look up a sample's value when descending from
            this node (``None`` when not given).
        value: Payload of the node; for a terminal node this is the result
            returned to the caller (``None`` when not given).
        is_terminal: ``True`` when the node is a leaf.
        children: Mapping from an attribute value to the child ``Node``
            reached for that value.
    """

    def __init__(self, attribute=None, value=None, is_terminal=False, children=None):
        """Store the given fields, creating an empty child map if none is passed."""
        self.attribute = attribute
        self.value = value
        self.is_terminal = is_terminal
        # Build a fresh dict per instance so nodes never share one child map
        # through a mutable default argument.
        if children is None:
            children = {}
        self.children = children
def id3_dataset(dataset, attributes):
    """Build an ID3 decision tree from a tabular dataset.

    Every row of *dataset* is a sequence whose **last** element is the class
    label; the elements before it are attribute values, where the value at
    position ``i`` belongs to ``attributes[i]``. At each node the attribute
    with the highest information gain is chosen as the split.

    Args:
        dataset: Iterable of rows (tuples or lists). Attribute values must be
            hashable because they become keys of ``Node.children``.
        attributes: Iterable of attribute names, in column order. Names that
            have no corresponding column in every row are ignored.

    Returns:
        The root ``Node``. Internal nodes carry the chosen attribute name in
        ``attribute`` and map each observed value to a subtree in
        ``children``. Leaves have ``is_terminal=True`` and the predicted
        label in ``value`` (the most common label when the rows reaching the
        leaf are not pure). An empty dataset yields a single leaf whose
        value is the string ``'None'``.

    Raises:
        ValueError: If a row is empty and therefore has no class label.
    """
    rows = [tuple(row) for row in dataset]
    if not rows:
        # Nothing to learn from: return the placeholder leaf for empty data.
        return Node(is_terminal=True, value='None')
    if any(not row for row in rows):
        raise ValueError("every row must contain at least a class label")

    # The last column is the label, so only the columns before it (and only
    # those present in every row) can be used as split candidates.
    attribute_columns = min(len(row) for row in rows) - 1
    candidates = [
        (name, column)
        for column, name in enumerate(attributes)
        if column < attribute_columns
    ]

    def entropy(subset):
        """Return the Shannon entropy (in bits) of the labels in *subset*."""
        total = len(subset)
        label_counts = Counter(row[-1] for row in subset)
        return -sum(
            count / total * math.log2(count / total)
            for count in label_counts.values()
        )

    def split_by(subset, column):
        """Group the rows of *subset* by their value in *column*."""
        groups = defaultdict(list)
        for row in subset:
            groups[row[column]].append(row)
        return groups

    def information_gain(subset, column):
        """Return the entropy reduction achieved by splitting on *column*."""
        total = len(subset)
        remainder = sum(
            len(group) / total * entropy(group)
            for group in split_by(subset, column).values()
        )
        return entropy(subset) - remainder

    def build_tree(subset, remaining):
        """Recursively grow the subtree for the non-empty *subset*."""
        label_counts = Counter(row[-1] for row in subset)
        # Stop when the subset is pure or no attribute is left to split on;
        # in the latter case fall back to the majority label.
        if len(label_counts) == 1 or not remaining:
            return Node(is_terminal=True, value=label_counts.most_common(1)[0][0])

        best = max(
            remaining,
            key=lambda candidate: information_gain(subset, candidate[1]),
        )
        best_name, best_column = best
        # An attribute is used at most once along any root-to-leaf path.
        rest = [candidate for candidate in remaining if candidate != best]

        node = Node(attribute=best_name)
        for value, group in split_by(subset, best_column).items():
            node.children[value] = build_tree(group, rest)
        return node

    return build_tree(rows, candidates)
# Example usage.
# A small binary-classification dataset (the classic "play tennis" table).
# Each row lists the attribute values in the same order as `attributes`,
# followed by the class label ('Yes' / 'No') as the last element.
dataset = [
    ('Sunny', 'Hot', 'High', 'Weak', 'No'),
    ('Sunny', 'Hot', 'High', 'Strong', 'No'),
    ('Overcast', 'Hot', 'High', 'Weak', 'Yes'),
    ('Rainy', 'Mild', 'High', 'Weak', 'Yes'),
    ('Rainy', 'Cool', 'Normal', 'Weak', 'Yes'),
    ('Rainy', 'Cool', 'Normal', 'Strong', 'No'),
    ('Overcast', 'Cool', 'Normal', 'Strong', 'Yes'),
    ('Sunny', 'Mild', 'High', 'Weak', 'No'),
    ('Sunny', 'Cool', 'Normal', 'Weak', 'Yes'),
    ('Rainy', 'Mild', 'Normal', 'Weak', 'Yes'),
    ('Sunny', 'Mild', 'Normal', 'Strong', 'Yes'),
    ('Overcast', 'Mild', 'High', 'Strong', 'Yes'),
    ('Overcast', 'Hot', 'Normal', 'Weak', 'Yes'),
    ('Rainy', 'Mild', 'High', 'Strong', 'No'),
]
attributes = ['Outlook', 'Temp', 'Humidity', 'Wind']
root = id3_dataset(dataset, attributes)
def classify(tree, sample):
    """Return the prediction of a decision tree for one sample.

    Starting at *tree*, repeatedly follow the child selected by the sample's
    value for the current node's attribute until a terminal node is reached.

    Args:
        tree: Root node exposing ``is_terminal``, ``value``, ``attribute``
            and ``children``.
        sample: Mapping from attribute name to the sample's value.

    Returns:
        The ``value`` stored on the terminal node that was reached.

    Raises:
        KeyError: If *sample* lacks an attribute the tree tests, or holds a
            value for which the current node has no child.
    """
    node = tree
    while not node.is_terminal:
        node = node.children[sample[node.attribute]]
    return node.value
# Try the classifier on a single sample and print the predicted label.
print(
    classify(
        root,
        {
            'Outlook': 'Sunny',
            'Temp': 'Hot',
            'Humidity': 'High',
            'Wind': 'Weak',
        },
    )
)
```
请注意,这段代码只是一个简化版的 ID3 实现,没有包含完善的错误处理和性能优化。在实际应用中,通常还需要进行数据预处理、特征选择以及剪枝等步骤。
阅读全文