ID3 算法的 Python 实现
时间: 2023-08-02 10:05:16 浏览: 50
以下是一个用 Python 实现 ID3 决策树算法的简单例子:
```python
import math
from collections import Counter

import pandas as pd
def entropy(data):
    """Return the Shannon entropy (in bits) of the labels in ``data``.

    Args:
        data: Any iterable of hashable class labels (list, pandas Series,
            generator, ...).

    Returns:
        The entropy as a float. An empty input has entropy ``0.0``.
    """
    counts = Counter(data)
    # Derive the total from the counts so one-shot iterables work as well.
    total = sum(counts.values())
    if total == 0:
        return 0.0
    probabilities = (count / total for count in counts.values())
    return sum(-p * math.log2(p) for p in probabilities)
def information_gain(data, split_attribute_name, target_attribute_name):
    """Return the information gain obtained by splitting on one attribute.

    The gain is the entropy of the target column minus the size-weighted
    average entropy of the target column within each group of rows that
    share a value of the split attribute.

    Args:
        data: pandas DataFrame containing both columns.
        split_attribute_name: Name of the column to split on.
        target_attribute_name: Name of the class-label column.

    Returns:
        The information gain in bits.
    """
    total = len(data)
    split_column = data[split_attribute_name]
    target_column = data[target_attribute_name]

    weighted_entropy = 0.0
    # dict.fromkeys yields the distinct values in first-appearance order, so
    # the floating-point summation order is the same on every run (iterating
    # a set of strings is subject to hash randomization).
    for value in dict.fromkeys(split_column):
        subset_labels = target_column[split_column == value]
        weighted_entropy += len(subset_labels) / total * entropy(subset_labels)
    return entropy(target_column) - weighted_entropy
def id3(data, original_data, features, target_attribute_name, parent_node_class=None):
    """Recursively build an ID3 decision tree.

    Args:
        data: pandas DataFrame with the examples that reached this node.
        original_data: The full training set. It is not read here; it is only
            passed down the recursion and kept so existing callers still work.
        features: Sequence of column names still available for splitting.
        target_attribute_name: Name of the class-label column.
        parent_node_class: Majority class of the parent node, returned when
            ``data`` is empty.

    Returns:
        A class label for a leaf, or a nested dict of the form
        ``{feature: {value: subtree_or_label, ...}}`` for an internal node.
    """
    # No examples reached this node: fall back to the parent's majority class.
    if len(data) == 0:
        return parent_node_class

    labels = data[target_attribute_name]

    # Pure node: every example carries the same label.
    if len(set(labels)) == 1:
        return labels.iloc[0]

    label_counts = labels.value_counts()
    # value_counts() skips missing labels, so it can be empty even when
    # ``data`` is not; keep the parent's class in that case.
    majority_class = label_counts.idxmax() if len(label_counts) else parent_node_class

    # No feature left to split on: predict the majority class of the examples
    # at this node (not the parent's, which is None at the root).
    if len(features) == 0:
        return majority_class

    # Split on the feature with the highest information gain; ties go to the
    # feature that appears first in ``features``.
    gains = [
        information_gain(data, feature, target_attribute_name)
        for feature in features
    ]
    best_feature = features[gains.index(max(gains))]
    remaining_features = [f for f in features if f != best_feature]

    branches = {}
    # First-appearance order keeps the branch order reproducible across runs.
    for value in dict.fromkeys(data[best_feature]):
        subset = data[data[best_feature] == value].drop(columns=[best_feature])
        branches[value] = id3(
            subset,
            original_data,
            remaining_features,
            target_attribute_name,
            majority_class,
        )
    return {best_feature: branches}
# Example: the classic "play tennis" data set (uses pandas as ``pd``).
data = pd.DataFrame({
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
    'Temperature': ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool', 'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild'],
    'Humidity': ['High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High'],
    'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong', 'Weak'],
    'Play': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No'],
})
# Every column except the last one ('Play', the target) is a candidate
# feature; pass the names as a plain list rather than a pandas Index.
tree = id3(data, data, list(data.columns[:-1]), 'Play')
```
这个例子使用 pandas 存储数据,并且假设数据集中的最后一列是目标属性。请注意,这个实现版本不包括剪枝。