Write an ID3 decision tree classification algorithm in Python
Sure. Below is a Python implementation of the ID3 decision tree classification algorithm: it computes entropy, splits the data set, selects the best feature by information gain, and builds the tree recursively.
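ID3 chooses, at every node, the feature whose split maximizes the information gain

$$
H(D) = -\sum_{k} p_k \log_2 p_k, \qquad
\operatorname{Gain}(D, A) = H(D) - \sum_{v \in \operatorname{values}(A)} \frac{|D_v|}{|D|}\, H(D_v),
$$

where $p_k$ is the fraction of samples in $D$ with class $k$ and $D_v$ is the subset of $D$ in which feature $A$ takes the value $v$. The `entropy`, `split_data` and `choose_best_feature` functions below implement these two formulas, and `create_tree` applies them recursively: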
```python
import math


def entropy(data):
    """
    Compute the Shannon entropy of a data set.
    :param data: list of samples; the last element of each sample is the class label
    :return: entropy in bits
    """
    num_entries = len(data)
    label_counts = {}
    for feat_vec in data:
        current_label = feat_vec[-1]
        if current_label not in label_counts:
            label_counts[current_label] = 0
        label_counts[current_label] += 1
    ent = 0.0
    for key in label_counts:
        prob = float(label_counts[key]) / num_entries
        ent -= prob * math.log(prob, 2)
    return ent


def split_data(data, axis, value):
    """
    Split the data set, keeping the samples whose feature at `axis` equals `value`.
    :param data: data set
    :param axis: index of the feature to split on
    :param value: feature value to keep
    :return: sub-data set with the split feature column removed
    """
    ret_data = []
    for feat_vec in data:
        if feat_vec[axis] == value:
            reduced_feat_vec = feat_vec[:axis]
            reduced_feat_vec.extend(feat_vec[axis + 1:])
            ret_data.append(reduced_feat_vec)
    return ret_data


def choose_best_feature(data):
    """
    Choose the feature that yields the highest information gain.
    :param data: data set
    :return: index of the best feature to split on
    """
    num_features = len(data[0]) - 1
    base_entropy = entropy(data)
    best_info_gain = 0.0
    best_feature = -1
    for i in range(num_features):
        feat_list = [example[i] for example in data]
        unique_vals = set(feat_list)
        new_entropy = 0.0
        for value in unique_vals:
            sub_data = split_data(data, i, value)
            prob = len(sub_data) / float(len(data))
            new_entropy += prob * entropy(sub_data)
        info_gain = base_entropy - new_entropy
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_feature = i
    return best_feature


def majority_class(class_list):
    """
    Return the class that occurs most often.
    :param class_list: list of class labels
    :return: the majority class
    """
    class_count = {}
    for vote in class_list:
        if vote not in class_count:
            class_count[vote] = 0
        class_count[vote] += 1
    sorted_class_count = sorted(class_count.items(), key=lambda x: x[1], reverse=True)
    return sorted_class_count[0][0]


def create_tree(data, labels):
    """
    Recursively build the ID3 decision tree.
    :param data: data set
    :param labels: list of feature names (mutated: the chosen feature is removed)
    :return: decision tree as nested dicts
    """
    class_list = [example[-1] for example in data]
    # Stop if all samples share the same class.
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]
    # Stop if no features remain; fall back to the majority class.
    if len(data[0]) == 1:
        return majority_class(class_list)
    best_feat = choose_best_feature(data)
    best_feat_label = labels[best_feat]
    my_tree = {best_feat_label: {}}
    del labels[best_feat]
    feat_values = [example[best_feat] for example in data]
    unique_vals = set(feat_values)
    for value in unique_vals:
        sub_labels = labels[:]  # copy so recursion does not corrupt this level's label list
        my_tree[best_feat_label][value] = create_tree(split_data(data, best_feat, value), sub_labels)
    return my_tree
```
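As a quick sanity check, the helper functions can be exercised on a tiny hand-checkable data set (the `toy` set here is only for illustration and is not part of the weather example below):

```python
# Toy set: one feature plus a class label; 3 'Y' and 1 'N'.
toy = [[1, 'Y'], [1, 'Y'], [0, 'N'], [0, 'Y']]

print(entropy(toy))              # 0.8113 = -(3/4)*log2(3/4) - (1/4)*log2(1/4)
print(split_data(toy, 0, 1))     # [['Y'], ['Y']] -- the feature column is removed
print(choose_best_feature(toy))  # 0, the only feature (gain 0.311)
```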
Usage example:
```python
data = [
    [0, 0, 0, 0, 'N'],
    [0, 0, 0, 1, 'N'],
    [1, 0, 0, 0, 'Y'],
    [2, 1, 0, 0, 'Y'],
    [2, 2, 1, 0, 'Y'],
    [2, 2, 1, 1, 'N'],
    [1, 2, 1, 1, 'Y'],
    [0, 1, 0, 0, 'N'],
    [0, 2, 1, 0, 'Y'],
    [2, 1, 1, 0, 'Y'],
    [0, 1, 1, 1, 'Y'],
    [1, 1, 0, 1, 'Y'],
    [1, 0, 1, 0, 'Y'],
    [2, 1, 0, 1, 'N']
]
labels = ['outlook', 'temperature', 'humidity', 'windy']
tree = create_tree(data, labels)
print(tree)
```
The output is (the exact key order may differ):
```
{'outlook': {0: {'humidity': {0: 'N', 1: 'Y'}}, 1: 'Y', 2: {'windy': {0: 'Y', 1: 'N'}}}}
```
With this, we have implemented a simple ID3 decision tree classifier. Note that ID3 as written here handles only discrete (categorical) features and does no pruning.
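The code above only builds the tree; to classify a new sample you still have to walk it. A minimal `classify` helper, added here as a sketch on the assumption that the tree is the nested dict produced by `create_tree`, could look like this. Since `create_tree` deletes entries from the `labels` list it receives, a fresh copy of the feature names is used for the lookup:

```python
def classify(tree, feat_labels, test_vec):
    """Walk the nested-dict tree until a leaf (class label) is reached."""
    feat_label = next(iter(tree))            # feature tested at this node
    feat_index = feat_labels.index(feat_label)
    child = tree[feat_label].get(test_vec[feat_index])
    if child is None:                        # no branch for this feature value
        return None
    if isinstance(child, dict):              # internal node: keep descending
        return classify(child, feat_labels, test_vec)
    return child                             # leaf: the predicted class label


feat_labels = ['outlook', 'temperature', 'humidity', 'windy']  # fresh copy; create_tree mutated `labels`
print(classify(tree, feat_labels, [0, 1, 0, 1]))  # prints 'N' for this feature vector
```

Returning `None` for an unseen feature value is just one possible choice; falling back to the majority class of the training set would be another.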