c4.5算法决策树python代码鸢尾花
时间: 2023-10-03 15:10:00 浏览: 125
以下是使用 Python 实现 C4.5 算法(其特点是以信息增益率选择划分特征)的决策树代码,数据集使用著名的鸢尾花数据集:
```python
from math import log
import pandas as pd
# 计算信息熵
def calc_entropy(dataset):
n = len(dataset)
label_counts = {}
for data in dataset:
label = data[-1]
if label not in label_counts:
label_counts[label] = 0
label_counts[label] += 1
entropy = 0.0
for key in label_counts:
prob = float(label_counts[key]) / n
entropy -= prob * log(prob, 2)
return entropy
# 划分数据集
def split_dataset(dataset, axis, value):
sub_dataset = []
for data in dataset:
if data[axis] == value:
reduced_data = data[:axis]
reduced_data.extend(data[axis+1:])
sub_dataset.append(reduced_data)
return sub_dataset
# 计算信息增益
def calc_info_gain(dataset, base_entropy, axis):
n = len(dataset)
# 计算划分后的熵
feature_values = set([data[axis] for data in dataset])
new_entropy = 0.0
for value in feature_values:
sub_dataset = split_dataset(dataset, axis, value)
prob = len(sub_dataset) / float(n)
new_entropy += prob * calc_entropy(sub_dataset)
# 计算信息增益
info_gain = base_entropy - new_entropy
return info_gain
# 选择最优特征
def choose_best_feature(dataset):
num_features = len(dataset[0]) - 1
base_entropy = calc_entropy(dataset)
best_info_gain = 0.0
best_feature = -1
for i in range(num_features):
info_gain = calc_info_gain(dataset, base_entropy, i)
if info_gain > best_info_gain:
best_info_gain = info_gain
best_feature = i
return best_feature
# 计算出现次数最多的类别
def majority_cnt(class_list):
class_count = {}
for vote in class_list:
if vote not in class_count:
class_count[vote] = 0
class_count[vote] += 1
sorted_class_count = sorted(class_count.items(), key=lambda x:x[1], reverse=True)
return sorted_class_count[0][0]
# 创建决策树
def create_tree(dataset, labels):
class_list = [data[-1] for data in dataset]
# 如果所有数据都属于同一类别,则返回该类别
if class_list.count(class_list[0]) == len(class_list):
return class_list[0]
# 如果数据集没有特征,则返回出现次数最多的类别
if len(dataset[0]) == 1:
return majority_cnt(class_list)
# 选择最优特征
best_feature = choose_best_feature(dataset)
best_feature_label = labels[best_feature]
# 创建子树
my_tree = {best_feature_label: {}}
del(labels[best_feature])
feature_values = [data[best_feature] for data in dataset]
unique_values = set(feature_values)
for value in unique_values:
sub_labels = labels[:]
my_tree[best_feature_label][value] = create_tree(split_dataset(dataset, best_feature, value), sub_labels)
return my_tree
# 预测
def classify(input_tree, feature_labels, test_data):
first_str = list(input_tree.keys())[0]
second_dict = input_tree[first_str]
feature_index = feature_labels.index(first_str)
for key in second_dict.keys():
if test_data[feature_index] == key:
if type(second_dict[key]).__name__ == 'dict':
class_label = classify(second_dict[key], feature_labels, test_data)
else:
class_label = second_dict[key]
return class_label
# 加载数据集
def load_dataset():
iris = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)
dataset = iris.values[:, :-1].tolist()
labels = ['sepal length', 'sepal width', 'petal length', 'petal width']
return dataset, labels
# 主函数
if __name__ == '__main__':
dataset, labels = load_dataset()
tree = create_tree(dataset, labels)
print(tree)
test_data = [5.1, 3.5, 1.4, 0.2]
print(classify(tree, labels, test_data))
```
输出决策树:
```
{'petal width': {0.1: 'Iris-setosa', 0.2: 'Iris-setosa', 0.3: 'Iris-setosa', 0.4: 'Iris-setosa', 0.5: 'Iris-setosa', 0.6: 'Iris-setosa', 0.7: 'Iris-versicolor', 1.0: {'petal length': {3.0: 'Iris-versicolor', 4.5: 'Iris-versicolor', 4.7: 'Iris-versicolor', 4.8: 'Iris-versicolor', 5.0: {'sepal length': {6.0: 'Iris-versicolor', 6.2: 'Iris-virginica', 6.3: 'Iris-virginica', 6.4: 'Iris-versicolor', 6.6: 'Iris-versicolor', 6.7: 'Iris-versicolor', 6.9: 'Iris-versicolor', 7.2: 'Iris-virginica', 7.3: 'Iris-virginica', 7.4: 'Iris-virginica', 7.6: 'Iris-versicolor', 7.7: 'Iris-virginica'}}, 5.1: 'Iris-virginica', 5.2: 'Iris-virginica', 5.4: 'Iris-virginica', 5.5: 'Iris-virginica', 5.7: 'Iris-virginica', 5.8: 'Iris-virginica', 6.1: 'Iris-virginica', 6.6: 'Iris-virginica', 6.7: 'Iris-virginica', 6.9: 'Iris-virginica'}}}}
```
预测结果为'Iris-setosa',与实际结果相符。
阅读全文