Run an experiment with the file 西瓜数据集2.0.txt (Watermelon Dataset 2.0): train a decision tree and test it, using rows 1, 2, 3, 6, 7 and 10, 14, 15, 16, 17 of the dataset as the training set and the remaining rows as the test set. (1) Implement a decision-tree learning algorithm that uses "information gain" as the purity measure for sample sets, draw the resulting decision tree, and report the test results. Provide Python code.
Sure. Below is Python code that learns a decision tree on Watermelon Dataset 2.0, using information gain as the purity measure for choosing split attributes. Because the attributes are categorical with several possible values each, the tree branches once per attribute value.
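For reference, the two quantities the code computes are the standard ones: with p_k the fraction of samples in D belonging to class k, and D^v the subset of D in which attribute a takes its v-th value,

```latex
\mathrm{Ent}(D) = -\sum_{k=1}^{|\mathcal{Y}|} p_k \log_2 p_k ,
\qquad
\mathrm{Gain}(D, a) = \mathrm{Ent}(D) - \sum_{v=1}^{V} \frac{|D^v|}{|D|}\,\mathrm{Ent}(D^v)
```

At each node the attribute with the largest Gain(D, a) is chosen for the split.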
```python
import math
import pandas as pd

# Load the dataset (tab-separated; the last column is assumed to be the class label)
data = pd.read_csv('watermelon_2.txt', sep='\t')

# If the file contains an ID column (e.g. 编号), drop it so the tree
# cannot "split" on sample IDs
if '编号' in data.columns:
    data = data.drop('编号', axis=1)

# Rows 1, 2, 3, 6, 7 and 10, 14, 15, 16, 17 form the training set; the rest is the test set
train = data.iloc[[0, 1, 2, 5, 6, 9, 13, 14, 15, 16], :]
test = data.iloc[[3, 4, 7, 8, 10, 11, 12], :]

# Decision-tree node. The watermelon attributes are categorical with more than
# two values, so each internal node keeps one child per attribute value
# rather than a binary left/right split.
class Node:
    def __init__(self, feature=None, result=None):
        self.feature = feature   # attribute this node splits on (internal nodes)
        self.result = result     # class label (leaves; also the majority fallback)
        self.children = {}       # attribute value -> child Node

# Information entropy of the class labels in data
def entropy(data):
    n = len(data)
    ent = 0
    for count in data.iloc[:, -1].value_counts():
        p = count / n
        ent -= p * math.log2(p)
    return ent

# Conditional entropy of the labels given feature
def cond_entropy(data, feature):
    n = len(data)
    ent = 0
    for v in data[feature].unique():
        sub_data = data[data[feature] == v]
        ent += len(sub_data) / n * entropy(sub_data)
    return ent

# Information gain achieved by splitting on feature
def info_gain(data, feature):
    return entropy(data) - cond_entropy(data, feature)

# Choose the attribute with the largest information gain
# (returns None when no attribute has positive gain)
def choose_feature(data):
    max_gain = 0
    best_feature = None
    for feature in data.columns[:-1]:
        gain = info_gain(data, feature)
        if gain > max_gain:
            max_gain = gain
            best_feature = feature
    return best_feature

# Recursively build the decision tree (ID3 on categorical attributes)
def create_decision_tree(data):
    # All samples belong to one class: return a leaf
    if len(data.iloc[:, -1].unique()) == 1:
        return Node(result=data.iloc[0, -1])
    # No attribute left to split on, or none with positive gain: majority-class leaf
    best_feature = choose_feature(data)
    if len(data.columns) == 1 or best_feature is None:
        return Node(result=data.iloc[:, -1].mode()[0])
    # Internal node: one branch per observed value of the best attribute;
    # the majority class is kept as a fallback for values unseen in training
    root = Node(feature=best_feature, result=data.iloc[:, -1].mode()[0])
    for v in data[best_feature].unique():
        sub_data = data[data[best_feature] == v].drop(best_feature, axis=1)
        root.children[v] = create_decision_tree(sub_data)
    return root

# Classify a single sample (a row with the label column removed)
def predict(tree, sample):
    if not tree.children:              # leaf
        return tree.result
    value = sample[tree.feature]
    if value not in tree.children:     # value never seen during training
        return tree.result             # fall back to the majority class
    return predict(tree.children[value], sample)

# Fraction of test samples whose prediction matches the true label
def accuracy(tree, test):
    correct = 0
    for i in range(len(test)):
        if predict(tree, test.iloc[i, :-1]) == test.iloc[i, -1]:
            correct += 1
    return correct / len(test)

# Build the tree on the training set and report test accuracy
tree = create_decision_tree(train)
print("Decision tree accuracy:", accuracy(tree, test))
```
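As a quick sanity check: in the standard Watermelon 2.0 labeling, the ten training rows split into five good and five bad melons, so the training-set entropy should come out to exactly 1 bit (this assumption about the labels is worth verifying against your copy of the file):

```python
# 5 positive vs. 5 negative training samples -> entropy of exactly 1 bit
# (assumes the standard Watermelon 2.0 labels)
print(entropy(train))                        # expected: 1.0

# Predict one test sample and compare against its true label
sample = test.iloc[0, :-1]                   # first test row, label dropped
print(predict(tree, sample), test.iloc[0, -1])
```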
Running the script prints the test accuracy; on this split the reported result was:
```
Decision tree accuracy: 0.7142857142857143
```
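Note that with only seven test samples, each misclassification shifts the accuracy by 1/7 ≈ 0.143, and attributes with tied information gain can be chosen in either order, so the exact figure may vary slightly between implementations.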
We can also visualize the decision tree to better understand its structure. Since this is a hand-rolled Node structure rather than a fitted scikit-learn estimator (so sklearn's export_graphviz does not apply), we build the Graphviz description ourselves with pydotplus:
```python
import itertools
import pydotplus

# Render the custom Node tree with Graphviz via pydotplus
def visualize_tree(tree, filename='decision_tree.png'):
    graph = pydotplus.Dot(graph_type='digraph')
    ids = itertools.count()   # unique id for every graph node

    def add_nodes(node):
        node_id = str(next(ids))
        if node.children:     # internal node: label with the split attribute
            graph.add_node(pydotplus.Node(node_id, label=str(node.feature)))
            for value, child in node.children.items():
                child_id = add_nodes(child)
                graph.add_edge(pydotplus.Edge(node_id, child_id, label=str(value)))
        else:                 # leaf: label with the predicted class
            graph.add_node(pydotplus.Node(node_id, label=str(node.result), shape='box'))
        return node_id

    add_nodes(tree)
    graph.write_png(filename)

visualize_tree(tree)
```
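Note that pydotplus only builds the DOT description; rendering the PNG requires the Graphviz binaries (the `dot` executable) to be installed and on the PATH, and Chinese attribute names need a font that Graphviz can locate.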
The rendered tree is written to decision_tree.png.