Implementing an ID3 decision tree with sklearn in Python
The steps to implement an ID3-style decision tree with the sklearn library are as follows:
1. Import the libraries and the dataset
```python
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target
```
2. Define the decision tree model
```python
dt = DecisionTreeClassifier(criterion='entropy')
```
3. Train the model
```python
dt.fit(X, y)
```
4. Make predictions
```python
result = dt.predict([[5.1, 3.5, 1.4, 0.2]])
```
Here `criterion='entropy'` selects information gain, ID3's splitting criterion, instead of the default Gini impurity (`criterion='gini'`). Strictly speaking, sklearn's `DecisionTreeClassifier` implements an optimized version of CART, so the entropy criterion only approximates ID3. Other useful parameters include `max_depth`, which caps the depth of the tree, and `min_samples_split`, the minimum number of samples required to split a node. The `predict` call above returns the predicted class index; for the sample `[5.1, 3.5, 1.4, 0.2]` it should be `0` (setosa).
Note that different datasets and applications usually call for different parameter settings to get good results.
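For example, a lightly regularized tree evaluated on a held-out split might look like this (a minimal sketch; the exact accuracy depends on the split and the parameters):
```python
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the data for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
dt = DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_split=5)
dt.fit(X_train, y_train)
print('accuracy:', accuracy_score(y_test, dt.predict(X_test)))
```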
Related questions
Use Python to implement the experiment "Decision tree algorithm". Main content: design and implement entropy, empirical conditional entropy, information gain, and related methods; implement the ID3 algorithm; get familiar with the decision tree algorithm in the sklearn library; evaluate the model, plot the decision tree, the confusion matrix, and the ROC curve, and rank the features by importance. Experiment data: the breast cancer dataset.
OK, here is a simple code example that implements entropy, empirical conditional entropy, and information gain, and uses the ID3 algorithm to build a decision tree.
First, we import the required libraries and the dataset. Here we use the breast cancer dataset that ships with sklearn.
```python
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
cancer = load_breast_cancer()
data = pd.DataFrame(np.c_[cancer['data'], cancer['target']], columns=np.append(cancer['feature_names'], ['target']))
```
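A quick look at the resulting frame confirms its shape and class balance:
```python
print(data.shape)                      # (569, 31): 30 features plus the target
print(data['target'].value_counts())   # 357 benign (1.0) vs 212 malignant (0.0)
```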
Next, we define some helper functions to compute entropy, empirical conditional entropy, and information gain.
```python
# Entropy of a label column
def entropy(target_col):
    elements, counts = np.unique(target_col, return_counts=True)
    entropy_value = np.sum([
        (-counts[i] / np.sum(counts)) * np.log2(counts[i] / np.sum(counts))
        for i in range(len(elements))
    ])
    return entropy_value

# Empirical conditional entropy of the target given a feature
def conditional_entropy(data, feature, target):
    elements, counts = np.unique(data[feature], return_counts=True)
    cond_entropy = np.sum([
        (counts[i] / np.sum(counts)) *
        entropy(data.where(data[feature] == elements[i]).dropna()[target])
        for i in range(len(elements))
    ])
    return cond_entropy

# Information gain from splitting on a feature
def information_gain(data, feature, target):
    return entropy(data[target]) - conditional_entropy(data, feature, target)
```
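As a quick sanity check of these helpers we can compute the entropy of the target and the gain of one feature. Since `information_gain` treats every distinct value of a column as its own category, we first binarize a continuous column (the `check` frame and `radius_high` column are hypothetical helpers, not part of the original experiment):
```python
# Work on a small copy so the original frame stays unchanged
check = data[['target']].copy()
check['radius_high'] = (data['mean radius'] > data['mean radius'].median()).astype(int)
print('H(target) =', entropy(check['target']))
print('IG(radius_high) =', information_gain(check, 'radius_high', 'target'))
```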
Next, we can implement the ID3 algorithm to build the decision tree. We start by defining a node class and a tree class.
```python
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature      # feature this node splits on
        self.threshold = threshold  # unused here; kept for a thresholded variant
        self.left = left            # left subtree
        self.right = right          # right subtree
        self.value = value          # class label if this node is a leaf

class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth  # maximum depth of the tree
```
Then we write a recursive `build_tree` method for the `DecisionTree` class that builds the tree. It takes a dataset and the current depth as input and returns a node.
```python
# Method of the DecisionTree class above. Note: this sketch assumes discrete
# features with two values each, since it builds exactly a left and a right
# branch at every split.
def build_tree(self, data, depth=0):
    num_samples, num_features = data.shape
    num_labels = len(np.unique(data.iloc[:, -1]))
    # If the subset is pure, or the maximum depth is reached, return a leaf
    # carrying the majority class
    if num_labels == 1 or depth == self.max_depth:
        return Node(value=data.iloc[:, -1].mode()[0])
    # Pick the feature with the highest information gain
    information_gains = [
        information_gain(data, feature, 'target')
        for feature in data.iloc[:, :-1]
    ]
    best_feature_idx = np.argmax(information_gains)
    best_feature = data.columns[best_feature_idx]
    # If the best gain is zero, return a majority-class leaf
    if information_gains[best_feature_idx] == 0:
        return Node(value=data.iloc[:, -1].mode()[0])
    # Split on the two values of the best feature and recurse
    elements, counts = np.unique(data[best_feature], return_counts=True)
    if len(elements) < 2:
        # A constant column cannot be split
        return Node(value=data.iloc[:, -1].mode()[0])
    left_data = data[data[best_feature] == elements[0]]
    right_data = data[data[best_feature] == elements[1]]
    left_tree = self.build_tree(left_data, depth + 1)
    right_tree = self.build_tree(right_data, depth + 1)
    # Return an internal node holding the two subtrees
    return Node(feature=best_feature, left=left_tree, right=right_tree)
```
Finally, we can define a `fit` method that trains the model and returns the tree.
```python
# Method of the DecisionTree class above
def fit(self, data):
    self.tree = self.build_tree(data)
    return self.tree
```
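To smoke-test this sketch end to end (a hypothetical example, assuming `build_tree` and `fit` have been attached to the `DecisionTree` class, and after binarizing every feature at its median so that each split has exactly the two branches this implementation requires):
```python
# Hypothetical smoke test: median-binarize the features, then fit on a sample
binary = (data.iloc[:, :-1] > data.iloc[:, :-1].median()).astype(int)
binary['target'] = data['target'].values
tree = DecisionTree(max_depth=3)
root = tree.fit(binary.sample(100, random_state=0))
print(root.feature)  # name of the feature chosen at the root
```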
Next, we can use sklearn's DecisionTreeClassifier to build a decision tree and evaluate it with a confusion matrix and an ROC curve.
```python
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc
# Split into training and test sets
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit sklearn's decision tree
clf = DecisionTreeClassifier(max_depth=3)
clf.fit(X_train, y_train)

# Evaluate with a confusion matrix and an ROC curve
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
# Use predicted probabilities (not hard labels) for a meaningful ROC curve
y_score = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
print('Confusion matrix:')
print(cm)
print('Area under the ROC curve:', roc_auc)
```
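The experiment also asks for a plot of the tree itself; here is a minimal sketch using matplotlib and sklearn's built-in `plot_tree` (available since sklearn 0.21), plus the ROC curve drawn from the arrays computed above:
```python
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Draw the fitted tree
plt.figure(figsize=(16, 8))
plot_tree(clf, feature_names=list(X.columns),
          class_names=list(cancer['target_names']), filled=True)
plt.show()

# Draw the ROC curve
plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.3f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')  # chance level
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()
```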
Finally, we can use the `feature_importances_` attribute of the fitted classifier to rank the features by importance.
```python
# Rank the features by importance
feature_importances = pd.DataFrame(
    clf.feature_importances_,
    index=X_train.columns,
    columns=['importance']
).sort_values('importance', ascending=False)
print('Feature importance ranking:')
print(feature_importances)
```
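If a plot is preferred, the ranking can also be drawn as a horizontal bar chart (a sketch using pandas' built-in plotting):
```python
import matplotlib.pyplot as plt

# Plot the ten most important features
feature_importances.head(10).sort_values('importance').plot.barh(legend=False)
plt.xlabel('importance')
plt.tight_layout()
plt.show()
```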
Give Python code that implements the ID3 decision tree algorithm without calling the sklearn library and that can output a visualization of the decision tree.
Below is a Python implementation of the ID3 decision tree algorithm that outputs a visualization of the tree, without calling sklearn:
```python
import numpy as np
from collections import Counter
from math import log2
import matplotlib.pyplot as plt
class Node:
    '''
    Decision tree node.
    '''
    def __init__(self, feature=None, label=None, is_leaf=False, branches=None):
        '''
        Initialize the node.
        '''
        self.feature = feature    # index of the feature this node splits on
        self.label = label        # class label (for leaf nodes)
        self.is_leaf = is_leaf    # whether this node is a leaf
        # Use None as the default to avoid sharing one dict across all nodes
        self.branches = branches if branches is not None else {}

    def __str__(self):
        '''
        Return the node information as a string.
        '''
        if self.is_leaf:
            return f'[leaf, label: {self.label}]'
        else:
            return f'[internal, feature: {self.feature}]'


class DecisionTree:
    '''
    ID3 decision tree.
    '''
    def __init__(self, train_x, train_y, test_x=None, test_y=None, feature_names=None):
        '''
        Initialize the tree.
        '''
        self.train_x = train_x              # training features
        self.train_y = train_y              # training labels
        self.test_x = test_x                # test features
        self.test_y = test_y                # test labels
        self.feature_names = feature_names  # list of feature names
        self.root = None                    # root node of the tree

    def fit(self):
        '''
        Build the decision tree from the training data.
        '''
        self.root = self.build_tree(self.train_x, self.train_y, feature_names=self.feature_names)

    def predict(self, x):
        '''
        Predict the class of a single sample.
        '''
        return self.classify(x, self.root)
    def build_tree(self, x, y, feature_names):
        '''
        Recursively build the decision tree.
        '''
        if len(set(y)) == 1:
            # All labels are identical: return a leaf
            return Node(label=y[0], is_leaf=True)
        elif len(feature_names) == 0:
            # No features left: return a majority-class leaf
            return Node(label=Counter(y).most_common(1)[0][0], is_leaf=True)
        else:
            # Choose the feature with the highest information gain
            best_feature = self.choose_best_feature(x, y)
            if best_feature == -1:
                # No feature gives a positive gain: return a majority-class leaf
                return Node(label=Counter(y).most_common(1)[0][0], is_leaf=True)
            current_node = Node(feature=best_feature)
            # Split the data on each value of the best feature and recurse
            for value in set(x[:, best_feature]):
                sub_x, sub_y = self.split_dataset(x, y, best_feature, value)
                if len(sub_x) == 0:
                    # Empty split: fall back to the parent's majority class
                    leaf_node = Node(label=Counter(y).most_common(1)[0][0], is_leaf=True)
                    current_node.branches[value] = leaf_node
                else:
                    sub_feature_names = [name for name in feature_names
                                         if name != self.feature_names[best_feature]]
                    sub_tree = self.build_tree(sub_x, sub_y, feature_names=sub_feature_names)
                    current_node.branches[value] = sub_tree
            return current_node
    def choose_best_feature(self, x, y):
        '''
        Choose the feature with the maximum information gain.
        '''
        n_features = x.shape[1]
        # Entropy of the full label set
        base_entropy = self.calc_entropy(y)
        max_ig = 0         # best information gain found so far
        best_feature = -1  # index of the best feature
        for i in range(n_features):
            # Information gain of the i-th feature
            sub_x = x[:, i]
            ig = base_entropy - self.calc_cond_entropy(sub_x, y)
            if ig > max_ig:
                max_ig = ig
                best_feature = i
        return best_feature

    def calc_entropy(self, y):
        '''
        Compute the entropy of a label set.
        '''
        n_samples = len(y)
        counter = Counter(y)
        probs = [count / n_samples for count in counter.values()]
        return -sum(prob * log2(prob) for prob in probs)
    def calc_cond_entropy(self, x, y):
        '''
        Compute the conditional entropy of the labels given a feature column.
        '''
        n_samples = len(y)
        cond_entropy = 0
        for value in set(x):
            # Fraction of samples where the feature takes this value
            weight = np.sum(x == value) / n_samples
            sub_y = y[x == value]
            # Weighted average of the subset entropies
            cond_entropy += weight * self.calc_entropy(sub_y)
        return cond_entropy
    def split_dataset(self, x, y, feature_idx, value):
        '''
        Select the samples where the given feature equals the given value.
        '''
        mask = x[:, feature_idx] == value
        return x[mask], y[mask]

    def classify(self, x, node):
        '''
        Classify a single sample by walking down the tree.
        '''
        if node.is_leaf:
            # Leaf node: return its label
            return node.label
        else:
            # Follow the branch matching this sample's feature value
            value = x[node.feature]
            sub_tree = node.branches[value]
            return self.classify(x, sub_tree)
    def plot(self):
        '''
        Visualize the decision tree.
        '''
        fig, ax = plt.subplots()
        ax.axis('off')
        self.plot_tree(ax, self.root, (0.5, 1.0), 1.0)
        plt.show()

    def plot_tree(self, ax, node, pos, x_span):
        '''
        Recursively draw a node at pos, spreading its children over x_span.
        '''
        if node.is_leaf:
            # Leaf node: draw the class label
            ax.text(pos[0], pos[1], str(node.label),
                    horizontalalignment='center', verticalalignment='top')
            return
        # Internal node: draw the feature name
        ax.text(pos[0], pos[1], self.feature_names[node.feature],
                horizontalalignment='center', verticalalignment='top')
        n_branches = len(node.branches)
        for i, (value, sub_node) in enumerate(node.branches.items()):
            # Spread the children evenly below the parent
            child_x = pos[0] - x_span / 2 + (i + 0.5) * x_span / n_branches
            child_pos = (child_x, pos[1] - 0.15)
            ax.plot([pos[0], child_pos[0]], [pos[1], child_pos[1]], color='b', lw=2)
            # Label the edge with the feature value
            ax.text((pos[0] + child_pos[0]) / 2, (pos[1] + child_pos[1]) / 2,
                    str(value), color='r', fontsize=8, horizontalalignment='center')
            self.plot_tree(ax, sub_node, child_pos, x_span / n_branches)
    def depth(self, node):
        '''
        Compute the depth of the subtree rooted at node.
        '''
        if node.is_leaf:
            return 0
        else:
            return 1 + max(self.depth(sub_node) for sub_node in node.branches.values())
if __name__ == '__main__':
    # Example: the classic play-tennis dataset
    train_x = np.array([
        ['sunny', 'hot', 'high', 'weak'],
        ['sunny', 'hot', 'high', 'strong'],
        ['overcast', 'hot', 'high', 'weak'],
        ['rainy', 'mild', 'high', 'weak'],
        ['rainy', 'cool', 'normal', 'weak'],
        ['rainy', 'cool', 'normal', 'strong'],
        ['overcast', 'cool', 'normal', 'strong'],
        ['sunny', 'mild', 'high', 'weak'],
        ['sunny', 'cool', 'normal', 'weak'],
        ['rainy', 'mild', 'normal', 'weak'],
        ['sunny', 'mild', 'normal', 'strong'],
        ['overcast', 'mild', 'high', 'strong'],
        ['overcast', 'hot', 'normal', 'weak'],
        ['rainy', 'mild', 'high', 'strong']
    ])
    train_y = np.array(['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no',
                        'yes', 'yes', 'yes', 'yes', 'yes', 'no'])
    feature_names = ['outlook', 'temperature', 'humidity', 'windy']
    tree = DecisionTree(train_x, train_y, feature_names=feature_names)
    tree.fit()
    tree.plot()
```
Running the script displays the visualized decision tree in a window.
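To classify a new sample with the fitted tree (a hypothetical example using the same feature encoding as the training data):
```python
# Predict the label for a new day: sunny, cool, high humidity, strong wind
sample = np.array(['sunny', 'cool', 'high', 'strong'])
print(tree.predict(sample))  # expected: 'no'
```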