决策树鸢尾花python
时间: 2023-11-08 14:05:14 浏览: 180
决策树是一种基本的分类和回归方法,它通过对数据进行分割来构建一棵树形结构,每个叶子节点代表一个类别或者一个数值。鸢尾花数据集是一个经典的分类问题,它包含了三种不同种类的鸢尾花,每种鸢尾花有四个特征:花萼长度、花萼宽度、花瓣长度和花瓣宽度。在Python中,我们可以使用scikit-learn库来构建决策树模型并对鸢尾花数据集进行分类。
具体来说,我们可以使用sklearn.tree模块中的DecisionTreeClassifier类来构建决策树模型。首先,我们需要将鸢尾花数据集分为训练集和测试集,然后使用fit()方法来拟合模型,最后使用predict()方法来对测试集进行预测并计算准确率。
需要注意的是,在构建决策树模型时,我们需要考虑连续值和缺失值的处理。对于连续值,可以采用二分法:在候选阈值中选取信息增益(或信息增益比)最大的阈值,将样本划分为两部分;对于缺失值,我们可以使用均值、中位数或者众数来进行填充。
相关问题
决策树鸢尾花 python 信息熵
### 使用 Python 实现基于信息熵的决策树分类鸢尾花数据集
#### 导入必要的库
为了实现这一目标,首先需要导入一些常用的Python库来处理数据和构建模型。
```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
```
#### 加载并探索数据集
加载鸢尾花数据集,并查看其基本信息以便理解所要处理的数据结构。
```python
# Load the iris dataset
iris = load_iris()
# Build a DataFrame whose columns are the feature names, then append the
# class labels as a `target` column
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)
# Show the first rows
print(df.head())
```
#### 划分训练集与测试集
将原始数据划分为训练集和测试集两部分,通常采用70%-80%作为训练比例较为合适。
```python
# Separate the feature columns from the label column, then hold out 30% of
# the rows for testing (fixed seed for reproducibility)
features = df.drop(columns='target')
labels = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=42,
)
```
#### 构建决策树模型
创建`DecisionTreeClassifier`实例时指定参数`criterion='entropy'`,以确保使用信息熵作为划分质量的度量标准。
```python
# Entropy-based split criterion; the depth and leaf-size limits keep the
# tree small, and the fixed seed makes the result reproducible
tree_params = {
    "criterion": "entropy",
    "random_state": 100,
    "max_depth": 3,
    "min_samples_leaf": 5,
}
clf_entropy = DecisionTreeClassifier(**tree_params)
```
#### 训练模型
使用训练集对上述定义好的决策树分类器进行拟合。
```python
# Fit on the underlying NumPy arrays of the training features and labels
clf_entropy.fit(X_train.to_numpy(), y_train.to_numpy())
```
#### 测试模型性能
最后一步是对建立起来的模型做验证工作,即用之前预留出来的那部分未见过的新样本来检验它的好坏程度。
```python
# Predict the held-out samples
y_pred = clf_entropy.predict(X_test.values)
# Percentage of predictions that match the true labels
n_correct = sum(y_pred == y_test)
accuracy = n_correct / len(y_test) * 100.
print(f"Accuracy of the model is {accuracy:.2f}%")
```
#### 特征重要性分析
了解哪些输入变量对于预测输出贡献最大可以帮助我们更好地解释模型的工作机制。
```python
# Pair every feature name with its importance score (rounded to 3 decimals)
rounded_scores = np.round(clf_entropy.feature_importances_, 3)
importances = pd.DataFrame({'feature': X_train.columns, 'importance': rounded_scores})
# Print the features ordered from most to least important
ranked = importances.sort_values(by='importance', ascending=False)
print(ranked)
```
id3决策树 鸢尾花 python_C4.5决策树Python代码实现
id3决策树 鸢尾花 Python代码实现:
```python
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
class Node:
    """A node of the binary tree built by ``ID3DecisionTree``.

    Internal nodes carry ``feature``, ``value`` and both children; leaves
    carry only ``target``.
    """

    def __init__(self, feature=None, target=None, left=None, right=None, value=None):
        self.feature = feature  # column index tested at this node (None for a leaf)
        self.target = target    # class label predicted by a leaf (None for an internal node)
        self.left = left        # subtree for samples whose feature equals `value`
        self.right = right      # subtree for every other sample
        self.value = value      # feature value that routes a sample to `left`


class ID3DecisionTree:
    """Entropy-based decision tree with binary equality splits.

    At each node the feature with the lowest conditional entropy is chosen.
    Samples whose value for that feature equals the smallest observed value
    go left and all remaining samples go right. The split value is stored on
    the node, so prediction depends only on the fitted tree.

    Note: splits are equality tests, so the tree is best suited to discrete
    features. Every split removes at least one sample, so the tree depth is
    bounded by the number of training samples.
    """

    def __init__(self):
        self.tree = None  # root Node, set by fit()

    def _entropy(self, y):
        """Return the Shannon entropy (base 2) of the labels ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        return -np.sum(probs * np.log2(probs))

    def _conditional_entropy(self, X, y, feature):
        """Return the entropy of ``y`` conditioned on column ``feature`` of ``X``."""
        column = X[:, feature]
        feature_values = np.unique(column)
        probs = [np.sum(column == value) / len(X) for value in feature_values]
        entropies = [self._entropy(y[column == value]) for value in feature_values]
        return np.sum([p * e for p, e in zip(probs, entropies)])

    def _select_feature(self, X, y):
        """Return the index of the feature with the lowest conditional entropy."""
        entropies = [
            self._conditional_entropy(X, y, feature)
            for feature in range(X.shape[1])
        ]
        return np.argmin(entropies)

    def _majority_label(self, y):
        """Return the most frequent label in ``y`` (smallest label wins ties)."""
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def _build_tree(self, X, y):
        """Recursively build and return the subtree for samples ``X`` / ``y``."""
        if len(np.unique(y)) == 1:  # pure node -> leaf
            return Node(target=y[0])
        if X.shape[1] == 0:  # no features to split on -> majority leaf
            return Node(target=self._majority_label(y))
        feature = self._select_feature(X, y)
        feature_values = np.unique(X[:, feature])
        if len(feature_values) < 2:
            # The best feature is constant here, so no split can separate
            # the remaining classes: fall back to a majority leaf.
            return Node(target=self._majority_label(y))
        value = feature_values[0]
        # Every sample goes to exactly one side, so none is dropped.
        goes_left = X[:, feature] == value
        left = self._build_tree(X[goes_left], y[goes_left])
        right = self._build_tree(X[~goes_left], y[~goes_left])
        return Node(feature=feature, left=left, right=right, value=value)

    def fit(self, X, y):
        """Build the tree from samples ``X`` (2-D) and labels ``y`` (1-D).

        Raises:
            ValueError: if ``X`` is not 2-D, if ``X`` and ``y`` differ in
                length, or if there are no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2 or len(X) != len(y):
            raise ValueError("X must be 2-D and have as many rows as y has labels")
        if len(y) == 0:
            raise ValueError("cannot fit a decision tree on an empty dataset")
        self.tree = self._build_tree(X, y)

    def _predict_sample(self, x):
        """Return the label of the leaf reached by the single sample ``x``."""
        node = self.tree
        while node.target is None:
            # Compare against the split value recorded at fit time.
            node = node.left if x[node.feature] == node.value else node.right
        return node.target

    def predict(self, X):
        """Return an array with the predicted label for each row of ``X``."""
        return np.array([self._predict_sample(x) for x in np.asarray(X)])
# Load the iris dataset
iris = load_iris()
X, y = iris.data, iris.target
# Hold out 20% of the samples for testing
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=1
)
# Train the model
model = ID3DecisionTree()
model.fit(train_X, train_y)
# Predict the test split
pred_y = model.predict(test_X)
# Accuracy = fraction of test samples predicted correctly
n_correct = (pred_y == test_y).sum()
accuracy = n_correct / len(test_y)
print('Accuracy:', accuracy)
```
C4.5决策树 Python代码实现:
```python
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
class Node:
    """A node of the binary tree built by ``C45DecisionTree``.

    Internal nodes carry ``feature``, ``threshold`` and both children;
    leaves carry only ``target``.
    """

    def __init__(self, feature=None, threshold=None, target=None, left=None, right=None):
        self.feature = feature      # column index tested at this node (None for a leaf)
        self.threshold = threshold  # samples with feature <= threshold go left
        self.target = target        # class label predicted by a leaf (None for an internal node)
        self.left = left            # subtree for feature <= threshold
        self.right = right          # subtree for feature > threshold


class C45DecisionTree:
    """Decision tree using threshold splits chosen by information gain ratio.

    Args:
        min_samples_split: minimum number of samples each side of a split
            must receive for the split to be considered.
        min_gain_ratio: minimum gain ratio the best split must reach;
            otherwise the node becomes a leaf. Pass a negative value to
            accept any split, including zero-gain ones.
    """

    def __init__(self, min_samples_split=2, min_gain_ratio=1e-4):
        self.min_samples_split = min_samples_split
        self.min_gain_ratio = min_gain_ratio
        self.tree = None  # root Node, set by fit()

    def _entropy(self, y):
        """Return the Shannon entropy (base 2) of the labels ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        return -np.sum(probs * np.log2(probs))

    def _conditional_entropy(self, X, y, feature, threshold):
        """Return the entropy of ``y`` after splitting ``feature`` at ``threshold``."""
        left_indices = X[:, feature] <= threshold
        right_indices = X[:, feature] > threshold
        weights = [np.sum(left_indices) / len(X), np.sum(right_indices) / len(X)]
        entropies = [self._entropy(y[left_indices]), self._entropy(y[right_indices])]
        return np.sum([p * e for p, e in zip(weights, entropies)])

    def _information_gain(self, X, y, feature, threshold):
        """Return the entropy reduction obtained by the given split."""
        return self._entropy(y) - self._conditional_entropy(X, y, feature, threshold)

    def _gain_ratio(self, X, y, feature, threshold):
        """Return information gain divided by split information (0 if the latter is 0)."""
        column = X[:, feature]
        fractions = [np.sum(column <= threshold) / len(X), np.sum(column > threshold) / len(X)]
        # Skip empty sides: 0 * log2(0) would turn the result into NaN.
        split_info = -np.sum([p * np.log2(p) for p in fractions if p > 0])
        if split_info == 0:
            return 0
        return self._information_gain(X, y, feature, threshold) / split_info

    def _select_feature_and_threshold(self, X, y):
        """Return the best ``(feature, threshold)`` pair, or ``(None, None)``.

        ``(None, None)`` is returned when no candidate satisfies
        ``min_samples_split`` or when the best gain ratio is below
        ``min_gain_ratio``.
        """
        max_gain_ratio = -1
        best_feature, best_threshold = None, None
        for feature in range(X.shape[1]):
            column = X[:, feature]
            for threshold in np.unique(column):
                n_left = np.count_nonzero(column <= threshold)
                n_right = np.count_nonzero(column > threshold)
                if n_left < self.min_samples_split or n_right < self.min_samples_split:
                    continue
                gain_ratio = self._gain_ratio(X, y, feature, threshold)
                if gain_ratio > max_gain_ratio:
                    max_gain_ratio = gain_ratio
                    best_feature = feature
                    best_threshold = threshold
        if best_feature is not None and max_gain_ratio < self.min_gain_ratio:
            # Even the best split is not informative enough.
            return None, None
        return best_feature, best_threshold

    def _majority_label(self, y):
        """Return the most frequent label in ``y`` (smallest label wins ties)."""
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def _build_tree(self, X, y):
        """Recursively build and return the subtree for samples ``X`` / ``y``."""
        if len(np.unique(y)) == 1:  # pure node -> leaf
            return Node(target=y[0])
        if X.shape[1] == 0:  # no features to split on -> majority leaf
            return Node(target=self._majority_label(y))
        feature, threshold = self._select_feature_and_threshold(X, y)
        if feature is None or threshold is None:  # no acceptable split -> majority leaf
            return Node(target=self._majority_label(y))
        left_indices = X[:, feature] <= threshold
        right_indices = X[:, feature] > threshold
        left = self._build_tree(X[left_indices], y[left_indices])
        right = self._build_tree(X[right_indices], y[right_indices])
        return Node(feature=feature, threshold=threshold, left=left, right=right)

    def fit(self, X, y):
        """Build the tree from samples ``X`` (2-D) and labels ``y`` (1-D).

        Raises:
            ValueError: if ``X`` is not 2-D, if ``X`` and ``y`` differ in
                length, or if there are no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2 or len(X) != len(y):
            raise ValueError("X must be 2-D and have as many rows as y has labels")
        if len(y) == 0:
            raise ValueError("cannot fit a decision tree on an empty dataset")
        self.tree = self._build_tree(X, y)

    def _predict_sample(self, x):
        """Return the label of the leaf reached by the single sample ``x``."""
        node = self.tree
        while node.target is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.target

    def predict(self, X):
        """Return an array with the predicted label for each row of ``X``."""
        return np.array([self._predict_sample(x) for x in np.asarray(X)])
# Load the iris dataset
iris = load_iris()
X, y = iris.data, iris.target
# Hold out 20% of the samples for testing
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=1
)
# Train the model; each side of a split must keep at least 5 samples
model = C45DecisionTree(min_samples_split=5)
model.fit(train_X, train_y)
# Predict the test split
pred_y = model.predict(test_X)
# Accuracy = fraction of test samples predicted correctly
n_correct = (pred_y == test_y).sum()
accuracy = n_correct / len(test_y)
print('Accuracy:', accuracy)
```
阅读全文
相关推荐














