决策树鸢尾花python

决策树是一种基本的分类和回归方法，它通过对数据进行分割来构建一棵树形结构，每个叶子节点代表一个类别或者一个数值。鸢尾花数据集是一个经典的分类问题，它包含了三种不同种类的鸢尾花，每种鸢尾花有四个特征：花萼长度、花萼宽度、花瓣长度和花瓣宽度。在Python中，我们可以使用scikit-learn库来构建决策树模型并对鸢尾花数据集进行分类。具体来说，我们可以使用sklearn.tree模块中的DecisionTreeClassifier类来构建决策树模型。首先，我们需要将鸢尾花数据集分为训练集和测试集，然后使用fit()方法来拟合模型，最后使用predict()方法来对测试集进行预测并计算准确率。需要注意的是，在构建决策树模型时，我们需要考虑连续值和缺失值的处理。对于连续值，我们可以使用二分法或者信息增益来进行处理；对于缺失值，我们可以使用均值、中位数或者众数来进行填充。

决策树鸢尾花 python 信息熵

### 使用 Python 实现基于信息熵的决策树分类鸢尾花数据集

#### 导入必要的库

为了实现这一目标，首先需要导入一些常用的Python库来处理数据和构建模型。

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
```

#### 加载并探索数据集

加载鸢尾花数据集，并查看其基本信息以便理解所要处理的数据结构。

```python
# Load the iris dataset.
iris = load_iris()

# One named column per feature, plus the class label in a 'target' column.
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

print(df.head())
```

#### 划分训练集与测试集

将原始数据划分为训练集和测试集两部分，通常采用70%-80%作为训练比例较为合适。

```python
# Features are every column except 'target'; 30% of the rows are held out.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='target'),
    df['target'],
    test_size=0.3,
    random_state=42,
)
```

#### 构建决策树模型

创建`DecisionTreeClassifier`实例时指定参数`criterion='entropy'`以确保使用信息熵作为质量度量标准[^1]。

```python
clf_entropy = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=3,
    min_samples_leaf=5,
    random_state=100,
)
```

#### 训练模型

使用训练集对上述定义好的决策树分类器进行拟合。

```python
# Fit on plain NumPy arrays; predict() below is given arrays as well.
clf_entropy.fit(X_train.to_numpy(), y_train.to_numpy())
```

#### 测试模型性能

最后一步是对建立起来的模型做验证工作，即用之前预留出来的那部分未见过的新样本来检验它的好坏程度。

```python
y_pred = clf_entropy.predict(X_test.to_numpy())

# Share of held-out samples predicted correctly, as a percentage.
correct = sum(y_pred == y_test)
accuracy = correct / len(y_test) * 100.
print(f"Accuracy of the model is {accuracy:.2f}%")
```

#### 特征重要性分析

了解哪些输入变量对于预测输出贡献最大可以帮助我们更好地解释模型的工作机制。

```python
# Pair each feature name with its importance (rounded to 3 decimals).
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': np.round(clf_entropy.feature_importances_, 3),
})

# Most important feature first.
print(importances.sort_values(by='importance', ascending=False))
```

id3决策树鸢尾花 python_C4.5决策树Python代码实现

id3决策树鸢尾花 Python代码实现：

```python
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split


class Node:
    """A node of a binary decision tree.

    An internal node tests ``x[feature] <= threshold`` and has two children;
    a leaf has ``target`` set and no children.
    """

    def __init__(self, feature=None, target=None, left=None, right=None,
                 threshold=None):
        self.feature = feature      # column index tested here (None for a leaf)
        self.target = target        # predicted class of a leaf (None otherwise)
        self.left = left            # subtree for x[feature] <= threshold
        self.right = right          # subtree for all other samples
        self.threshold = threshold  # split value stored at fit time


class ID3DecisionTree:
    """Decision tree that picks splits by information gain (the ID3 criterion).

    The iris features are continuous, so every split is a binary threshold
    test ``x[feature] <= threshold``. The threshold is stored in the node
    during training, which means prediction depends only on the fitted tree.
    """

    # A split must gain more than this to be accepted; the tolerance absorbs
    # floating-point noise around zero so useless splits are not made.
    _MIN_GAIN = 1e-12

    def __init__(self):
        self.tree = None  # root Node, set by fit()

    def _entropy(self, y):
        """Return the Shannon entropy (base 2) of the labels in ``y``."""
        if len(y) == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probs = counts / counts.sum()
        return -np.sum(probs * np.log2(probs))

    def _majority_class(self, y):
        """Return the most frequent label in ``y`` (labels may be any type)."""
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def _conditional_entropy(self, X, y, feature, threshold):
        """Return the entropy of ``y`` after splitting on ``feature <= threshold``."""
        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask
        n_samples = len(y)
        return (np.sum(left_mask) / n_samples * self._entropy(y[left_mask])
                + np.sum(right_mask) / n_samples * self._entropy(y[right_mask]))

    def _select_feature_and_threshold(self, X, y):
        """Return the ``(feature, threshold)`` with the largest information gain.

        Returns ``(None, None)`` when no split improves on the parent entropy.
        """
        parent_entropy = self._entropy(y)
        best_gain = self._MIN_GAIN
        best_feature, best_threshold = None, None
        for feature in range(X.shape[1]):
            # The largest value is skipped: it would leave the right side empty.
            for threshold in np.unique(X[:, feature])[:-1]:
                gain = parent_entropy - self._conditional_entropy(
                    X, y, feature, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature, best_threshold = feature, threshold
        return best_feature, best_threshold

    def _build_tree(self, X, y):
        """Recursively build the subtree for the samples ``X`` / labels ``y``."""
        if len(np.unique(y)) == 1:
            # Pure node: every sample has the same class.
            return Node(target=y[0])
        feature, threshold = self._select_feature_and_threshold(X, y)
        if feature is None:
            # No useful split left: predict the majority class.
            return Node(target=self._majority_class(y))
        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask
        left = self._build_tree(X[left_mask], y[left_mask])
        right = self._build_tree(X[right_mask], y[right_mask])
        return Node(feature=feature, threshold=threshold, left=left, right=right)

    def fit(self, X, y):
        """Build the tree from samples ``X`` (2-D) and labels ``y`` (1-D).

        Raises:
            ValueError: if the input is empty or the shapes do not match.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2 or y.ndim != 1 or len(X) != len(y) or len(y) == 0:
            raise ValueError(
                "X must be a non-empty 2-D array and y a 1-D array of the same length")
        self.tree = self._build_tree(X, y)

    def _predict_sample(self, x):
        """Walk from the root to a leaf for one sample and return its class."""
        node = self.tree
        while node.target is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.target

    def predict(self, X):
        """Return an array with the predicted class of every row of ``X``.

        Raises:
            RuntimeError: if fit() has not been called yet.
        """
        if self.tree is None:
            raise RuntimeError("fit() must be called before predict()")
        return np.array([self._predict_sample(x) for x in np.asarray(X)])


# Load the iris dataset.
iris = load_iris()
X = iris.data
y = iris.target

# Split into training and test sets.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=1)

# Train the model.
model = ID3DecisionTree()
model.fit(train_X, train_y)

# Predict the test set.
pred_y = model.predict(test_X)

# Compute the accuracy.
accuracy = np.sum(pred_y == test_y) / len(test_y)
print('Accuracy:', accuracy)
```

C4.5决策树 Python代码实现：

```python
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split


class Node:
    """A node of a binary decision tree.

    An internal node tests ``x[feature] <= threshold`` and has two children;
    a leaf has ``target`` set and no children.
    """

    def __init__(self, feature=None, threshold=None, target=None, left=None, right=None):
        self.feature = feature      # column index tested here (None for a leaf)
        self.threshold = threshold  # samples with value <= threshold go left
        self.target = target        # predicted class of a leaf (None otherwise)
        self.left = left            # subtree for x[feature] <= threshold
        self.right = right          # subtree for all other samples


class C45DecisionTree:
    """Decision tree that picks binary threshold splits by gain ratio (C4.5).

    Args:
        min_samples_split: minimum number of samples that EACH side of a
            split must contain for the split to be considered.
        min_gain_ratio: a split is rejected when its gain ratio is below
            this value; the node then becomes a majority-class leaf.
    """

    def __init__(self, min_samples_split=2, min_gain_ratio=1e-4):
        self.min_samples_split = min_samples_split
        self.min_gain_ratio = min_gain_ratio
        self.tree = None  # root Node, set by fit()

    def _entropy(self, y):
        """Return the Shannon entropy (base 2) of the labels in ``y``."""
        if len(y) == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probs = counts / counts.sum()
        return -np.sum(probs * np.log2(probs))

    def _majority_class(self, y):
        """Return the most frequent label in ``y`` (labels may be any type)."""
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def _conditional_entropy(self, X, y, feature, threshold):
        """Return the entropy of ``y`` after splitting on ``feature <= threshold``."""
        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask
        n_samples = len(X)
        return (np.sum(left_mask) / n_samples * self._entropy(y[left_mask])
                + np.sum(right_mask) / n_samples * self._entropy(y[right_mask]))

    def _information_gain(self, X, y, feature, threshold):
        """Return the entropy reduction achieved by the split."""
        return self._entropy(y) - self._conditional_entropy(X, y, feature, threshold)

    def _gain_ratio(self, X, y, feature, threshold):
        """Return information gain divided by the split information.

        Returns 0 when the split information is 0, i.e. when one side of the
        split is empty.
        """
        n_samples = len(X)
        if n_samples == 0:
            return 0
        n_left = np.sum(X[:, feature] <= threshold)
        fractions = [n_left / n_samples, (n_samples - n_left) / n_samples]
        # An empty side contributes nothing; skipping it avoids 0 * log2(0) = NaN.
        split_info = -np.sum([p * np.log2(p) for p in fractions if p > 0])
        if split_info == 0:
            return 0
        return self._information_gain(X, y, feature, threshold) / split_info

    def _select_feature_and_threshold(self, X, y):
        """Return the ``(feature, threshold)`` with the largest gain ratio.

        Only splits that leave at least ``min_samples_split`` samples (and at
        least one) on each side and reach ``min_gain_ratio`` are candidates.
        Returns ``(None, None)`` when there is no such split.
        """
        min_side = max(self.min_samples_split, 1)  # never allow an empty child
        best_ratio, best_feature, best_threshold = None, None, None
        for feature in range(X.shape[1]):
            column = X[:, feature]
            for threshold in np.unique(column):
                n_left = np.sum(column <= threshold)
                n_right = len(column) - n_left
                if n_left < min_side or n_right < min_side:
                    continue
                ratio = self._gain_ratio(X, y, feature, threshold)
                if ratio < self.min_gain_ratio:
                    continue
                if best_ratio is None or ratio > best_ratio:
                    best_ratio = ratio
                    best_feature, best_threshold = feature, threshold
        return best_feature, best_threshold

    def _build_tree(self, X, y):
        """Recursively build the subtree for the samples ``X`` / labels ``y``."""
        if len(np.unique(y)) == 1:
            # Pure node: every sample has the same class.
            return Node(target=y[0])
        feature, threshold = self._select_feature_and_threshold(X, y)
        if feature is None or threshold is None:
            # No admissible split: predict the majority class.
            return Node(target=self._majority_class(y))
        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask
        left = self._build_tree(X[left_mask], y[left_mask])
        right = self._build_tree(X[right_mask], y[right_mask])
        return Node(feature=feature, threshold=threshold, left=left, right=right)

    def fit(self, X, y):
        """Build the tree from samples ``X`` (2-D) and labels ``y`` (1-D).

        Raises:
            ValueError: if the input is empty or the shapes do not match.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2 or y.ndim != 1 or len(X) != len(y) or len(y) == 0:
            raise ValueError(
                "X must be a non-empty 2-D array and y a 1-D array of the same length")
        self.tree = self._build_tree(X, y)

    def _predict_sample(self, x):
        """Walk from the root to a leaf for one sample and return its class."""
        node = self.tree
        while node.target is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.target

    def predict(self, X):
        """Return an array with the predicted class of every row of ``X``.

        Raises:
            RuntimeError: if fit() has not been called yet.
        """
        if self.tree is None:
            raise RuntimeError("fit() must be called before predict()")
        return np.array([self._predict_sample(x) for x in np.asarray(X)])


# Load the iris dataset.
iris = load_iris()
X = iris.data
y = iris.target

# Split into training and test sets.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=1)

# Train the model.
model = C45DecisionTree(min_samples_split=5)
model.fit(train_X, train_y)

# Predict the test set.
pred_y = model.predict(test_X)

# Compute the accuracy.
accuracy = np.sum(pred_y == test_y) / len(test_y)
print('Accuracy:', accuracy)
```

阅读全文

决策树鸢尾花python

决策树鸢尾花 python 信息熵

id3决策树 鸢尾花 python_C4.5决策树Python代码实现

相关推荐

Python决策树算法鸢尾花分类项目解析

Python实现C4.5决策树鸢尾花分类与可视化

Python实现鸢尾花数据的决策树分类可视化

决策树鸢尾花，python代码.py

Python sklearn决策树鸢尾花数据集代码

Python实现C4.5决策树鸢尾花分类与可视化分析

决策树鸢尾花分类python

python决策树鸢尾花id3算法代码

决策树分类算法python鸢尾花

决策树c4.5python鸢尾花

决策树实现鸢尾花分类python

决策树python鸢尾花

鸢尾花 python C4.5决策树 生成树的图片

鸢尾花决策树分类的Python代码

决策树算法鸢尾花分类python 代码

决策树实现鸢尾花分类Python代码

决策树 鸢尾花 pyrhon

决策树剪枝的 python 代码 鸢尾花数据集

大家在看

定位面研磨-半导体材料

iometer使用指南

基于yoloV4目标检测框架，baidu语音识别，控制西门子1200PLC.zip

千方百剂服务器及客户端安装白皮书

Linux下rsync文件同步详解

最新推荐

智慧园区3D可视化解决方案PPT(24页).pptx

虚拟串口软件：实现IP信号到虚拟串口的转换

【Python进阶篇】：掌握这些高级特性，让你的编程能力飞跃提升

后端调用ragflow api

IE6下实现PNG图片背景透明的技术解决方案

【欧姆龙触摸屏故障诊断全攻略】

Educoder综合练习—C&C++选择结构

VBS简明教程：批处理之家论坛下载指南

【欧姆龙触摸屏：新手必读的10个操作技巧】

阿里云物联网平台不支持新购

id3决策树鸢尾花 python_C4.5决策树Python代码实现

鸢尾花 python C4.5决策树生成树的图片

决策树鸢尾花 pyrhon

决策树剪枝的 python 代码鸢尾花数据集