决策树鸢尾花python
时间: 2023-11-08 11:05:14 浏览: 167
决策树是一种基本的分类和回归方法,它通过对数据进行递归分割来构建一棵树形结构:每个内部节点对应一次特征判断,每个叶子节点代表一个类别(分类树)或者一个数值(回归树)。鸢尾花数据集是一个经典的分类数据集,它包含了三种不同种类的鸢尾花,每种鸢尾花有四个特征:花萼长度、花萼宽度、花瓣长度和花瓣宽度。在Python中,我们可以使用scikit-learn库来构建决策树模型并对鸢尾花数据集进行分类。
具体来说,我们可以使用sklearn.tree模块中的DecisionTreeClassifier类来构建决策树模型。首先,我们需要将鸢尾花数据集分为训练集和测试集,然后使用fit()方法在训练集上拟合模型,接着使用predict()方法对测试集进行预测,最后将预测结果与真实标签比较(或直接调用score()方法)来计算准确率。
需要注意的是,在构建决策树模型时,我们需要考虑连续值和缺失值的处理。对于连续值,通常采用二分法:把该特征的取值作为候选阈值逐一尝试,选择信息增益(或信息增益比)最大的阈值,将样本划分为"小于等于阈值"和"大于阈值"两部分;对于缺失值,我们可以使用均值、中位数或者众数来进行填充。
相关问题
id3决策树 鸢尾花 python_C4.5决策树Python代码实现
ID3决策树 鸢尾花 Python代码实现:
```python
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
class Node:
    """Node of a binary decision tree.

    An internal node stores the tested ``feature`` index, the ``value`` that
    routes a sample to the left child, and both children. A leaf stores only
    ``target``.

    Args:
        feature: Column index tested at this node (internal nodes only).
        target: Class label predicted at this node (leaf nodes only).
        left: Child for samples whose feature equals ``value``.
        right: Child for all remaining samples.
        value: Feature value compared against during prediction.
    """

    def __init__(self, feature=None, target=None, left=None, right=None, value=None):
        self.feature = feature  # feature index used to split the data
        self.target = target  # class label of a leaf node
        self.left = left  # left child
        self.right = right  # right child
        self.value = value  # feature value that sends a sample to the left


class ID3DecisionTree:
    """ID3-style classifier that grows a binary tree of equality tests.

    At every node the feature with the lowest conditional entropy is chosen.
    Samples whose value on that feature equals the smallest observed value go
    left; every other sample goes right, so no training sample is discarded.
    The split value is stored on the node so prediction does not depend on
    any data outside the tree.
    """

    def __init__(self):
        self.tree = None  # root Node, set by fit()

    def _entropy(self, y):
        """Return the Shannon entropy (base 2) of the labels in ``y``."""
        if len(y) == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        return float(-np.sum(probs * np.log2(probs)))

    def _conditional_entropy(self, X, y, feature):
        """Return the entropy of ``y`` conditioned on column ``feature`` of ``X``."""
        column = X[:, feature]
        total = 0.0
        for value in np.unique(column):
            mask = column == value
            # weight of this partition times the entropy inside it
            total += mask.mean() * self._entropy(y[mask])
        return total

    def _select_feature(self, X, y):
        """Return the index of the best splitting feature, or ``None``.

        Only features with at least two distinct values are considered,
        because a constant column cannot separate any samples. Ties keep the
        lowest feature index.
        """
        best_feature, best_entropy = None, np.inf
        for feature in range(X.shape[1]):
            if len(np.unique(X[:, feature])) < 2:
                continue
            entropy = self._conditional_entropy(X, y, feature)
            if entropy < best_entropy:
                best_feature, best_entropy = feature, entropy
        return best_feature

    @staticmethod
    def _majority_label(y):
        """Return the most frequent label in ``y`` (smallest label on ties)."""
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def _build_tree(self, X, y):
        """Recursively build the subtree for samples ``X`` with labels ``y``."""
        if len(np.unique(y)) == 1:  # pure node: emit a leaf with that class
            return Node(target=y[0])
        feature = self._select_feature(X, y)
        if feature is None:  # nothing left to split on: majority-vote leaf
            return Node(target=self._majority_label(y))
        column = X[:, feature]
        value = column.min()
        left_mask = column == value
        if left_mask.all() or not left_mask.any():
            # Defensive: e.g. NaN values never compare equal, so the split
            # would not shrink the data and recursion would not terminate.
            return Node(target=self._majority_label(y))
        left = self._build_tree(X[left_mask], y[left_mask])
        right = self._build_tree(X[~left_mask], y[~left_mask])
        return Node(feature=feature, left=left, right=right, value=value)

    def fit(self, X, y):
        """Build the tree from feature matrix ``X`` and labels ``y``.

        Raises:
            ValueError: If the inputs are empty or differ in length.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(y) == 0 or len(X) != len(y):
            raise ValueError("X and y must be non-empty and have the same length")
        self.tree = self._build_tree(X, y)

    def _predict_sample(self, x):
        """Return the predicted label for a single sample ``x``."""
        node = self.tree
        while node.target is None:
            node = node.left if x[node.feature] == node.value else node.right
        return node.target

    def predict(self, X):
        """Return an array with the predicted label of every row in ``X``."""
        return np.array([self._predict_sample(x) for x in np.asarray(X)])
# Load the iris dataset: feature matrix X and integer class labels y
iris = load_iris()
X, y = iris.data, iris.target
# Hold out 20% of the samples as a test set (fixed seed for repeatability)
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=1
)
# Fit the ID3 tree on the training split
model = ID3DecisionTree()
model.fit(train_X, train_y)
# Predict labels for the held-out samples
pred_y = model.predict(test_X)
# Accuracy is the fraction of predictions that match the true labels
accuracy = np.mean(pred_y == test_y)
print('Accuracy:', accuracy)
```
C4.5决策树 Python代码实现:
```python
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
class Node:
    """Node of a binary threshold decision tree.

    An internal node stores the tested ``feature`` index, the ``threshold``
    it is compared against, and both children. A leaf stores only ``target``.

    Args:
        feature: Column index tested at this node (internal nodes only).
        threshold: Split threshold for ``feature`` (internal nodes only).
        target: Class label predicted at this node (leaf nodes only).
        left: Left child.
        right: Right child.
    """

    def __init__(self, feature=None, threshold=None, target=None, left=None, right=None):
        self.feature = feature  # feature index used to split the data
        self.threshold = threshold  # threshold used to split the data
        self.target = target  # class label of a leaf node
        self.left = left  # left child
        self.right = right  # right child
class C45DecisionTree:
    """C4.5-style classifier using binary threshold splits and gain ratio.

    Args:
        min_samples_split: Minimum number of samples each side of a candidate
            split must contain for the split to be considered.
        min_gain_ratio: Minimum gain ratio a split must reach; when no
            candidate reaches it the node becomes a majority-vote leaf.
    """

    def __init__(self, min_samples_split=2, min_gain_ratio=1e-4):
        self.min_samples_split = min_samples_split  # minimum samples per side
        self.min_gain_ratio = min_gain_ratio  # minimum accepted gain ratio
        self.tree = None  # root Node, set by fit()

    def _entropy(self, y):
        """Return the Shannon entropy (base 2) of the labels in ``y``."""
        if len(y) == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        return float(-np.sum(probs * np.log2(probs)))

    def _conditional_entropy(self, X, y, feature, threshold):
        """Return the entropy of ``y`` after splitting ``feature`` at ``threshold``."""
        column = X[:, feature]
        left_mask = column <= threshold
        right_mask = column > threshold
        return (
            np.mean(left_mask) * self._entropy(y[left_mask])
            + np.mean(right_mask) * self._entropy(y[right_mask])
        )

    def _information_gain(self, X, y, feature, threshold):
        """Return the entropy reduction achieved by the given split."""
        return self._entropy(y) - self._conditional_entropy(X, y, feature, threshold)

    def _gain_ratio(self, X, y, feature, threshold):
        """Return information gain divided by split information.

        Returns 0 when the split information is zero, i.e. when every sample
        falls on the same side of ``threshold``.
        """
        column = X[:, feature]
        fractions = np.array([np.mean(column <= threshold), np.mean(column > threshold)])
        # Treat 0 * log2(0) as 0 so an empty side does not yield NaN.
        fractions = fractions[fractions > 0]
        split_info = -np.sum(fractions * np.log2(fractions))
        if not split_info > 0:
            return 0
        return self._information_gain(X, y, feature, threshold) / split_info

    def _select_feature_and_threshold(self, X, y):
        """Return the ``(feature, threshold)`` pair with the highest gain ratio.

        Returns ``(None, None)`` when no candidate satisfies both
        ``min_samples_split`` and ``min_gain_ratio``. Ties keep the first
        candidate encountered.
        """
        best_feature, best_threshold = None, None
        best_ratio = -np.inf
        for feature in range(X.shape[1]):
            column = X[:, feature]
            # The largest value would leave the right side empty, so skip it.
            for threshold in np.unique(column)[:-1]:
                n_left = np.count_nonzero(column <= threshold)
                n_right = np.count_nonzero(column > threshold)
                if n_left < self.min_samples_split or n_right < self.min_samples_split:
                    continue
                ratio = self._gain_ratio(X, y, feature, threshold)
                if ratio < self.min_gain_ratio:
                    continue
                if ratio > best_ratio:
                    best_ratio = ratio
                    best_feature, best_threshold = feature, threshold
        return best_feature, best_threshold

    @staticmethod
    def _majority_label(y):
        """Return the most frequent label in ``y`` (smallest label on ties)."""
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def _build_tree(self, X, y):
        """Recursively build the subtree for samples ``X`` with labels ``y``."""
        if len(np.unique(y)) == 1:  # pure node: emit a leaf with that class
            return Node(target=y[0])
        feature, threshold = self._select_feature_and_threshold(X, y)
        if feature is None:  # no acceptable split: majority-vote leaf
            return Node(target=self._majority_label(y))
        left_mask = X[:, feature] <= threshold
        right_mask = X[:, feature] > threshold
        left = self._build_tree(X[left_mask], y[left_mask])
        right = self._build_tree(X[right_mask], y[right_mask])
        return Node(feature=feature, threshold=threshold, left=left, right=right)

    def fit(self, X, y):
        """Build the tree from feature matrix ``X`` and labels ``y``.

        Raises:
            ValueError: If the inputs are empty or differ in length.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(y) == 0 or len(X) != len(y):
            raise ValueError("X and y must be non-empty and have the same length")
        self.tree = self._build_tree(X, y)

    def _predict_sample(self, x):
        """Return the predicted label for a single sample ``x``."""
        node = self.tree
        while node.target is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.target

    def predict(self, X):
        """Return an array with the predicted label of every row in ``X``."""
        return np.array([self._predict_sample(x) for x in np.asarray(X)])
# Load the iris dataset: feature matrix X and integer class labels y
iris = load_iris()
X, y = iris.data, iris.target
# Keep 80% of the samples for training and 20% for testing (fixed seed)
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=1
)
# Fit the C4.5 tree; each side of a split must hold at least 5 samples
model = C45DecisionTree(min_samples_split=5)
model.fit(train_X, train_y)
# Predict labels for the test split
pred_y = model.predict(test_X)
# Accuracy is the share of test samples predicted correctly
accuracy = np.mean(pred_y == test_y)
print('Accuracy:', accuracy)
```
决策树鸢尾花分类python
决策树是一种常用的监督学习方法,它可以用于分类和回归任务。在分类任务中,决策树通过学习一个树形结构来对样本进行分类。在Python中,可以使用scikit-learn库的DecisionTreeClassifier类来构建决策树模型进行鸢尾花分类。
首先,你需要导入必要的库和数据集:
```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# Load the iris dataset: feature matrix X and class labels y
iris = load_iris()
X, y = iris.data, iris.target
# Split into training (80%) and test (20%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
```
接下来,你可以创建一个决策树分类器并训练模型:
```python
# Create the decision tree classifier and train it on the training split
# (fit() returns the fitted estimator itself, so the calls can be chained)
clf = DecisionTreeClassifier().fit(X_train, y_train)
```
模型训练完成后,你可以使用测试集来评估模型的性能:
```python
# Mean accuracy of the fitted classifier on the test split
accuracy = clf.score(X_test, y_test)
# Per-sample predictions for the test split, kept for further inspection
y_pred = clf.predict(X_test)
print("准确率:", accuracy)
```
以上是使用决策树进行鸢尾花分类的Python代码。你可以根据自己的需求进行调整和扩展。
阅读全文