ID3 and C4.5 Decision Trees in Python, Demonstrated on the Iris Dataset
ID3 decision tree on the iris dataset, implemented in Python. At each node, ID3 picks the feature whose split minimizes the conditional entropy of the labels, which is equivalent to maximizing information gain:
```python
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

class Node:
    def __init__(self, feature=None, value=None, target=None, left=None, right=None):
        self.feature = feature  # index of the feature used to split
        self.value = value      # feature value tested at this node (left: equal, right: not equal)
        self.target = target    # class label if this is a leaf node
        self.left = left        # left child
        self.right = right      # right child

class ID3DecisionTree:
    def __init__(self):
        self.tree = None  # root of the decision tree

    # Shannon entropy of the label vector
    def _entropy(self, y):
        labels = np.unique(y)
        probs = [np.sum(y == label) / len(y) for label in labels]
        return -np.sum([p * np.log2(p) for p in probs])

    # Conditional entropy of y given the values of one feature
    def _conditional_entropy(self, X, y, feature):
        feature_values = np.unique(X[:, feature])
        probs = [np.sum(X[:, feature] == value) / len(X) for value in feature_values]
        entropies = [self._entropy(y[X[:, feature] == value]) for value in feature_values]
        return np.sum([p * e for p, e in zip(probs, entropies)])

    # Pick the feature with the lowest conditional entropy,
    # i.e. the highest information gain
    def _select_feature(self, X, y):
        n_features = X.shape[1]
        entropies = [self._conditional_entropy(X, y, feature) for feature in range(n_features)]
        return np.argmin(entropies)

    # Recursively build the tree
    def _build_tree(self, X, y):
        if len(np.unique(y)) == 1:  # pure node: return its class
            return Node(target=y[0])
        feature = self._select_feature(X, y)  # best feature to split on
        feature_values = np.unique(X[:, feature])
        if len(feature_values) == 1:  # no split possible: return the majority class
            return Node(target=np.argmax(np.bincount(y)))
        # Binary split: samples equal to the first value go left, all others go right.
        # (Plain ID3 assumes discrete features; keeping the split binary lets the
        # same code run on the continuous iris features without dropping samples.)
        value = feature_values[0]
        left_indices = X[:, feature] == value
        right_indices = ~left_indices
        left = self._build_tree(X[left_indices], y[left_indices])    # recurse on the left subset
        right = self._build_tree(X[right_indices], y[right_indices]) # recurse on the right subset
        return Node(feature=feature, value=value, left=left, right=right)

    # Train the tree
    def fit(self, X, y):
        self.tree = self._build_tree(X, y)

    # Predict a single sample by walking from the root to a leaf,
    # using the split value stored in each node
    def _predict_sample(self, x):
        node = self.tree
        while node.target is None:
            node = node.left if x[node.feature] == node.value else node.right
        return node.target

    # Predict a batch of samples
    def predict(self, X):
        return np.array([self._predict_sample(x) for x in X])

# Load the iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Train/test split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=1)

# Train the model
model = ID3DecisionTree()
model.fit(train_X, train_y)

# Predict on the test set
pred_y = model.predict(test_X)

# Accuracy
accuracy = np.sum(pred_y == test_y) / len(test_y)
print('Accuracy:', accuracy)
```
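One caveat for the ID3 version: plain ID3 is defined for discrete features, while the four iris measurements are continuous, so the equality-based binary split above tends to grow deep trees of one-value branches. A common workaround is to bin each feature into a small number of intervals before fitting. A minimal sketch, assuming equal-width bins and the `train_X`/`test_X` arrays from the script above (`n_bins=3` is an arbitrary choice, not part of the original code):

```python
import numpy as np

def bin_edges(X, n_bins=3):
    # Interior edges of equal-width bins, computed per feature column.
    return [np.linspace(col.min(), col.max(), n_bins + 1)[1:-1] for col in X.T]

def discretize(X, edges):
    # Replace each continuous value with its integer bin index (0..n_bins-1).
    return np.column_stack([np.digitize(col, e) for col, e in zip(X.T, edges)])

edges = bin_edges(train_X)  # learn the bin edges from the training data only
binned_model = ID3DecisionTree()
binned_model.fit(discretize(train_X, edges), train_y)
binned_pred = binned_model.predict(discretize(test_X, edges))
print('Binned accuracy:', np.sum(binned_pred == test_y) / len(test_y))
```

Computing the edges on the training data and reusing them for the test set keeps the binning consistent between fit and predict.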
C4.5 decision tree, implemented in Python. Unlike ID3, it handles the continuous iris features directly by splitting on a threshold, and it scores candidate splits by gain ratio (information gain normalized by split information) rather than raw information gain:
```python
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

class Node:
    def __init__(self, feature=None, threshold=None, target=None, left=None, right=None):
        self.feature = feature      # index of the feature used to split
        self.threshold = threshold  # split threshold (left: <=, right: >)
        self.target = target        # class label if this is a leaf node
        self.left = left            # left child
        self.right = right          # right child

class C45DecisionTree:
    def __init__(self, min_samples_split=2, min_gain_ratio=1e-4):
        self.min_samples_split = min_samples_split  # minimum samples on each side of a split
        self.min_gain_ratio = min_gain_ratio        # minimum gain ratio required to split
        self.tree = None                            # root of the decision tree

    # Shannon entropy of the label vector
    def _entropy(self, y):
        labels = np.unique(y)
        probs = [np.sum(y == label) / len(y) for label in labels]
        return -np.sum([p * np.log2(p) for p in probs])

    # Conditional entropy of y given a threshold split on a feature
    def _conditional_entropy(self, X, y, feature, threshold):
        left_indices = X[:, feature] <= threshold
        right_indices = X[:, feature] > threshold
        left_prob = np.sum(left_indices) / len(X)
        right_prob = np.sum(right_indices) / len(X)
        entropies = [self._entropy(y[left_indices]), self._entropy(y[right_indices])]
        return np.sum([p * e for p, e in zip([left_prob, right_prob], entropies)])

    # Information gain of the split
    def _information_gain(self, X, y, feature, threshold):
        return self._entropy(y) - self._conditional_entropy(X, y, feature, threshold)

    # Gain ratio: information gain normalized by the split information
    def _gain_ratio(self, X, y, feature, threshold):
        left_prob = np.sum(X[:, feature] <= threshold) / len(X)
        right_prob = np.sum(X[:, feature] > threshold) / len(X)
        split_info = -np.sum([p * np.log2(p) for p in [left_prob, right_prob] if p > 0])
        if split_info == 0:
            return 0
        return self._information_gain(X, y, feature, threshold) / split_info

    # Search every feature and every candidate threshold for the best gain ratio
    def _select_feature_and_threshold(self, X, y):
        n_features = X.shape[1]
        max_gain_ratio = self.min_gain_ratio  # splits below this threshold are rejected
        best_feature, best_threshold = None, None
        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_size = np.sum(X[:, feature] <= threshold)
                right_size = np.sum(X[:, feature] > threshold)
                if left_size >= self.min_samples_split and right_size >= self.min_samples_split:
                    gain_ratio = self._gain_ratio(X, y, feature, threshold)
                    if gain_ratio > max_gain_ratio:
                        max_gain_ratio = gain_ratio
                        best_feature = feature
                        best_threshold = threshold
        return best_feature, best_threshold

    # Recursively build the tree
    def _build_tree(self, X, y):
        if len(np.unique(y)) == 1:  # pure node: return its class
            return Node(target=y[0])
        feature, threshold = self._select_feature_and_threshold(X, y)
        if feature is None:  # no acceptable split: return the majority class
            return Node(target=np.argmax(np.bincount(y)))
        left_indices = X[:, feature] <= threshold
        right_indices = X[:, feature] > threshold
        left = self._build_tree(X[left_indices], y[left_indices])    # recurse on the left subset
        right = self._build_tree(X[right_indices], y[right_indices]) # recurse on the right subset
        return Node(feature=feature, threshold=threshold, left=left, right=right)

    # Train the tree
    def fit(self, X, y):
        self.tree = self._build_tree(X, y)

    # Predict a single sample by walking from the root to a leaf
    def _predict_sample(self, x):
        node = self.tree
        while node.target is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.target

    # Predict a batch of samples
    def predict(self, X):
        return np.array([self._predict_sample(x) for x in X])

# Load the iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Train/test split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=1)

# Train the model
model = C45DecisionTree(min_samples_split=5)
model.fit(train_X, train_y)

# Predict on the test set
pred_y = model.predict(test_X)

# Accuracy
accuracy = np.sum(pred_y == test_y) / len(test_y)
print('Accuracy:', accuracy)
```
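As a sanity check, the hand-rolled tree can be compared against scikit-learn's built-in `DecisionTreeClassifier`, which also splits continuous features on thresholds. Note it is a CART-style tree that maximizes information gain when given `criterion='entropy'`, not C4.5's gain ratio, so the learned trees and accuracies may differ slightly:

```python
from sklearn.tree import DecisionTreeClassifier

sk_model = DecisionTreeClassifier(criterion='entropy', random_state=1)
sk_model.fit(train_X, train_y)
print('sklearn accuracy:', sk_model.score(test_X, test_y))
```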