请给出用 Python 实现决策树算法的代码:不调用 sklearn 库,对鸢尾花数据集进行分类,并将分类结果可视化
时间: 2024-04-29 13:27:06 浏览: 176
以下是Python实现决策树算法的代码,不调用sklearn库,对鸢尾花数据集进行分类并实现结果可视化:
```
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
class Node:
    """A single node of a binary decision tree.

    An internal node carries ``feature_idx``/``threshold`` and two children;
    a leaf carries only ``value`` (the predicted class label).
    """

    def __init__(self, feature_idx=None, threshold=None, left=None, right=None, value=None):
        """Store the split description (internal node) or the label (leaf).

        Args:
            feature_idx: Column index of the feature tested at this node.
            threshold: Samples with ``feature <= threshold`` go to ``left``.
            left: Child node for samples at or below the threshold.
            right: Child node for samples above the threshold.
            value: Predicted label; ``None`` for internal nodes.
        """
        self.feature_idx = feature_idx
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def is_leaf(self):
        """Return True when this node holds a prediction."""
        # Compare against None explicitly: class label 0 is a valid value.
        return self.value is not None
class DecisionTree:
    """Binary decision tree classifier that splits on information gain.

    Features must be numeric (splits are ``feature <= threshold``). Labels
    must be non-negative integers because class counts are computed with
    ``np.bincount``.
    """

    def __init__(self, max_depth=None, max_features=None):
        """Configure the tree.

        Args:
            max_depth: Maximum depth of the tree; ``None`` means unlimited.
            max_features: Number of features examined at each split.
                ``None`` (default) examines every feature, ``"sqrt"`` draws
                ``int(sqrt(n_features))`` random features per node, and an
                integer draws that many random features per node.
        """
        self.max_depth = max_depth
        self.max_features = max_features
        self.root = None

    def fit(self, X, y):
        """Build the tree from training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of non-negative integer labels.

        Raises:
            ValueError: If ``X`` is not 2-D, is empty, or its length differs
                from the length of ``y``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        if X.shape[0] == 0:
            raise ValueError("cannot fit a decision tree on an empty dataset")
        self.n_classes_ = len(np.unique(y))
        self.n_features_ = X.shape[1]
        self.root = self._grow_tree(X, y)

    def predict(self, X):
        """Return the predicted label of every row of ``X`` as an ndarray.

        Raises:
            ValueError: If the tree has not been fitted yet.
        """
        if self.root is None:
            raise ValueError("this DecisionTree has not been fitted yet")
        # An ndarray (not a list) lets callers reshape the result directly.
        return np.array([self._traverse_tree(x, self.root) for x in np.asarray(X)])

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``X``/``y``."""
        n_samples, n_features = X.shape
        if n_samples == 0:
            return None
        depth_reached = self.max_depth is not None and depth >= self.max_depth
        if depth_reached or len(np.unique(y)) == 1 or n_samples < 2:
            return Node(value=self._leaf_value(y))

        feature_idxs = self._candidate_features(n_features)
        best_feature_idx, best_threshold = self._best_criteria(X, y, feature_idxs)
        if best_feature_idx is None:
            # Every candidate feature is constant here, so no threshold can
            # separate the samples; stop instead of creating an empty child.
            return Node(value=self._leaf_value(y))

        left_idxs, right_idxs = self._split(X[:, best_feature_idx], best_threshold)
        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth + 1)
        return Node(best_feature_idx, best_threshold, left, right)

    def _candidate_features(self, n_features):
        """Return the feature indices to examine at one node."""
        if self.max_features is None:
            # Fixed order keeps tie-breaking (and thus the tree) deterministic.
            return np.arange(n_features)
        if self.max_features == "sqrt":
            n_pick = int(np.sqrt(n_features))
        else:
            n_pick = min(int(self.max_features), n_features)
        return np.random.choice(n_features, max(1, n_pick), replace=False)

    def _best_criteria(self, X, y, feature_idxs):
        """Find the (feature index, threshold) pair with the highest gain.

        Returns ``(None, None)`` when no candidate feature has at least two
        distinct values, i.e. when no split can separate the samples.
        """
        best_gain = -1
        split_idx, split_threshold = None, None
        for feature_idx in feature_idxs:
            X_column = X[:, feature_idx]
            # np.unique returns sorted values. The largest one is skipped
            # because ``<= max`` would send every sample to the left child.
            for threshold in np.unique(X_column)[:-1]:
                gain = self._information_gain(y, X_column, threshold)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feature_idx
                    split_threshold = threshold
        return split_idx, split_threshold

    def _split(self, X_column, split_threshold):
        """Return the row indices going left (``<=``) and right (``>``)."""
        left_idxs = np.flatnonzero(X_column <= split_threshold)
        right_idxs = np.flatnonzero(X_column > split_threshold)
        return left_idxs, right_idxs

    def _information_gain(self, y, X_column, split_threshold):
        """Return the entropy reduction achieved by the given split.

        A split that leaves one side empty yields a gain of 0.
        """
        left_idxs, right_idxs = self._split(X_column, split_threshold)
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0
        n = len(y)
        child_entropy = (
            (len(left_idxs) / n) * self._entropy(y[left_idxs])
            + (len(right_idxs) / n) * self._entropy(y[right_idxs])
        )
        return self._entropy(y) - child_entropy

    def _leaf_value(self, y):
        """Return the most frequent label in ``y``."""
        if self.n_classes_ == 1:
            return y[0]
        return np.argmax(np.bincount(y, minlength=self.n_classes_))

    def _entropy(self, y):
        """Return the Shannon entropy (in bits) of the label array ``y``."""
        hist = np.bincount(y, minlength=self.n_classes_)
        total = hist.sum()
        if total == 0:
            return 0.0
        ps = hist / total
        ps = ps[ps > 0]  # log2(0) is undefined; empty classes contribute 0
        return -np.sum(ps * np.log2(ps))

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf and return its label for ``x``."""
        # ``is None`` rather than truthiness: label 0 is a valid leaf value.
        while node.value is None:
            if x[node.feature_idx] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value
def plot_decision_boundary(clf, X, y, plot_step=0.02, cmap="Paired"):
    """Draw the decision regions of a two-feature classifier.

    Creates a new matplotlib figure with the predicted class regions as a
    filled contour and the samples of ``X`` on top; the caller is
    responsible for calling ``plt.show()`` or saving the figure.

    Args:
        clf: Fitted classifier whose ``predict`` accepts an (n, 2) array.
        X: Array-like of shape (n_samples, 2); columns 0 and 1 are plotted.
        y: Integer labels used to colour the samples.
        plot_step: Spacing of the evaluation grid.
        cmap: Matplotlib colormap name shared by regions and points.
    """
    # Data taken from DataFrame.values may have object dtype; use floats.
    X = np.asarray(X, dtype=float)
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, plot_step),
        np.arange(y_min, y_max, plot_step),
    )
    grid = np.c_[xx.ravel(), yy.ravel()]
    # predict() may return a plain list, which has no reshape method.
    Z = np.asarray(clf.predict(grid)).reshape(xx.shape)
    plt.figure(figsize=(10, 6))
    plt.contourf(xx, yy, Z, cmap=cmap, alpha=0.5)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap, edgecolor="black")
def main():
    """Train a depth-3 tree on two iris features and plot its boundary.

    Downloads the iris data set from the UCI repository, so network access
    is required.
    """
    iris = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", header=None).values
    np.random.seed(1)
    np.random.shuffle(iris)
    # Columns 0 and 2 of the file are used as the two plotted features.
    # ``.values`` on the mixed-type frame is an object array, so cast.
    X_train = iris[:100, [0, 2]].astype(float)
    label_ids = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    y_train = np.array([label_ids[name] for name in iris[:100, -1]])
    dt = DecisionTree(max_depth=3)
    dt.fit(X_train, y_train)
    plot_decision_boundary(dt, X_train, y_train)
    plt.show()


if __name__ == "__main__":
    main()
```
注意:上述代码可以直接在 Jupyter Notebook 中运行;如果作为普通脚本运行,plt.show() 会弹出图形窗口,在没有图形界面的环境中可改用 plt.savefig() 将图像保存为文件。另外,代码运行时需要联网以下载鸢尾花数据集。
阅读全文