Implement DecisionTreeClassifier in Python without using sklearn
Sure, here is a simple implementation.
First, define a node class representing each node of the decision tree:
```python
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, label=None):
        self.feature = feature      # index of the feature used for the split (internal nodes)
        self.threshold = threshold  # threshold value for the split (internal nodes)
        self.left = left            # left child: samples with feature value < threshold
        self.right = right          # right child: samples with feature value >= threshold
        self.label = label          # predicted class (leaf nodes only)
```
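A leaf node stores only a `label`, while an internal node stores the split (`feature`, `threshold`) and its two children. For illustration, a hypothetical two-node stump (the feature index and threshold below are made-up values):
```python
# A hypothetical stump: split on feature 0 at threshold 2.5
leaf_a = Node(label=0)
leaf_b = Node(label=1)
stump = Node(feature=0, threshold=2.5, left=leaf_a, right=leaf_b)
```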
Then define the decision tree classifier class:
```python
import numpy as np

class DecisionTreeClassifier:
    def __init__(self, max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.root = None

    def fit(self, X, y):
        self.root = self.build_tree(X, y)

    def build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))
        # Stopping conditions: max depth reached, too few samples, or a pure node
        if depth == self.max_depth or n_samples < self.min_samples_split or n_labels == 1:
            return Node(label=self.get_label(y))
        # Find the best feature and threshold to split on
        best_feature, best_threshold = self.get_best_split(X, y, n_samples, n_features)
        # Stop if no valid split was found
        if best_feature is None or best_threshold is None:
            return Node(label=self.get_label(y))
        # Recursively build the left and right subtrees
        left_idxs = X[:, best_feature] < best_threshold
        right_idxs = X[:, best_feature] >= best_threshold
        left = self.build_tree(X[left_idxs], y[left_idxs], depth + 1)
        right = self.build_tree(X[right_idxs], y[right_idxs], depth + 1)
        return Node(best_feature, best_threshold, left, right)

    def get_best_split(self, X, y, n_samples, n_features):
        best_gini = float('inf')
        best_feature = None
        best_threshold = None
        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                n_left = np.sum(X[:, feature] < threshold)
                n_right = n_samples - n_left
                # Enforce min_samples_leaf; this also rejects degenerate splits
                # (e.g. threshold at the feature minimum) that would leave one
                # side empty and cause unbounded recursion
                if n_left < self.min_samples_leaf or n_right < self.min_samples_leaf:
                    continue
                gini = self.get_gini(X, y, feature, threshold, n_samples)
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold

    def get_gini(self, X, y, feature, threshold, n_samples):
        left_idxs = X[:, feature] < threshold
        right_idxs = X[:, feature] >= threshold
        n_left = np.sum(left_idxs)
        n_right = np.sum(right_idxs)
        # Gini impurity of each side: 1 - sum(p_i^2)
        gini_left = 0
        if n_left > 0:
            _, counts = np.unique(y[left_idxs], return_counts=True)
            p = counts / n_left
            gini_left = 1 - np.sum(p ** 2)
        gini_right = 0
        if n_right > 0:
            _, counts = np.unique(y[right_idxs], return_counts=True)
            p = counts / n_right
            gini_right = 1 - np.sum(p ** 2)
        # Weighted average of the two sides' impurities
        return (n_left / n_samples) * gini_left + (n_right / n_samples) * gini_right

    def get_label(self, y):
        # Majority class: np.argmax(counts) is an index into the unique values,
        # so return values[...] rather than the index itself
        values, counts = np.unique(y, return_counts=True)
        return values[np.argmax(counts)]

    def predict(self, X):
        y_pred = []
        for sample in X:
            node = self.root
            # Walk down the tree until a leaf (leaves have no children)
            while node.left:
                if sample[node.feature] < node.threshold:
                    node = node.left
                else:
                    node = node.right
            y_pred.append(node.label)
        return y_pred
```
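For reference, `get_gini()` computes the Gini impurity `1 - sum(p_i^2)` of each side of a candidate split and weights the two sides by their sample counts: 0 means a pure node, higher values mean more class mixing. A quick hand check (the arrays here are made up purely for illustration):
```python
import numpy as np

# A perfect 50/50 two-class mix has impurity 1 - (0.5^2 + 0.5^2) = 0.5
y_mixed = np.array([0, 0, 1, 1])
_, counts = np.unique(y_mixed, return_counts=True)
p = counts / len(y_mixed)
print(1 - np.sum(p ** 2))  # 0.5

# A pure node has impurity 0
y_pure = np.array([1, 1, 1, 1])
_, counts = np.unique(y_pure, return_counts=True)
p = counts / len(y_pure)
print(1 - np.sum(p ** 2))  # 0.0
```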
Here, the `fit()` method trains the decision tree and the `predict()` method classifies new samples.
Finally, a small test program (the question only forbids using sklearn for the classifier itself, so it is still used here to load the data and score the predictions):
```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the iris dataset and hold out 20% for testing
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = DecisionTreeClassifier(max_depth=3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc}")
```
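As an optional sanity check, continuing from the test script above, the same split can be fed to sklearn's own tree and the two accuracies compared; they are usually close, though not necessarily identical, because split-selection details differ:
```python
from sklearn.tree import DecisionTreeClassifier as SklearnTree

sk_clf = SklearnTree(max_depth=3, random_state=42)
sk_clf.fit(X_train, y_train)
sk_acc = accuracy_score(y_test, sk_clf.predict(X_test))
print(f"sklearn accuracy: {sk_acc}")
```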
That completes a simple decision tree classifier, which can be tested with the iris dataset.