决策树分类python代码_分类算法-决策树 Decision Tree
时间: 2023-11-27 11:03:27 浏览: 65
以下是一个简单的决策树分类的Python代码示例:
```
from sklearn import tree
# Training data: one [feature_1, feature_2] row per sample plus its class label
labels = [0, 0, 1, 1]
features = [[140, 1], [130, 1], [150, 0], [170, 0]]
# Create the classifier and fit it to the samples in one step
clf = tree.DecisionTreeClassifier().fit(features, labels)
# Classify a sample that was not part of the training data
prediction = clf.predict([[160, 0]])
print(prediction)
```
在这个代码示例中,我们定义了一个包含4个样本的数据集,每个样本由两个特征(重量,以及用0/1编码的颜色)组成,并给出了对应的标签(0或1)。然后使用Scikit-learn库中的DecisionTreeClassifier类来训练决策树模型,并用训练好的模型预测新数据。在这个例子中,我们预测一个重量为160克、颜色为红(编码为0)的水果是什么类型。输出结果为`[1]`,即预测标签为1,表示这是一个橙子。
相关问题
python决策树算法代码_决策树的Python实现(含代码)
下面是一份使用Python实现决策树算法的代码:
```python
import numpy as np
def entropy(y):
    """Return the Shannon entropy, in bits, of the labels in ``y``.

    Args:
        y: Array-like of class labels.

    Returns:
        The entropy as a float. An empty input or an input holding a single
        distinct label yields ``0.0``.
    """
    y = np.asarray(y)
    if y.size == 0:
        return 0.0
    _, counts = np.unique(y, return_counts=True)
    p = counts / y.size
    # Adding 0.0 turns the ``-0.0`` produced for a pure label set into ``0.0``.
    return float(-np.sum(p * np.log2(p)) + 0.0)
class DecisionTree:
    """Binary classification tree grown by minimising weighted Gini impurity.

    Features are treated as ordered values: an internal node sends a sample
    to its left child when ``sample[feature_index] < threshold`` and to its
    right child otherwise.

    Args:
        max_depth: Maximum number of splits on any root-to-leaf path.
            ``None`` (the default) places no limit on the depth.
        min_samples_split: Smallest number of samples a node must hold before
            a split is attempted.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def fit(self, X, y):
        """Grow the tree from training data.

        Args:
            X: Array-like of shape ``(n_samples, n_features)``.
            y: Array-like holding one class label per sample.

        Raises:
            ValueError: If ``X`` is not two-dimensional, the inputs are empty,
                or ``X`` and ``y`` disagree on the number of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        if X.ndim != 2:
            raise ValueError("X must be 2-D with shape (n_samples, n_features)")
        if y.size == 0 or X.shape[0] != y.size:
            raise ValueError("X and y must be non-empty and equally long")
        # Encode labels as 0..n_classes-1 so they can index the count lists
        # used while growing the tree; ``classes_`` maps them back.
        self.classes_, encoded = np.unique(y, return_inverse=True)
        self.n_classes_ = self.classes_.size
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, encoded)

    def predict(self, X):
        """Return a list with the predicted class label of every row in ``X``."""
        return [self._predict(inputs) for inputs in X]

    def _best_split(self, X, y):
        """Find the feature and threshold with the lowest weighted Gini.

        Args:
            X: Feature matrix of the samples at the current node.
            y: Encoded integer labels (``0 .. n_classes_ - 1``) of those
                samples.

        Returns:
            ``(feature_index, threshold)``, or ``(None, None)`` when no split
            lowers the impurity of the node.
        """
        m = y.size
        if m <= 1:
            return None, None
        n_classes = self.n_classes_
        num_parent = [int(np.sum(y == c)) for c in range(n_classes)]
        # A split is only accepted if it beats the impurity of the node itself.
        best_gini = 1.0 - sum((n / m) ** 2 for n in num_parent)
        best_idx, best_thr = None, None
        for idx in range(self.n_features_):
            order = np.argsort(X[:, idx], kind="stable")
            thresholds = X[order, idx]
            classes = y[order]
            num_left = [0] * n_classes
            num_right = num_parent.copy()
            # Sweep the sorted samples, moving one sample at a time from the
            # right partition to the left one.
            for i in range(1, m):
                c = classes[i - 1]
                num_left[c] += 1
                num_right[c] -= 1
                # A boundary between two equal values cannot separate them.
                if thresholds[i] == thresholds[i - 1]:
                    continue
                gini_left = 1.0 - sum((n / i) ** 2 for n in num_left)
                gini_right = 1.0 - sum((n / (m - i)) ** 2 for n in num_right)
                gini = (i * gini_left + (m - i) * gini_right) / m
                if gini < best_gini:
                    best_gini = gini
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2
        return best_idx, best_thr

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``(X, y)``.

        ``y`` holds encoded labels; the node stores the original label of the
        majority class as its prediction.
        """
        num_samples_per_class = [
            int(np.sum(y == i)) for i in range(self.n_classes_)
        ]
        predicted_class = self.classes_[int(np.argmax(num_samples_per_class))]
        node = Node(
            predicted_class=predicted_class,
            num_samples=len(y),
            num_samples_per_class=num_samples_per_class,
        )
        depth_allows_split = self.max_depth is None or depth < self.max_depth
        if (
            depth_allows_split
            and len(y) >= self.min_samples_split
            and np.unique(y).size > 1
        ):
            idx, thr = self._best_split(X, y)
            if idx is not None:
                indices_left = X[:, idx] < thr
                # The midpoint of two adjacent floats can round onto one of
                # them; never create a child without samples.
                if indices_left.any() and not indices_left.all():
                    node.feature_index = idx
                    node.threshold = thr
                    node.left = self._grow_tree(
                        X[indices_left], y[indices_left], depth + 1
                    )
                    node.right = self._grow_tree(
                        X[~indices_left], y[~indices_left], depth + 1
                    )
        return node

    def _predict(self, inputs):
        """Walk from the root to a leaf and return that leaf's class label."""
        node = self.tree_
        while node.left is not None:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.predicted_class
class Node:
    """One node of the Gini-based decision tree.

    Args:
        predicted_class: Class predicted for samples that end at this node.
        num_samples: Number of training samples that reached this node.
        num_samples_per_class: Per-class counts of those samples.

    All arguments are keyword-only. A freshly created node is a leaf: the
    split fields keep their placeholder values until the tree builder assigns
    ``left`` and ``right``.
    """

    def __init__(self, *, predicted_class, num_samples, num_samples_per_class):
        self.predicted_class = predicted_class
        self.num_samples = num_samples
        self.num_samples_per_class = num_samples_per_class
        # Split description; only meaningful once the children are set.
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None
```
这份代码实现了基于 Gini 系数的分类决策树算法。其中 `max_depth` 参数表示树的最大深度,`min_samples_split` 参数表示一个节点至少需要包含多少个样本才能进行分裂。使用时,可以按照下面的方式实例化并调用:
```python
clf = DecisionTree(max_depth=3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
```
其中 `X_train` 和 `y_train` 分别表示训练集的特征和标签,`X_test` 表示测试集的特征。
id3决策树 鸢尾花 python_C4.5决策树Python代码实现
ID3决策树(鸢尾花数据集)Python代码实现:
```python
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
class Node:
    """Node of the ID3 tree: a leaf (``target`` set) or an equality test.

    Args:
        feature: Index of the feature tested at an internal node.
        target: Class label stored at a leaf; ``None`` for internal nodes.
        left: Child followed when the sample's feature equals ``value``.
        right: Child followed for every other feature value.
        value: Feature value that routes a sample to ``left``.
    """

    def __init__(self, feature=None, target=None, left=None, right=None, value=None):
        self.feature = feature
        self.target = target
        self.left = left
        self.right = right
        self.value = value


class ID3DecisionTree:
    """Decision tree that picks features by information gain (ID3 criterion).

    The feature with the lowest conditional entropy is chosen at each node.
    Because nodes are binary, the node then separates the samples whose
    feature equals its smallest observed value (left) from all others
    (right); features with more than two values are handled by repeated
    splits further down the tree.
    """

    def __init__(self):
        self.tree = None  # root Node, set by fit()

    def _entropy(self, y):
        """Return the Shannon entropy (bits) of the labels ``y``."""
        if len(y) == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        # Adding 0.0 normalises the -0.0 obtained for a pure label set.
        return float(-np.sum(probs * np.log2(probs)) + 0.0)

    def _conditional_entropy(self, X, y, feature):
        """Return the entropy of ``y`` given the value of column ``feature``."""
        column = X[:, feature]
        total = 0.0
        for value in np.unique(column):
            mask = column == value
            total += mask.mean() * self._entropy(y[mask])
        return total

    def _select_feature(self, X, y):
        """Return the splittable feature with the lowest conditional entropy.

        Features with a single distinct value cannot separate the samples and
        are skipped. Returns ``None`` when no feature can be split on.
        """
        best_feature, best_entropy = None, None
        for feature in range(X.shape[1]):
            if len(np.unique(X[:, feature])) < 2:
                continue
            cond_entropy = self._conditional_entropy(X, y, feature)
            if best_entropy is None or cond_entropy < best_entropy:
                best_feature, best_entropy = feature, cond_entropy
        return best_feature

    def _build_tree(self, X, y):
        """Recursively build and return the subtree for ``(X, y)``."""
        labels, counts = np.unique(y, return_counts=True)
        if len(labels) == 1:  # pure node: leaf with that class
            return Node(target=y[0])
        feature = self._select_feature(X, y)
        if feature is None:  # nothing left to split on: majority-class leaf
            return Node(target=labels[np.argmax(counts)])
        column = X[:, feature]
        value = np.unique(column)[0]
        goes_left = column == value
        # The chosen feature has at least two values, so both sides are
        # non-empty and each child sees strictly fewer distinct values.
        left = self._build_tree(X[goes_left], y[goes_left])
        right = self._build_tree(X[~goes_left], y[~goes_left])
        return Node(feature=feature, left=left, right=right, value=value)

    def fit(self, X, y):
        """Build the tree from training data.

        Args:
            X: Array-like of shape ``(n_samples, n_features)``.
            y: Array-like holding one class label per sample.

        Raises:
            ValueError: If ``X`` is not two-dimensional, the inputs are empty,
                or ``X`` and ``y`` disagree on the number of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-D with shape (n_samples, n_features)")
        if len(y) == 0 or len(X) != len(y):
            raise ValueError("X and y must be non-empty and equally long")
        self.tree = self._build_tree(X, y)

    def _predict_sample(self, x):
        """Return the predicted class label for a single sample ``x``."""
        if self.tree is None:
            raise RuntimeError("fit must be called before predict")
        node = self.tree
        while node.target is None:
            # Compare against the value learned in training, not against
            # any data outside the model.
            node = node.left if x[node.feature] == node.value else node.right
        return node.target

    def predict(self, X):
        """Return an array with the predicted label of every row in ``X``."""
        return np.array([self._predict_sample(x) for x in X])
# Load the iris data set
iris = load_iris()
X, y = iris.data, iris.target
# Hold out 20% of the samples for testing
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=1
)
# Fit the tree on the training split
model = ID3DecisionTree()
model.fit(train_X, train_y)
# Predict the held-out samples
pred_y = model.predict(test_X)
# Accuracy = correctly predicted samples / number of test samples
num_correct = np.sum(pred_y == test_y)
accuracy = num_correct / len(test_y)
print('Accuracy:', accuracy)
```
C4.5决策树 Python代码实现:
```python
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
class Node:
    """Node of the C4.5 tree: a leaf (``target`` set) or a threshold test.

    Args:
        feature: Index of the feature tested at an internal node.
        threshold: Samples with ``x[feature] <= threshold`` go to ``left``.
        target: Class label stored at a leaf; ``None`` for internal nodes.
        left: Child for samples at or below the threshold.
        right: Child for samples above the threshold.
    """

    def __init__(self, feature=None, threshold=None, target=None, left=None, right=None):
        self.feature = feature
        self.threshold = threshold
        self.target = target
        self.left = left
        self.right = right
class C45DecisionTree:
    """Binary decision tree that chooses threshold splits by gain ratio.

    Args:
        min_samples_split: Minimum number of samples required on EACH side of
            a candidate split for it to be considered.
        min_gain_ratio: Minimum gain ratio a split must reach; a node where
            no candidate reaches it becomes a leaf.
    """

    def __init__(self, min_samples_split=2, min_gain_ratio=1e-4):
        self.min_samples_split = min_samples_split
        self.min_gain_ratio = min_gain_ratio
        self.tree = None  # root Node, set by fit()

    def _entropy(self, y):
        """Return the Shannon entropy (bits) of the labels ``y``."""
        if len(y) == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        # Adding 0.0 normalises the -0.0 obtained for a pure label set.
        return float(-np.sum(probs * np.log2(probs)) + 0.0)

    def _conditional_entropy(self, X, y, feature, threshold):
        """Return the entropy of ``y`` after splitting ``feature`` at ``threshold``."""
        goes_left = X[:, feature] <= threshold
        weight_left = np.mean(goes_left)
        return (
            weight_left * self._entropy(y[goes_left])
            + (1.0 - weight_left) * self._entropy(y[~goes_left])
        )

    def _information_gain(self, X, y, feature, threshold):
        """Return the entropy reduction achieved by the given split."""
        return self._entropy(y) - self._conditional_entropy(X, y, feature, threshold)

    def _gain_ratio(self, X, y, feature, threshold):
        """Return information gain divided by the split information.

        A split that leaves one side empty has zero split information and is
        given a gain ratio of ``0.0``.
        """
        weight_left = np.mean(X[:, feature] <= threshold)
        # Guard first: 0 * log2(0) would evaluate to NaN, not 0.
        if weight_left <= 0.0 or weight_left >= 1.0:
            return 0.0
        weight_right = 1.0 - weight_left
        split_info = -(
            weight_left * np.log2(weight_left)
            + weight_right * np.log2(weight_right)
        )
        return self._information_gain(X, y, feature, threshold) / split_info

    def _select_feature_and_threshold(self, X, y):
        """Return the ``(feature, threshold)`` pair with the best gain ratio.

        Returns ``(None, None)`` when no candidate satisfies both the
        per-side sample minimum and ``min_gain_ratio``.
        """
        # Both sides must hold at least one sample, otherwise the recursion
        # in _build_tree would not shrink the data.
        min_side = max(1, self.min_samples_split)
        n_samples = len(y)
        best_ratio = None
        best_feature, best_threshold = None, None
        for feature in range(X.shape[1]):
            column = X[:, feature]
            for threshold in np.unique(column):
                n_left = int(np.sum(column <= threshold))
                if n_left < min_side or n_samples - n_left < min_side:
                    continue
                gain_ratio = self._gain_ratio(X, y, feature, threshold)
                if gain_ratio < self.min_gain_ratio:
                    continue
                if best_ratio is None or gain_ratio > best_ratio:
                    best_ratio = gain_ratio
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold

    def _build_tree(self, X, y):
        """Recursively build and return the subtree for ``(X, y)``."""
        labels, counts = np.unique(y, return_counts=True)
        if len(labels) == 1:  # pure node: leaf with that class
            return Node(target=y[0])
        majority = labels[np.argmax(counts)]
        if X.shape[1] == 0:  # no features at all: majority-class leaf
            return Node(target=majority)
        feature, threshold = self._select_feature_and_threshold(X, y)
        if feature is None or threshold is None:  # no acceptable split
            return Node(target=majority)
        goes_left = X[:, feature] <= threshold
        left = self._build_tree(X[goes_left], y[goes_left])
        right = self._build_tree(X[~goes_left], y[~goes_left])
        return Node(feature=feature, threshold=threshold, left=left, right=right)

    def fit(self, X, y):
        """Build the tree from training data.

        Args:
            X: Array-like of shape ``(n_samples, n_features)``.
            y: Array-like holding one class label per sample.

        Raises:
            ValueError: If ``X`` is not two-dimensional, the inputs are empty,
                or ``X`` and ``y`` disagree on the number of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-D with shape (n_samples, n_features)")
        if len(y) == 0 or len(X) != len(y):
            raise ValueError("X and y must be non-empty and equally long")
        self.tree = self._build_tree(X, y)

    def _predict_sample(self, x):
        """Return the predicted class label for a single sample ``x``."""
        node = self.tree
        while node.target is None:
            if x[node.feature] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.target

    def predict(self, X):
        """Return an array with the predicted label of every row in ``X``."""
        return np.array([self._predict_sample(x) for x in X])
# Load the iris data set
iris = load_iris()
X, y = iris.data, iris.target
# Hold out 20% of the samples for testing
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=1
)
# Fit the tree on the training split
model = C45DecisionTree(min_samples_split=5)
model.fit(train_X, train_y)
# Predict the held-out samples
pred_y = model.predict(test_X)
# Accuracy = correctly predicted samples / number of test samples
num_correct = np.sum(pred_y == test_y)
accuracy = num_correct / len(test_y)
print('Accuracy:', accuracy)
```
相关推荐
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)