Implementing a decision tree algorithm in Python
Below is a basic Python implementation of a decision tree algorithm:
```python
import numpy as np

class DecisionTree:
    def __init__(self):
        pass
    # Compute the information entropy of a label array
    def calcEntropy(self, y):
        unique_y = np.unique(y)
        entropy = 0
        for i in unique_y:
            p = len(y[y == i]) / len(y)
            entropy -= p * np.log2(p)
        return entropy
    # Compute the Gini index of a label array
    def calcGini(self, y):
        unique_y = np.unique(y)
        gini = 1
        for i in unique_y:
            p = len(y[y == i]) / len(y)
            gini -= p ** 2
        return gini
    # Compute the misclassification error rate: 1 minus the
    # proportion of the majority class
    def calcError(self, y):
        _, counts = np.unique(y, return_counts=True)
        return 1 - np.max(counts) / len(y)
    # Choose the best feature by information gain
    def chooseFeatureByEntropy(self, X, y):
        n_features = X.shape[1]
        entropy_base = self.calcEntropy(y)
        max_gain = 0
        best_feature_index = -1
        for i in range(n_features):
            feature_i = X[:, i]
            unique_feature_i = np.unique(feature_i)
            # A constant feature cannot split the data
            if len(unique_feature_i) == 1:
                continue
            # Weighted average entropy of the subsets after splitting on i
            entropy_i = 0
            for j in unique_feature_i:
                y_j = y[feature_i == j]
                entropy_i += len(y_j) / len(y) * self.calcEntropy(y_j)
            gain_i = entropy_base - entropy_i
            if gain_i > max_gain:
                max_gain = gain_i
                best_feature_index = i
        return best_feature_index
    # Choose the best feature by weighted Gini index
    def chooseFeatureByGini(self, X, y):
        n_features = X.shape[1]
        min_gini = float('inf')
        best_feature_index = -1
        for i in range(n_features):
            feature_i = X[:, i]
            unique_feature_i = np.unique(feature_i)
            # A constant feature cannot split the data
            if len(unique_feature_i) == 1:
                continue
            # Weighted average Gini index of the subsets after splitting on i
            gini_i = 0
            for j in unique_feature_i:
                y_j = y[feature_i == j]
                gini_i += len(y_j) / len(y) * self.calcGini(y_j)
            if gini_i < min_gini:
                min_gini = gini_i
                best_feature_index = i
        return best_feature_index
    # Choose the best feature by weighted misclassification error
    def chooseFeatureByError(self, X, y):
        n_features = X.shape[1]
        min_error = float('inf')
        best_feature_index = -1
        for i in range(n_features):
            feature_i = X[:, i]
            unique_feature_i = np.unique(feature_i)
            # A constant feature cannot split the data
            if len(unique_feature_i) == 1:
                continue
            # Weighted average error rate of the subsets after splitting on i
            error_i = 0
            for j in unique_feature_i:
                y_j = y[feature_i == j]
                error_i += len(y_j) / len(y) * self.calcError(y_j)
            if error_i < min_error:
                min_error = error_i
                best_feature_index = i
        return best_feature_index
    # Recursively build the decision tree. A leaf is a class label; an
    # internal node is a dict of the form
    # {feature_index: {feature_value: subtree_or_leaf, ...}}
    def buildDecisionTree(self, X, y, feature_choose='entropy'):
        unique_y = np.unique(y)
        # All samples share one label: return it as a leaf
        if len(unique_y) == 1:
            return unique_y[0]
        # No features left: return the majority label
        # (np.bincount assumes non-negative integer labels)
        if X.shape[1] == 0:
            return np.argmax(np.bincount(y))
        if feature_choose == 'entropy':
            best_feature_index = self.chooseFeatureByEntropy(X, y)
        elif feature_choose == 'gini':
            best_feature_index = self.chooseFeatureByGini(X, y)
        else:
            best_feature_index = self.chooseFeatureByError(X, y)
        # No usable split found: return the majority label
        if best_feature_index == -1:
            return np.argmax(np.bincount(y))
        best_feature = X[:, best_feature_index]
        unique_best_feature = np.unique(best_feature)
        # Record the split feature's index so predict() knows which
        # feature to test at this node
        decision_tree = {best_feature_index: {}}
        for i in unique_best_feature:
            X_i = X[best_feature == i]
            y_i = y[best_feature == i]
            decision_tree[best_feature_index][i] = self.buildDecisionTree(X_i, y_i, feature_choose)
        return decision_tree
    # Predict the label of a single sample x (a 1-D feature array)
    def predict(self, decision_tree, x):
        # A non-dict node is a leaf holding the predicted label
        if not isinstance(decision_tree, dict):
            return decision_tree
        # An internal node is keyed by the index of its split feature
        feature_index = list(decision_tree.keys())[0]
        # Follow the branch matching this sample's value for that feature;
        # this assumes the value was seen during training
        sub_tree = decision_tree[feature_index][x[feature_index]]
        return self.predict(sub_tree, x)
```
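The tree returned by `buildDecisionTree` is a nested dictionary: each internal node is keyed by the index of the feature it splits on, and that key maps each observed feature value to a subtree or a leaf label. As a purely illustrative sketch (the concrete indices and values depend on your data), a tree that first splits on feature 0 and then, on one branch, on feature 1 might look like:
```python
# Hypothetical learned tree: if feature 0 is 0, predict class 0;
# if it is 1, test feature 1, predicting class 1 or class 0.
{0: {0: 0,
     1: {1: {0: 1, 1: 0}}}}
```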
This implementation includes methods for computing a dataset's information entropy, Gini index, and misclassification error rate, methods for choosing the best split feature under each of these criteria (information gain, Gini index, or error rate), and methods for recursively building the tree and making predictions. For a label set with class proportions $p_k$, the three impurity measures are $H = -\sum_k p_k \log_2 p_k$, $G = 1 - \sum_k p_k^2$, and $E = 1 - \max_k p_k$.
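For illustration, here is a minimal usage sketch on a made-up, integer-encoded categorical dataset (the data and variable names are assumptions, not part of the original; labels must be non-negative integers because the majority vote uses `np.bincount`):
```python
import numpy as np

# Toy dataset: two categorical features encoded as integers
X = np.array([[0, 0],
              [0, 1],
              [1, 0],
              [1, 1],
              [1, 1]])
y = np.array([0, 0, 1, 1, 1])

model = DecisionTree()
tree = model.buildDecisionTree(X, y, feature_choose='entropy')
print(tree)                                    # nested-dict form of the tree
print(model.predict(tree, np.array([0, 1])))   # predicted class for one sample
```
Note that `predict` handles one sample at a time and raises a `KeyError` for a feature value never seen during training; a production implementation would add a fallback (for example, the majority class) for that case, along with pruning and support for continuous features.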