请用 Python 实现模糊决策树
时间: 2023-08-28 19:06:03 浏览: 254
基于决策树方法的数据挖掘分析.pdf
以下是Python实现模糊决策树的示例代码:
```python
import numpy as np
# Membership function of a triangular fuzzy set.
def triangular_membership(x, a, b, c):
    """Return the degree to which ``x`` belongs to the triangle ``(a, b, c)``.

    The membership rises linearly from 0 at ``a`` to 1 at the peak ``b`` and
    falls linearly back to 0 at ``c``.

    Args:
        x: Scalar value to evaluate.
        a: Left foot of the triangle (membership 0).
        b: Peak of the triangle (membership 1).
        c: Right foot of the triangle (membership 0).

    Returns:
        The membership degree, a number in ``[0, 1]``.
    """
    # Check the peak first: when the triangle is degenerate (a == b or
    # b == c) the peak coincides with a foot, and the foot test below would
    # otherwise report 0 for a point that is fully inside the set.
    if a <= b <= c and x == b:
        return 1.0
    if x <= a or x >= c:
        return 0.0
    if x <= b:
        # Here a < x <= b, so b - a > 0 and the division is safe.
        return (x - a) / (b - a)
    # Here b < x < c, so c - b > 0 and the division is safe.
    return (c - x) / (c - b)
# Tree node.
class Node:
    """One node of the decision tree.

    An internal node carries a split description (``feature_index``,
    ``threshold``, ``operator``) together with its two children; a leaf node
    carries only ``label``. Every field defaults to ``None``.
    """

    def __init__(self, feature_index=None, threshold=None, operator=None, left=None, right=None, label=None):
        # Split description: index of the feature being tested, the
        # threshold of the test, and the comparison operator.
        self.feature_index, self.threshold, self.operator = (
            feature_index,
            threshold,
            operator,
        )
        # Left and right child nodes.
        self.left, self.right = left, right
        # Label stored on a leaf node.
        self.label = label
# Fuzzy decision tree classifier.
class FuzzyDecisionTree:
    """Binary decision-tree classifier whose splits are derived from fuzzy sets.

    At every node two fuzzy sets are built per feature from the training
    samples that reach the node:

    * "low" (operator ``'<='``): membership 1 up to the 33rd percentile,
      falling linearly to 0 at the 67th percentile;
    * "high" (operator ``'>'``): membership 0 up to the 67th percentile,
      rising linearly to 1 at the maximum.

    Each candidate is scored with a membership-weighted Gini index, and the
    samples are routed with the 0.5 alpha-cut of the chosen fuzzy set. The
    cut is stored in the node as a threshold in feature units (the point
    where the membership equals 0.5), so prediction needs no statistics of
    the data being predicted.

    Args:
        max_depth: Maximum depth of the tree.
        min_samples_split: Minimum number of samples required to split a node.
        min_impurity_decrease: When positive, a node is only split if the
            split lowers the Gini impurity by at least this amount.
    """

    def __init__(self, max_depth=5, min_samples_split=2, min_impurity_decrease=0):
        self.max_depth = max_depth  # maximum tree depth
        self.min_samples_split = min_samples_split  # minimum samples needed to split
        self.min_impurity_decrease = min_impurity_decrease  # minimum impurity decrease
        self.tree = None  # root node, set by fit()

    # Gini index of a label array (node impurity measure).
    def _gini_index(self, y):
        """Return the Gini index of the labels ``y`` (0.0 for empty input)."""
        y = np.asarray(y)
        if y.size == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        proportions = counts / y.size
        return float(1.0 - np.sum(proportions ** 2))

    # Shannon entropy of a label array (alternative impurity measure).
    def _entropy(self, y):
        """Return the entropy in bits of the labels ``y`` (0.0 for empty input).

        Tree construction itself uses the Gini index; this method is kept as
        an alternative impurity measure.
        """
        y = np.asarray(y)
        if y.size == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        proportions = counts / y.size
        return float(-np.sum(proportions * np.log2(proportions)))

    @staticmethod
    def _majority_label(y):
        """Return the most frequent label in ``y`` (smallest label on ties)."""
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    @staticmethod
    def _weighted_gini(class_ids, n_classes, weights):
        """Return the Gini index of a child in which sample i counts ``weights[i]``."""
        total = weights.sum()
        if total <= 0:
            return 0.0
        class_weight = np.bincount(class_ids, weights=weights, minlength=n_classes)
        proportions = class_weight / total
        return 1.0 - np.sum(proportions ** 2)

    @staticmethod
    def _goes_left(values, threshold, operator):
        """Apply a node's test; works on a scalar or on a whole column."""
        if operator == '<=':
            return values <= threshold
        return values > threshold

    # Search for the best feature / threshold / operator.
    def _find_best_split(self, X, y):
        """Return the best fuzzy split of the samples ``X`` with labels ``y``.

        Returns:
            A tuple ``(feature_index, threshold, operator, impurity)``. When
            no candidate separates the samples into two non-empty groups the
            tuple is ``(None, None, None, 1)``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y).ravel()
        best_feature_index, best_threshold, best_operator, best_impurity = None, None, None, 1
        if X.ndim != 2 or X.shape[0] == 0:
            return best_feature_index, best_threshold, best_operator, best_impurity
        n_samples, n_features = X.shape
        # Encode labels as 0..k-1 so that weighted class totals can be
        # accumulated with np.bincount whatever the label type is.
        _, class_ids = np.unique(y, return_inverse=True)
        class_ids = class_ids.reshape(-1)
        n_classes = int(class_ids.max()) + 1

        for feature_index in range(n_features):
            column = X[:, feature_index]
            # One percentile call per feature instead of one per sample.
            p33, p67, p100 = np.percentile(column, [33, 67, 100])
            # (operator, start of the ramp, end of the ramp)
            candidates = (('<=', p33, p67), ('>', p67, p100))
            for operator, lower, upper in candidates:
                # The membership equals 0.5 exactly at the middle of the ramp.
                threshold = (lower + upper) / 2.0
                goes_left = self._goes_left(column, threshold, operator)
                n_left = int(np.count_nonzero(goes_left))
                if n_left == 0 or n_left == n_samples:
                    continue  # the cut does not separate anything
                width = upper - lower
                if width > 0:
                    if operator == '<=':
                        ramp = (upper - column) / width
                    else:
                        ramp = (column - lower) / width
                    left_weight = np.clip(ramp, 0.0, 1.0)
                else:
                    # Zero-width ramp: the fuzzy set degenerates to a crisp step.
                    left_weight = goes_left.astype(float)
                right_weight = 1.0 - left_weight
                impurity = (
                    left_weight.sum() * self._weighted_gini(class_ids, n_classes, left_weight)
                    + right_weight.sum() * self._weighted_gini(class_ids, n_classes, right_weight)
                ) / n_samples
                if impurity < best_impurity:
                    best_feature_index, best_threshold = feature_index, float(threshold)
                    best_operator, best_impurity = operator, float(impurity)
        return best_feature_index, best_threshold, best_operator, best_impurity

    # Build the tree recursively.
    def _create_tree(self, X, y, depth):
        """Return the root ``Node`` of the subtree grown from ``X`` / ``y``."""
        n_samples = X.shape[0]
        # Too few samples or depth limit reached: majority-vote leaf.
        if n_samples < self.min_samples_split or depth >= self.max_depth:
            return Node(label=self._majority_label(y))
        # A pure node cannot be improved by splitting.
        node_impurity = self._gini_index(y)
        if node_impurity == 0:
            return Node(label=self._majority_label(y))
        feature_index, threshold, operator, split_impurity = self._find_best_split(X, y)
        # No candidate separates the samples: majority-vote leaf.
        if feature_index is None or threshold is None:
            return Node(label=self._majority_label(y))
        # The check is only active for a positive setting, so floating-point
        # noise around a zero gain never blocks a split under the default.
        if self.min_impurity_decrease > 0 and node_impurity - split_impurity < self.min_impurity_decrease:
            return Node(label=self._majority_label(y))
        # _find_best_split only returns cuts with two non-empty sides, so
        # neither recursive call receives an empty sample set.
        goes_left = self._goes_left(X[:, feature_index], threshold, operator)
        left = self._create_tree(X[goes_left], y[goes_left], depth + 1)
        right = self._create_tree(X[~goes_left], y[~goes_left], depth + 1)
        return Node(feature_index=feature_index, threshold=threshold, operator=operator, left=left, right=right)

    # Fit the model.
    def fit(self, X, y):
        """Grow the tree from training samples.

        Args:
            X: Array-like of shape ``(n_samples, n_features)`` with numeric values.
            y: Array-like of ``n_samples`` class labels.

        Raises:
            ValueError: If ``X`` is not two-dimensional, is empty, or its
                number of rows differs from the number of labels.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y).ravel()
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if X.shape[0] == 0:
            raise ValueError("cannot fit a tree on an empty training set")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        self.tree = self._create_tree(X, y, 0)

    # Predict sample labels.
    def predict(self, X):
        """Return a list with the predicted label of every row of ``X``.

        Raises:
            AttributeError: If called before ``fit``.
        """
        if self.tree is None:
            raise AttributeError("predict() called before fit(): the tree has not been built")
        X = np.asarray(X, dtype=float)
        y_pred = []
        for sample in X:
            node = self.tree
            # Walk down until a leaf (a node that carries a label) is reached.
            while node.label is None:
                value = sample[node.feature_index]
                if self._goes_left(value, node.threshold, node.operator):
                    node = node.left
                else:
                    node = node.right
            y_pred.append(node.label)
        return y_pred
```
需要注意的是，模糊决策树的划分方式不同于传统决策树，因此需要重新定义节点类和创建模糊决策树的方法。在示例代码中，我们使用三角隶属度函数来定义模糊集合的隶属度，并以特征值的第 33 和第 67 百分位数为分界，将每个特征划分为“低”“中”“高”三个模糊集合。在寻找最优划分特征和阈值时，我们对每个特征的候选模糊划分计算不纯度，并选择不纯度最小的划分方式作为最优划分方式。在拆分数据集时，我们根据划分特征和阈值，按照每个样本对相应模糊集合的隶属度来决定该样本应该进入左子节点还是右子节点。在预测样本标签时，我们从根节点出发逐层向下查找每个样本所属的叶子节点，并将该叶子节点的标签作为预测结果。
阅读全文