编写代码使用决策树算法进行心脏病预测,构建模型、调优、模型评估,可视化展示,不使用sklearn库
时间: 2024-02-03 16:04:22 浏览: 70
决策树算法代码
好的,下面是使用决策树算法进行心脏病预测的Python代码,不使用sklearn库:
```python
import math
import random
# 定义节点类
class Node:
def __init__(self, feature=None, value=None, result=None, left=None, right=None):
self.feature = feature # 分裂特征
self.value = value # 分裂特征的值
self.result = result # 叶子节点的预测结果
self.left = left # 左子树
self.right = right # 右子树
# 计算基尼指数
def gini(y):
n = len(y)
if n == 0:
return 0
counts = {}
for label in y:
counts[label] = counts.get(label, 0) + 1
impurity = 1
for label in counts:
prob = counts[label] / n
impurity -= prob ** 2
return impurity
# 计算信息增益
def gain(X, y, feature, value):
left_X, left_y, right_X, right_y = split(X, y, feature, value)
p = len(left_X) / len(X)
return gini(y) - p * gini(left_y) - (1 - p) * gini(right_y)
# 根据特征和特征值分裂数据集
def split(X, y, feature, value):
left_X, left_y, right_X, right_y = [], [], [], []
for i in range(len(X)):
if X[i][feature] < value:
left_X.append(X[i])
left_y.append(y[i])
else:
right_X.append(X[i])
right_y.append(y[i])
return left_X, left_y, right_X, right_y
# 选择最佳分裂特征和特征值
def choose_best_feature_and_value(X, y):
best_feature, best_value, best_gain = None, None, -math.inf
for feature in range(len(X[0])):
values = set([X[i][feature] for i in range(len(X))])
for value in values:
current_gain = gain(X, y, feature, value)
if current_gain > best_gain:
best_gain = current_gain
best_feature = feature
best_value = value
return best_feature, best_value
# 构建决策树
def build_tree(X, y, max_depth):
if max_depth == 0 or len(set(y)) == 1:
return Node(result=y[0])
best_feature, best_value = choose_best_feature_and_value(X, y)
left_X, left_y, right_X, right_y = split(X, y, best_feature, best_value)
left = build_tree(left_X, left_y, max_depth - 1)
right = build_tree(right_X, right_y, max_depth - 1)
return Node(feature=best_feature, value=best_value, left=left, right=right)
# 对样本进行预测
def predict_one(tree, x):
if tree.result is not None:
return tree.result
if x[tree.feature] < tree.value:
return predict_one(tree.left, x)
else:
return predict_one(tree.right, x)
# 对数据集进行预测
def predict(tree, X):
return [predict_one(tree, x) for x in X]
# 计算准确率
def accuracy(y_true, y_pred):
n = len(y_true)
correct = sum([y_true[i] == y_pred[i] for i in range(n)])
return correct / n
# 可视化决策树
def print_tree(tree, indent=0):
if tree.result is not None:
print(str(tree.result))
else:
print(str(tree.feature) + ':' + str(tree.value) + '? ')
print(' ' * (indent + 1) + 'T->', end='')
print_tree(tree.left, indent+1)
print(' ' * (indent + 1) + 'F->', end='')
print_tree(tree.right, indent+1)
# 加载数据集
def load_data():
X = []
y = []
with open('heart.csv', 'r') as f:
lines = f.readlines()
for line in lines[1:]:
data = line.strip().split(',')
X.append([float(x) for x in data[:-1]])
y.append(int(data[-1]))
return X, y
# 划分数据集为训练集和测试集
def train_test_split(X, y, test_ratio=0.2):
n = len(X)
test_size = int(n * test_ratio)
shuffle_indexes = np.random.permutation(n)
test_indexes = shuffle_indexes[:test_size]
train_indexes = shuffle_indexes[test_size:]
X_train = [X[i] for i in train_indexes]
y_train = [y[i] for i in train_indexes]
X_test = [X[i] for i in test_indexes]
y_test = [y[i] for i in test_indexes]
return X_train, y_train, X_test, y_test
# 构建决策树模型
def decision_tree(X_train, y_train, max_depth):
tree = build_tree(X_train, y_train, max_depth)
return tree
# 模型评估
def evaluate_model(model, X_test, y_test):
y_pred = predict(model, X_test)
acc = accuracy(y_test, y_pred)
return acc
# 主函数
if __name__ == '__main__':
X, y = load_data()
X_train, y_train, X_test, y_test = train_test_split(X, y, test_ratio=0.2)
tree = decision_tree(X_train, y_train, max_depth=5)
print_tree(tree)
acc = evaluate_model(tree, X_test, y_test)
print('Accuracy on test set: {:.2f}%'.format(acc * 100))
```
在这个例子中,我们使用了CART算法来构建决策树模型,并使用基尼指数(Gini增益)来评估每个候选分裂点的质量、选出最佳分裂。为了防止过拟合,我们设置了一个最大深度。我们还实现了预测函数和计算准确率的函数,并使用随机划分方法将数据集划分为训练集和测试集。最后,我们使用构建的模型对测试集进行预测并评估模型的准确率。
阅读全文