不使用 sklearn 库,通过手写决策树算法进行心脏病预测的代码(包括数据处理、算法建模、算法调优、模型评估和可视化)
时间: 2024-01-31 21:02:00 浏览: 75
好的,下面是一个基于Python的决策树算法实现心脏病预测的示例代码,包括数据处理、算法建模、算法调优、模型评估和可视化:
```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the data
data = pd.read_csv('heart.csv')

# Data preprocessing: every column except the last is a feature,
# the last column is the label.
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Convert the categorical columns to integer codes without sklearn.
# np.unique(..., return_inverse=True) returns, for each element, its index in
# the sorted array of distinct values, i.e. codes 0..k-1 in sorted order.
CATEGORICAL_COLUMNS = (1, 2, 5, 6, 8)
for column in CATEGORICAL_COLUMNS:
    _, codes = np.unique(X[:, column], return_inverse=True)
    X[:, column] = codes
_, y = np.unique(y, return_inverse=True)

# Split the data into a training set and a test set (80% / 20%).
# A seeded generator keeps the split reproducible between runs.
TEST_SIZE = 0.2
RANDOM_STATE = 0
shuffled_indices = np.random.default_rng(RANDOM_STATE).permutation(len(X))
n_test = int(np.ceil(TEST_SIZE * len(X)))
test_indices = shuffled_indices[:n_test]
train_indices = shuffled_indices[n_test:]
X_train, X_test = X[train_indices], X[test_indices]
y_train, y_test = y[train_indices], y[test_indices]
# 构建决策树模型
class Node:
    """A single node of the decision tree.

    An internal node stores a split (``feature`` index and ``threshold``) and
    its two children. A leaf stores only ``value``, the label it predicts.
    ``feature is None`` is what marks a node as a leaf during prediction.

    Args:
        feature: Column index tested at this node, or None for a leaf.
        threshold: Samples with ``x[feature] < threshold`` go to the left child.
        value: Predicted label (leaves only).
        left_tree: Child for samples below the threshold.
        right_tree: Child for all other samples.
    """

    def __init__(self, feature=None, threshold=None, value=None,
                 left_tree=None, right_tree=None):
        self.feature = feature
        self.threshold = threshold
        self.value = value
        self.left_tree = left_tree
        self.right_tree = right_tree


class DecisionTree:
    """Binary classification tree grown by greedy impurity reduction.

    Every internal node tests ``x[feature] < threshold``. Candidate thresholds
    are the distinct values of each feature in the samples reaching the node.

    Args:
        max_depth: Maximum depth of the tree (the root is at depth 0).
        min_samples_split: A node with fewer samples than this becomes a leaf.
        min_samples_leaf: A split is rejected if either child would receive
            fewer samples than this.
        criterion: Impurity measure, ``'entropy'`` or ``'gini'``.

    Raises:
        ValueError: If ``criterion`` is not one of the supported names.
    """

    def __init__(self, max_depth=5, min_samples_split=2, min_samples_leaf=1, criterion='entropy'):
        if criterion not in ('entropy', 'gini'):
            raise ValueError(f"criterion must be 'entropy' or 'gini', got {criterion!r}")
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.criterion = criterion

    def fit(self, X, y):
        """Grow the tree on samples ``X`` (2-D) and labels ``y`` (1-D).

        Returns:
            The fitted estimator itself, to allow call chaining.

        Raises:
            ValueError: If the inputs are empty or their shapes do not match.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")
        if len(y) == 0:
            raise ValueError("cannot fit a tree on an empty data set")
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, y)
        return self

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``X``, ``y``."""
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))
        # Stop when the node is deep enough, pure, or too small to split.
        if depth >= self.max_depth or n_labels <= 1 or n_samples < self.min_samples_split:
            return Node(value=self._most_common_label(y))
        # Scan the features in a fixed order so that ties between equally
        # good splits are always broken the same way.
        feature_indices = np.arange(n_features)
        best_feature, best_threshold = self._best_split(X, y, feature_indices)
        # No split reduces impurity: make this node a leaf.
        if best_feature is None:
            return Node(value=self._most_common_label(y))
        left_indices = X[:, best_feature] < best_threshold
        # Use the complement (not ">=") so that every sample lands in exactly
        # one child, mirroring the if/else used in _predict.
        right_indices = ~left_indices
        left_tree = self._grow_tree(X[left_indices], y[left_indices], depth + 1)
        right_tree = self._grow_tree(X[right_indices], y[right_indices], depth + 1)
        return Node(feature=best_feature, threshold=best_threshold,
                    left_tree=left_tree, right_tree=right_tree)

    def _best_split(self, X, y, feature_indices):
        """Return ``(feature, threshold)`` with the largest positive gain.

        Returns ``(None, None)`` when no candidate split strictly reduces the
        impurity, which guarantees both children of a chosen split are
        non-empty.
        """
        best_gain = 0.0
        best_feature = None
        best_threshold = None
        # The parent impurity is the same for every candidate; compute it once.
        parent_impurity = self._impurity(y)
        for feature in feature_indices:
            # The smallest value would leave the left child empty, so skip it.
            thresholds = np.unique(X[:, feature])[1:]
            for threshold in thresholds:
                gain = self._information_gain(X, y, feature, threshold, parent_impurity)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold

    def _information_gain(self, X, y, feature, threshold, parent_impurity=None):
        """Return the impurity reduction of splitting on ``feature < threshold``.

        Returns 0 when either child would hold fewer than ``min_samples_leaf``
        samples (and always when a child would be empty).
        """
        left_indices = X[:, feature] < threshold
        n_left = int(np.count_nonzero(left_indices))
        n_right = len(y) - n_left
        min_leaf = max(1, self.min_samples_leaf)
        if n_left < min_leaf or n_right < min_leaf:
            return 0
        if parent_impurity is None:
            parent_impurity = self._impurity(y)
        left_impurity = self._impurity(y[left_indices])
        right_impurity = self._impurity(y[~left_indices])
        child_impurity = (n_left * left_impurity + n_right * right_impurity) / len(y)
        return parent_impurity - child_impurity

    def _impurity(self, y):
        """Return the impurity of ``y`` under the configured criterion."""
        if self.criterion == 'gini':
            return self._gini(y)
        return self._entropy(y)

    def _entropy(self, y):
        """Return the Shannon entropy (base 2) of the label distribution."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / np.sum(counts)
        return float(-np.sum(probabilities * np.log2(probabilities)))

    def _gini(self, y):
        """Return the Gini impurity of the label distribution."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / np.sum(counts)
        return float(1.0 - np.sum(probabilities ** 2))

    def _most_common_label(self, y):
        """Return the most frequent label in ``y`` (smallest label on ties)."""
        labels, counts = np.unique(y, return_counts=True)
        # argmax indexes the array of distinct labels, not the raw samples.
        return labels[np.argmax(counts)]

    def predict(self, X):
        """Return a list with the predicted label for every row of ``X``."""
        return [self._predict(inputs) for inputs in X]

    def _predict(self, inputs):
        """Walk from the root to a leaf for one sample and return its label."""
        node = self.tree_
        while node.feature is not None:
            if inputs[node.feature] < node.threshold:
                node = node.left_tree
            else:
                node = node.right_tree
        return node.value
# Train a baseline model with the default hyper-parameters
tree = DecisionTree()
tree.fit(X_train, y_train)

# Model tuning: exhaustive grid search scored by k-fold cross-validated
# accuracy. This is written by hand because DecisionTree is not an sklearn
# estimator and therefore cannot be passed to GridSearchCV.
from itertools import product

parameters = {'max_depth': [3, 5, 7, 9], 'min_samples_split': [2, 4, 6, 8], 'min_samples_leaf': [1, 2, 3, 4]}
N_FOLDS = 10


def cross_val_accuracy(params, X, y, n_folds=N_FOLDS, seed=0):
    """Return the mean validation accuracy of DecisionTree(**params).

    The samples are shuffled once with a seeded generator and cut into
    ``n_folds`` parts; each part is used once as the validation set while the
    model is fitted on the remaining parts.
    """
    order = np.random.default_rng(seed).permutation(len(X))
    folds = np.array_split(order, n_folds)
    scores = []
    for i, validation_indices in enumerate(folds):
        if len(validation_indices) == 0:
            # More folds than samples: nothing to validate on in this fold.
            continue
        training_indices = np.concatenate(folds[:i] + folds[i + 1:])
        model = DecisionTree(**params)
        model.fit(X[training_indices], y[training_indices])
        predictions = np.asarray(model.predict(X[validation_indices]))
        scores.append(np.mean(predictions == y[validation_indices]))
    return float(np.mean(scores))


best_accuracy = -1.0
best_parameters = None
parameter_names = list(parameters)
for values in product(*(parameters[name] for name in parameter_names)):
    candidate = dict(zip(parameter_names, values))
    score = cross_val_accuracy(candidate, X_train, y_train)
    if score > best_accuracy:
        best_accuracy = score
        best_parameters = candidate
print(f"Best cross-validated accuracy: {best_accuracy:.4f}")
print(f"Best parameters: {best_parameters}")

# Refit on the whole training set with the best parameters so that the
# evaluation below measures the tuned model rather than the baseline.
tree = DecisionTree(**best_parameters)
tree.fit(X_train, y_train)

# Model evaluation
y_pred = np.asarray(tree.predict(X_test))
labels = np.unique(np.concatenate([y_test, y_pred]))
# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = np.zeros((len(labels), len(labels)), dtype=int)
np.add.at(cm, (np.searchsorted(labels, y_test), np.searchsorted(labels, y_pred)), 1)
accuracy = float(np.mean(y_pred == y_test))
print("Confusion matrix:")
print(cm)
print(f"Test accuracy: {accuracy:.4f}")


# Visualization: draw the fitted tree with matplotlib
def _layout_tree(node, depth, positions, next_leaf_x):
    """Assign an (x, y) position to every node of the subtree.

    Leaves take consecutive x slots from left to right; an internal node sits
    midway above its two children. ``next_leaf_x`` is a one-element list used
    as a mutable counter shared across the recursion.
    """
    if node.feature is None:
        positions[id(node)] = (next_leaf_x[0], -depth)
        next_leaf_x[0] += 1
        return
    _layout_tree(node.left_tree, depth + 1, positions, next_leaf_x)
    _layout_tree(node.right_tree, depth + 1, positions, next_leaf_x)
    left_x = positions[id(node.left_tree)][0]
    right_x = positions[id(node.right_tree)][0]
    positions[id(node)] = ((left_x + right_x) / 2, -depth)


def plot_tree(root, feature_names):
    """Draw the tree rooted at ``root``; the left branch is the "true" branch."""
    positions = {}
    _layout_tree(root, 0, positions, [0])
    xs = [x for x, _ in positions.values()]
    ys = [y for _, y in positions.values()]
    width = max(6.0, 1.6 * (max(xs) - min(xs) + 1))
    height = max(4.0, 1.4 * (max(ys) - min(ys) + 1))
    fig, ax = plt.subplots(figsize=(width, height))

    def draw(node):
        x, y = positions[id(node)]
        if node.feature is None:
            text = f"class = {node.value}"
            face_color = 'lightgreen'
        else:
            text = f"{feature_names[node.feature]} < {float(node.threshold):g}"
            face_color = 'lightblue'
            for child in (node.left_tree, node.right_tree):
                child_x, child_y = positions[id(child)]
                ax.plot([x, child_x], [y, child_y], color='gray', zorder=1)
                draw(child)
        ax.text(x, y, text, ha='center', va='center', zorder=2,
                bbox=dict(boxstyle='round', facecolor=face_color))

    draw(root)
    # Text alone does not extend the axis limits, so set them explicitly.
    ax.set_xlim(min(xs) - 0.5, max(xs) + 0.5)
    ax.set_ylim(min(ys) - 0.5, max(ys) + 0.5)
    ax.set_axis_off()
    ax.set_title('Decision tree (left branch: condition is true)')
    plt.show()


plot_tree(tree.tree_, list(data.columns)[:-1])
```
这个示例代码包括了基本的数据处理、决策树算法的建模、算法调优、模型评估和可视化。需要注意的是,这个实现并不是最优的,可能会存在一些问题:例如它对每个特征的所有取值穷举搜索分裂阈值,也没有剪枝,在较大的数据集上训练和调优会比较慢。因此它只适合作为一个简单的参考。
阅读全文