Implementing hold-out validation of a CART decision tree in Python without ML libraries, using the Titanic dataset as an example
Below is an example that implements hold-out validation of a CART decision tree from scratch in Python (using only pandas and NumPy), with the Titanic dataset as the example:
```python
import pandas as pd
import numpy as np
# Load the Titanic dataset
data = pd.read_csv('titanic.csv')
# Preprocessing: impute missing ages with the mean, drop rows with a missing
# port of embarkation, drop non-predictive columns, and encode categoricals
data['Age'] = data['Age'].fillna(data['Age'].mean())
data.dropna(subset=['Embarked'], inplace=True)
data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})
data['Embarked'] = data['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})
# Hold-out split: 80% of the rows for training, the remaining 20% for testing
train_data = data.sample(frac=0.8, random_state=1)
test_data = data.drop(train_data.index)
# Decision tree node
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature      # index of the feature used for the split
        self.threshold = threshold  # split threshold
        self.left = left            # left subtree (feature value < threshold)
        self.right = right          # right subtree (feature value >= threshold)
        self.value = value          # predicted class stored at a leaf
# CART decision tree
class CARTDecisionTree:
    def __init__(self, max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.max_depth = max_depth                  # maximum tree depth
        self.min_samples_split = min_samples_split  # minimum samples required to split an internal node
        self.min_samples_leaf = min_samples_leaf    # minimum samples required in a leaf
        self.root = None                            # root node of the fitted tree
    # Gini impurity: 1 - sum_k p_k^2, computed here as sum_k p_k * (1 - p_k)
    def _gini(self, y):
        n = y.shape[0]
        labels = np.unique(y)
        gini = 0
        for label in labels:
            p = np.sum(y == label) / n
            gini += p * (1 - p)
        return gini
    # Gini gain of splitting the feature column X at the given threshold
    def _gini_gain(self, X, y, threshold):
        n = y.shape[0]
        y_left = y[X < threshold]
        y_right = y[X >= threshold]
        gini_gain = self._gini(y) - y_left.shape[0] / n * self._gini(y_left) - y_right.shape[0] / n * self._gini(y_right)
        return gini_gain
    # Exhaustively search all features and candidate thresholds for the best split
    def _best_split(self, X, y):
        best_feature, best_threshold, best_gain = None, None, -1
        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                gain = self._gini_gain(X[:, feature], y, threshold)
                if gain > best_gain:
                    best_feature, best_threshold, best_gain = feature, threshold, gain
        return best_feature, best_threshold, best_gain
    # Majority class among the labels that reach a leaf
    def _leaf_value(self, y):
        return np.bincount(y.astype(int)).argmax()
    # Recursively grow the tree
    def _build_tree(self, X, y, depth):
        n_samples, n_features = X.shape
        # Stop if the node is too small to split or the maximum depth is reached
        if n_samples < self.min_samples_split or depth == self.max_depth:
            return Node(value=self._leaf_value(y))
        # Otherwise search for the best split
        feature, threshold, gain = self._best_split(X, y)
        # Stop if no split reduces the Gini impurity
        if gain <= 0:
            return Node(value=self._leaf_value(y))
        mask = X[:, feature] < threshold
        X_left, y_left = X[mask], y[mask]
        X_right, y_right = X[~mask], y[~mask]
        # Stop if the split would create a child smaller than min_samples_leaf
        if y_left.shape[0] < self.min_samples_leaf or y_right.shape[0] < self.min_samples_leaf:
            return Node(value=self._leaf_value(y))
        left = self._build_tree(X_left, y_left, depth + 1)
        right = self._build_tree(X_right, y_right, depth + 1)
        return Node(feature=feature, threshold=threshold, left=left, right=right)
    # Fit the tree on the training data
    def fit(self, X, y):
        self.root = self._build_tree(X, y, 0)
    # Predict a single sample by walking from the root down to a leaf
    def _predict_sample(self, x):
        node = self.root
        while node.left is not None:
            if x[node.feature] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value
    # Predict an array of samples
    def predict(self, X):
        return np.array([self._predict_sample(x) for x in X])
# Separate the features and labels of the training and test sets
train_X = train_data.drop('Survived', axis=1).values
train_y = train_data['Survived'].values
test_X = test_data.drop('Survived', axis=1).values
test_y = test_data['Survived'].values
# Build and train the CART decision tree
tree = CARTDecisionTree(max_depth=5, min_samples_split=10, min_samples_leaf=5)
tree.fit(train_X, train_y)
# Predict on the held-out test set and report accuracy
y_pred = tree.predict(test_X)
accuracy = np.mean(y_pred == test_y)
print('Accuracy:', accuracy)
```
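A quick way to sanity-check the `_gini` method: a balanced binary label vector should give the maximum binary impurity of 0.5, since 1 - (0.5^2 + 0.5^2) = 0.5. A minimal check using the class above:

```python
# Balanced binary labels: Gini = 1 - (0.5^2 + 0.5^2) = 0.5
t = CARTDecisionTree()
print(t._gini(np.array([0, 0, 1, 1])))  # expected: 0.5
```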
The code above first loads the Titanic dataset and preprocesses it, then uses pandas' `sample` method to carry out the hold-out split (80% training, 20% test). The `Node` class represents a tree node, and `CARTDecisionTree` implements the CART algorithm: computing the Gini impurity (Gini(D) = 1 - sum_k p_k^2, computed equivalently as sum_k p_k(1 - p_k)), searching for the best split, recursively building the tree, fitting, and predicting single or multiple samples. Each leaf stores the majority class of the training samples that reach it. Finally, a `CARTDecisionTree` is trained on the training set, and the prediction accuracy is measured on the held-out test set.
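A single hold-out split gives a noisy estimate of generalization accuracy, since the result depends on which rows happen to land in the test set. A common refinement is repeated hold-out: average the accuracy over several random splits. A minimal sketch, reusing the `data` DataFrame and `CARTDecisionTree` class defined above:

```python
# Repeated hold-out: average test accuracy over several random 80/20 splits
accuracies = []
for seed in range(5):
    train = data.sample(frac=0.8, random_state=seed)
    test = data.drop(train.index)
    model = CARTDecisionTree(max_depth=5, min_samples_split=10, min_samples_leaf=5)
    model.fit(train.drop('Survived', axis=1).values, train['Survived'].values)
    pred = model.predict(test.drop('Survived', axis=1).values)
    accuracies.append(np.mean(pred == test['Survived'].values))
print('Mean accuracy over 5 splits:', np.mean(accuracies))
```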
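If scikit-learn happens to be available, its `DecisionTreeClassifier` can serve as an optional cross-check on the from-scratch implementation. The two accuracies will not match exactly, because the split search and tie-breaking details differ, but they should land in a similar range:

```python
# Optional sanity check against scikit-learn (not part of the from-scratch task)
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(criterion='gini', max_depth=5,
                             min_samples_split=10, min_samples_leaf=5)
clf.fit(train_X, train_y)
print('sklearn accuracy:', np.mean(clf.predict(test_X) == test_y))
```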