1. 写一个使用 sklearn 调用决策树模块的程序;2. 自编程实现 ID3、CART 算法,对贷款申请样本数据表进行预测分类
时间: 2024-06-10 18:06:17 浏览: 199
回答:
1. 使用sklearn调用决策树模块的代码(示例使用sklearn自带的鸢尾花数据集,而非贷款申请数据):
```python
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
# Load the sample dataset (the iris data bundled with scikit-learn)
iris = datasets.load_iris()
X, y = iris.data, iris.target
# Hold out 20% of the samples as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Build the decision tree classifier and train it (fit returns the estimator)
clf = DecisionTreeClassifier().fit(X_train, y_train)
# Predict the labels of the test set and show them
y_pred = clf.predict(X_test)
print("预测结果:", y_pred)
```
2. 自编程实现ID3,CART算法对贷款申请样本数据表进行预测分类,可以参考以下代码:
```python
import pandas as pd
import numpy as np
import math
# Training data: ten samples with four categorical feature columns
# (income, age, student, credit_rating) and the target label in the
# last column, 'class'.
dataset = pd.DataFrame({
'income': ['high', 'high', 'medium', 'low', 'low', 'low', 'medium', 'high', 'medium', 'low'],
'age': ['young', 'young', 'young', 'middle_aged', 'senior', 'senior', 'middle_aged', 'middle_aged', 'senior', 'senior'],
'student': ['no', 'no', 'no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'yes'],
'credit_rating': ['fair', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'fair', 'fair'],
'class': ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes']
})
# Compute the information entropy of the labels
def calc_entropy(data):
    """Return the base-2 Shannon entropy of the label column of ``data``.

    The label is read from the last column of the DataFrame; each distinct
    label contributes ``-p * log2(p)`` where ``p`` is its relative frequency.
    """
    labels = data.iloc[:, -1]
    probs = labels.value_counts() / data.shape[0]
    return (-probs * np.log2(probs)).sum()
# Compute the conditional entropy
def calc_conditional_entropy(data, feature):
    """Return the conditional entropy H(D|A) of the labels given ``feature``.

    The data is partitioned by each distinct value of ``feature``; the
    entropy of every partition is weighted by the partition's share of rows.
    """
    total = data.shape[0]
    cond_entropy = 0
    for val in set(data[feature]):
        subset = data[data[feature] == val]
        weight = subset.shape[0] / total
        cond_entropy += weight * calc_entropy(subset)
    return cond_entropy
# Compute the information gain
def calc_info_gain(data, feature):
    """Return the information gain g(D, A) = H(D) - H(D|A) for ``feature``."""
    prior = calc_entropy(data)
    posterior = calc_conditional_entropy(data, feature)
    return prior - posterior
# Compute the information gain ratio
def calc_info_gain_ratio(data, feature):
    """Return the information gain ratio g_R(D, A) = g(D, A) / H_A(D).

    ``H_A(D)`` is the split information: the base-2 entropy of the
    distribution of ``feature``'s own values (not the entropy of the class
    labels, which would make the ratio a mere rescaling of the gain).

    Args:
        data: DataFrame whose last column holds the class label.
        feature: Name of the feature column to evaluate.

    Returns:
        The gain ratio, or 0.0 when the split information is zero (the
        feature takes a single value, or ``data`` is empty), so callers
        never receive NaN from a 0/0 division.
    """
    info_gain = calc_info_gain(data, feature)
    proportions = data[feature].value_counts() / data.shape[0]
    split_info = (-proportions * np.log2(proportions)).sum()
    if split_info == 0:
        # A constant feature cannot split the data; its gain is 0 as well.
        return 0.0
    return info_gain / split_info
# Decision tree node
class TreeNode:
    """A node of a decision tree over categorical features.

    Attributes:
        name: Free-form tag; the builders below use 'node' and 'leaf'.
        feature: Name of the feature tested at this node (None on leaves).
        value: Feature value on the branch leading from the parent to this
            node (None for the root and for the "other values" branch of a
            binary split).
        children: Child nodes.
        is_leaf: Whether this node is a leaf.
        label: Class predicted at this node. Kept separate from ``value`` so
            that tagging a child with its branch value never destroys the
            class label of a leaf.
        split_value: For a binary (CART) node, the feature value that routes
            a sample to ``children[0]``; every other value goes to
            ``children[1]``. None marks a multiway (ID3) node.
    """

    def __init__(self, name: str = None, feature: str = None, value=None,
                 is_leaf=False, label=None, split_value=None):
        self.name = name  # node name
        self.feature = feature  # feature tested at this node
        self.value = value  # branch value leading to this node
        self.children = []  # child nodes
        self.is_leaf = is_leaf  # whether this is a leaf node
        self.label = label  # predicted class at this node
        self.split_value = split_value  # binary split value (CART only)


def _majority_label(labels):
    """Return the most frequent label; ties resolve to the smallest label."""
    return labels.mode().iloc[0]


def _gini(labels):
    """Return the Gini impurity 1 - sum(p_k ** 2) of a non-empty label Series."""
    probs = labels.value_counts() / len(labels)
    return 1.0 - float((probs ** 2).sum())


def _sample_row(sample):
    """Reduce a one-row DataFrame to that row; pass mappings/Series through."""
    if isinstance(sample, pd.DataFrame):
        if len(sample) != 1:
            raise ValueError("predict expects exactly one sample row")
        return sample.iloc[0]
    return sample


# ID3 algorithm
def ID3(data, epsilon=0.1):
    """Build a multiway decision tree with the ID3 algorithm.

    At each node the feature with the largest information gain is chosen
    (ties go to the earliest column, so the result is deterministic). The
    node becomes a leaf labelled with the majority class when no feature
    has positive gain or the best gain is below ``epsilon``.

    Args:
        data: DataFrame of categorical features with the label in a column
            named 'class'. The gain helper reads the label from the last
            column, so 'class' must be the last column.
        epsilon: Minimum information gain required to split a node.

    Returns:
        The root TreeNode of the tree.

    Raises:
        ValueError: If ``data`` has no rows.
    """
    if data.empty:
        raise ValueError("cannot build a decision tree from an empty dataset")
    majority = _majority_label(data['class'])
    best_feature = None
    best_gain = 0
    for feature in (col for col in data.columns if col != 'class'):
        gain = calc_info_gain(data, feature)
        if gain > best_gain:
            best_gain = gain
            best_feature = feature
    if best_feature is None or best_gain < epsilon:
        return TreeNode(name='leaf', is_leaf=True, label=majority)
    node = TreeNode(name='node', feature=best_feature, label=majority)
    for feature_value in data[best_feature].unique():
        # The chosen feature is constant inside each subset, so drop it.
        subset = data[data[best_feature] == feature_value].drop(columns=[best_feature])
        if subset.empty:
            # Happens only for values that never compare equal (e.g. NaN).
            continue
        child = ID3(subset, epsilon)
        child.value = feature_value  # branch tag; child.label is untouched
        node.children.append(child)
    return node


# CART algorithm
def CART(data, epsilon=0.1):
    """Build a binary classification tree with the CART algorithm.

    Every candidate split has the form ``feature == value`` versus
    ``feature != value``; the split with the smallest weighted Gini index
    is chosen. The node becomes a leaf labelled with the majority class
    when no split lowers the Gini index, or lowers it by less than
    ``epsilon``.

    Args:
        data: DataFrame of categorical features with the label in a column
            named 'class'.
        epsilon: Minimum decrease of the Gini index required to split.

    Returns:
        The root TreeNode; internal nodes carry ``split_value`` and exactly
        two children (matching branch first).

    Raises:
        ValueError: If ``data`` has no rows.
    """
    if data.empty:
        raise ValueError("cannot build a decision tree from an empty dataset")
    labels = data['class']
    majority = _majority_label(labels)
    node_gini = _gini(labels)
    total = len(data)
    best_split = None  # (feature, value, boolean mask of the matching rows)
    best_gini = node_gini
    for feature in (col for col in data.columns if col != 'class'):
        column = data[feature]
        for candidate in column.unique():
            mask = column == candidate
            n_match = int(mask.sum())
            if n_match == 0 or n_match == total:
                # Not a real partition (constant feature or NaN value).
                continue
            weighted = (n_match * _gini(labels[mask])
                        + (total - n_match) * _gini(labels[~mask])) / total
            if weighted < best_gini:
                best_gini = weighted
                best_split = (feature, candidate, mask)
    decrease = node_gini - best_gini
    if best_split is None or decrease <= 0 or decrease < epsilon:
        return TreeNode(name='leaf', is_leaf=True, label=majority)
    feature, split_value, mask = best_split
    node = TreeNode(name='node', feature=feature, label=majority,
                    split_value=split_value)
    matching = CART(data[mask], epsilon)
    matching.value = split_value
    others = CART(data[~mask], epsilon)  # value stays None: "any other value"
    node.children = [matching, others]
    return node


# Prediction
def predict(tree: TreeNode, sample):
    """Classify one sample by walking ``tree`` from the root to a leaf.

    Args:
        tree: Root node produced by ``ID3`` or ``CART`` (or built by hand).
        sample: A mapping or pandas Series from feature name to value, or a
            DataFrame containing exactly one row.

    Returns:
        The class label of the leaf that is reached. If a multiway node has
        no branch for the sample's value, the label stored on that node
        (the majority class seen there during training) is returned instead
        of looping forever. Leaves without ``label`` fall back to ``value``
        for trees built with the older single-attribute layout.

    Raises:
        ValueError: If ``sample`` is a DataFrame without exactly one row.
        KeyError: If ``sample`` lacks a feature the tree tests.
    """
    row = _sample_row(sample)
    node = tree
    while not node.is_leaf:
        observed = row[node.feature]
        if node.split_value is not None:
            # Binary node: first child matches the split value, second is the rest.
            node = node.children[0] if observed == node.split_value else node.children[1]
            continue
        for child in node.children:
            if child.value == observed:
                node = child
                break
        else:
            # Feature value never seen at this node during training.
            return node.label
    return node.label if node.label is not None else node.value
# Build both decision trees from the training data
tree_id3, tree_cart = ID3(dataset), CART(dataset)
# The single sample to classify, given as a one-row DataFrame
sample_data = pd.DataFrame([{
    'income': 'medium',
    'age': 'young',
    'student': 'no',
    'credit_rating': 'fair',
}])
print("ID3预测结果:", predict(tree_id3, sample_data))
print("CART预测结果:", predict(tree_cart, sample_data))
```
注:以上代码仅为供参考的示例代码,只能对与训练数据结构相同(特征列相同、取值为离散类别)的样本进行预测,实际使用时需要根据具体数据进行修改和优化。
阅读全文