简单数据挖掘ID3决策树分类预测python代码
时间: 2023-07-05 10:15:45 浏览: 87
以下是一个简单的基于ID3算法的决策树分类预测的Python代码:
```python
import pandas as pd
import numpy as np
import math
# Shannon entropy of a labelled dataset.
def calc_entropy(data):
    """Return the base-2 Shannon entropy of the last column of *data*.

    The final column is treated as the class label; entropy is computed
    from the relative frequency of each distinct label value.
    """
    labels = data.iloc[:, -1]
    _, freq = np.unique(labels, return_counts=True)
    p = freq / len(labels)
    return (p * -np.log2(p)).sum()
# Information gain of a candidate split feature.
def calc_info_gain(data, feature):
    """Return the information gain of splitting *data* on *feature*.

    Gain = entropy(parent) - weighted average of the entropies of the
    partitions induced by each distinct value of *feature*.
    """
    parent_entropy = calc_entropy(data)
    values, value_counts = np.unique(data[feature], return_counts=True)
    weights = value_counts / value_counts.sum()
    children_entropy = sum(
        w * calc_entropy(data[data[feature] == v])
        for v, w in zip(values, weights)
    )
    return parent_entropy - children_entropy
# Choose the feature with the highest information gain.
def get_best_split_feature(data):
    """Return the feature column (every column but the last) whose split
    yields the highest information gain, or None if there are no
    feature columns.  Ties keep the first (leftmost) feature.
    """
    best_feature = None
    top_gain = -1
    for candidate in data.columns[:-1]:
        candidate_gain = calc_info_gain(data, candidate)
        if candidate_gain > top_gain:
            top_gain, best_feature = candidate_gain, candidate
    return best_feature
# Recursively build an ID3 decision tree.
def train_decision_tree(data):
    """Build an ID3 decision tree from *data*.

    Parameters
    ----------
    data : pd.DataFrame
        Feature columns followed by a final class-label column.

    Returns
    -------
    A nested dict ``{feature: {value: subtree}}`` for internal nodes,
    or a bare class label for a leaf.
    """
    labels = data.iloc[:, -1]
    # Stop 1: pure node — every sample already has the same class.
    if labels.nunique() == 1:
        return labels.iloc[0]
    # Stop 2: no features left to split on — return the majority class.
    # BUG FIX: the original used np.bincount(...).argmax(), which only
    # works for non-negative integer labels; value_counts handles any
    # hashable label type (e.g. strings) too.
    if len(data.columns) == 1:
        return labels.value_counts().idxmax()
    # Split on the feature with the highest information gain.
    best_feature = get_best_split_feature(data)
    tree = {best_feature: {}}
    for value in np.unique(data[best_feature]):
        # The chosen feature is dropped before recursing so it is never
        # reused further down this branch (standard ID3 behaviour).
        subset = data[data[best_feature] == value].drop(best_feature, axis=1)
        tree[best_feature][value] = train_decision_tree(subset)
    return tree
# Classify a single sample with a trained decision tree.
def predict(data, tree):
    """Classify one sample against an ID3 decision tree.

    Parameters
    ----------
    data : pd.Series
        A single sample mapping feature name -> feature value.
    tree : dict
        A nested ``{feature: {value: subtree}}`` dict as produced by
        ``train_decision_tree``; subtrees bottom out in class labels.

    Returns
    -------
    The predicted class label, or None when the sample has a feature
    value that never appeared at this node during training.
    """
    for feature, branches in tree.items():
        value = data[feature]
        if value not in branches:
            # BUG FIX: the original fell back to
            # np.bincount(data.iloc[:, -1]).argmax(), which crashes here:
            # *data* is a single-sample Series (2-D .iloc is invalid) and
            # carries no label column.  An unseen value is unclassifiable.
            return None
        subtree = branches[value]
        if isinstance(subtree, dict):
            # BUG FIX: Series.drop has no axis=1 (the original
            # data.drop(feature, axis=1) raised ValueError); drop the
            # consumed feature by its index label instead.
            return predict(data.drop(feature), subtree)
        return subtree
# --- Script entry: train on data.csv, then classify hand-made samples ---
data = pd.read_csv('data.csv')

# Fit the ID3 tree on the full dataset.
decision_tree = train_decision_tree(data)

# Classify each row of a small sample frame and report the result.
new_data = pd.DataFrame({'feature1': [1, 1, 0, 0], 'feature2': [1, 0, 1, 0]})
for row_pos, sample in new_data.iterrows():
    prediction = predict(sample, decision_tree)
    print('Sample', row_pos + 1, 'predicted class:', prediction)
```
注:此代码仅作为学习参考,实际应用中需要考虑更多的细节和优化措施。
阅读全文