数据挖掘:自己动手编写决策树分类预测代码
时间: 2023-07-05 10:26:47 浏览: 167
好的,以下是一个用纯 Python 实现的简单决策树分类预测示例代码(使用基尼指数选择分割点):
```python
import csv
import math
import random
from collections import Counter
# Load the dataset
def load_csv(file_path):
    """Read a CSV file into a list of rows.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list of rows; each row is the list of string fields of one record.
        Blank lines in the file are skipped.
    """
    # The csv module asks for newline='' so that the reader itself handles
    # line endings (including newlines embedded in quoted fields).
    with open(file_path, 'r', newline='') as f:
        # csv.reader yields [] for a blank line; such rows would break any
        # later row[column] access, so they are dropped here.
        return [row for row in csv.reader(f) if row]
# Convert a string-valued feature column to float
def str_column_to_float(dataset, column):
    """Convert one column of every row from string to float, in place.

    Args:
        dataset: List of rows (mutable sequences) whose ``column`` entry is a
            string.
        column: Index of the column to convert.

    Raises:
        ValueError: If a value cannot be parsed as a float.
    """
    for row in dataset:
        # strip() removes surrounding whitespace before parsing.
        row[column] = float(row[column].strip())
# Convert a class-label column to integer codes
def str_column_to_int(dataset, column):
    """Replace the values of one column with integer codes, in place.

    Codes are assigned in order of first appearance in ``dataset`` (the first
    distinct value becomes 0, the next 1, and so on), so the mapping is the
    same on every run for the same data.

    Args:
        dataset: List of rows (mutable sequences).
        column: Index of the column to encode.

    Returns:
        A dict mapping each original value to its integer code.
    """
    # dict.fromkeys keeps insertion order and drops duplicates; iterating a
    # set instead would make the codes depend on hash order.
    distinct = dict.fromkeys(row[column] for row in dataset)
    lookup = {value: code for code, value in enumerate(distinct)}
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup
# Split the dataset on a feature value
def split_dataset(dataset, feature_index, value):
    """Partition rows by comparing one feature against a threshold.

    Args:
        dataset: Iterable of rows.
        feature_index: Index of the feature to compare.
        value: Threshold to compare against.

    Returns:
        A ``(left, right)`` tuple of lists: ``left`` holds the rows whose
        feature is strictly less than ``value``; ``right`` holds all others.
        Row order within each list follows ``dataset``.
    """
    left, right = [], []
    for row in dataset:
        target = left if row[feature_index] < value else right
        target.append(row)
    return left, right
# Compute the Gini index
def gini_index(groups, classes):
    """Return the size-weighted Gini impurity of a candidate split.

    Args:
        groups: Iterable of groups; each group is a list of rows whose last
            element is the class label.
        classes: Iterable of the class labels to account for.

    Returns:
        The sum over non-empty groups of ``(1 - sum(p_c ** 2))`` weighted by
        the group's share of all rows, as a float. Returns 0.0 when every
        group is empty.
    """
    n_instances = float(sum(len(group) for group in groups))
    gini = 0.0
    for group in groups:
        size = float(len(group))
        if size == 0:
            # An empty group contributes nothing and would divide by zero.
            continue
        # Count the labels once per group rather than rescanning the group
        # for every class.
        label_counts = Counter(row[-1] for row in group)
        score = 0.0
        for class_val in classes:
            p = label_counts[class_val] / size
            score += p * p
        gini += (1.0 - score) * (size / n_instances)
    return gini
# Select the best split point
def get_split(dataset):
    """Find the feature/threshold pair with the lowest Gini index.

    Every distinct value of every feature (all columns except the last, which
    is the class label) is tried as a threshold. Among equally good splits
    the first one encountered is kept.

    Args:
        dataset: Non-empty list of rows; the last element of each row is the
            class label.

    Returns:
        A dict with keys ``'feature_index'``, ``'value'`` and ``'groups'``,
        where ``'groups'`` is the ``(left, right)`` pair produced by
        ``split_dataset`` for the chosen split. If the rows have no feature
        columns, all three values are ``None``.

    Raises:
        ValueError: If ``dataset`` is empty.
    """
    if not dataset:
        raise ValueError('cannot choose a split for an empty dataset')
    class_values = list(set(row[-1] for row in dataset))
    best_feature_index, best_value, best_groups = None, None, None
    best_score = float('inf')
    for feature_index in range(len(dataset[0]) - 1):
        tried = set()
        for row in dataset:
            value = row[feature_index]
            # A repeated threshold yields the same groups and the same score,
            # which can never beat the strict '<' below, so skip it.
            if value in tried:
                continue
            tried.add(value)
            groups = split_dataset(dataset, feature_index, value)
            gini = gini_index(groups, class_values)
            if gini < best_score:
                best_feature_index, best_value = feature_index, value
                best_score, best_groups = gini, groups
    return {
        'feature_index': best_feature_index,
        'value': best_value,
        'groups': best_groups,
    }
# Create a leaf node
def to_terminal(group):
    """Return the most common class label in a group of rows.

    Args:
        group: Non-empty list of rows; the last element of each row is the
            class label.

    Returns:
        The label that occurs most often. When several labels tie, the one
        that appears first in ``group`` is returned.

    Raises:
        ValueError: If ``group`` is empty.
    """
    if not group:
        raise ValueError('cannot create a leaf from an empty group')
    # Counter tallies in a single pass; most_common orders equal counts by
    # first appearance, which keeps the result deterministic.
    label_counts = Counter(row[-1] for row in group)
    return label_counts.most_common(1)[0][0]
def _grow_child(group, max_depth, min_size, depth):
    """Return a leaf label for a small group, otherwise a fully grown subtree."""
    if len(group) <= min_size:
        return to_terminal(group)
    child = get_split(group)
    split(child, max_depth, min_size, depth + 1)
    return child


# Create child splits or leaf nodes
def split(node, max_depth, min_size, depth):
    """Recursively grow the subtree below ``node``, in place.

    The ``'groups'`` entry is removed from ``node`` and replaced by
    ``'left'`` and ``'right'`` entries, each either a class label (leaf) or a
    nested node dict.

    Args:
        node: Dict as returned by ``get_split``; must contain ``'groups'``.
        max_depth: Depth at which both children are forced to be leaves.
        min_size: A group with at most this many rows becomes a leaf instead
            of being split further.
        depth: Depth of ``node`` (the caller passes 1 for the root).
    """
    left, right = node.pop('groups')
    # The split separated nothing: both children share one leaf label.
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # Depth limit reached: stop growing and turn both sides into leaves.
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    node['left'] = _grow_child(left, max_depth, min_size, depth)
    node['right'] = _grow_child(right, max_depth, min_size, depth)
# Build the decision tree
def build_tree(train, max_depth, min_size):
    """Build a decision tree from training rows.

    Args:
        train: List of training rows; the last element of each row is the
            class label.
        max_depth: Maximum depth passed on to ``split``.
        min_size: Group-size threshold passed on to ``split``.

    Returns:
        The root node dict produced by ``get_split`` and grown by ``split``
        (the root is treated as depth 1).
    """
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root
# Classify a row with the decision tree
def predict(node, row):
    """Return the class label the tree rooted at ``node`` assigns to ``row``.

    Args:
        node: Node dict with keys ``'feature_index'``, ``'value'``,
            ``'left'`` and ``'right'``; a child is either a nested node dict
            or a leaf label.
        row: Sequence of feature values indexed by ``'feature_index'``.

    Returns:
        The leaf label reached by following the tree.
    """
    # Walk down iteratively: go left when the feature is strictly below the
    # node's threshold, otherwise right, until a non-dict (leaf) is reached.
    while True:
        branch = 'left' if row[node['feature_index']] < node['value'] else 'right'
        child = node[branch]
        if not isinstance(child, dict):
            return child
        node = child
# Decision tree algorithm
def decision_tree(train, test, max_depth, min_size):
    """Train a tree on ``train`` and predict a label for each row of ``test``.

    Args:
        train: List of training rows; the last element of each row is the
            class label.
        test: Iterable of rows to classify.
        max_depth: Maximum tree depth, passed to ``build_tree``.
        min_size: Group-size threshold, passed to ``build_tree``.

    Returns:
        A list with one predicted label per row of ``test``, in order.
    """
    tree = build_tree(train, max_depth, min_size)
    return [predict(tree, row) for row in test]
def _train_test_split(dataset, test_size=0.2, seed=None):
    """Shuffle a copy of ``dataset`` and return it as ``(train, test)`` lists."""
    rows = list(dataset)
    random.Random(seed).shuffle(rows)
    n_test = int(round(len(rows) * test_size))
    return rows[n_test:], rows[:n_test]


# Exercise the decision tree algorithm
def test_decision_tree():
    """Load ``data.csv``, train a tree on 80% of it and print the accuracy.

    The last column of the file is treated as the class label and every
    other column as a numeric feature.

    Raises:
        ValueError: If the file contains no rows.
    """
    # Load the dataset
    dataset = load_csv('data.csv')
    if not dataset:
        raise ValueError('data.csv contains no rows')
    label_column = len(dataset[0]) - 1
    # Convert the feature columns to float; the label column is left alone so
    # that non-numeric labels can still be encoded below.
    for i in range(label_column):
        str_column_to_float(dataset, i)
    # Convert the class labels to integer codes
    str_column_to_int(dataset, label_column)
    # Split into training and test sets
    train, test = _train_test_split(dataset, test_size=0.2)
    # Train the model and predict
    predictions = decision_tree(train, test, max_depth=5, min_size=10)
    # Compute the accuracy
    correct = sum(
        1 for row, predicted in zip(test, predictions) if row[-1] == predicted
    )
    # Guard against an empty test set (very small files).
    accuracy = correct / float(len(test)) * 100.0 if test else 0.0
    print('准确率:', accuracy)
# Run the demo only when the file is executed as a script.
if __name__ == '__main__':
    test_decision_tree()
```
其中,`data.csv` 是数据集文件名,请根据你自己的数据集修改该路径;代码假定每行的最后一列是类别标签,其余各列是数值特征。调用 `decision_tree()` 时,`max_depth` 表示决策树的最大深度;`min_size` 是停止分裂的样本数阈值——当某个分组的样本数不超过 `min_size` 时,该分组不再继续分裂,而是直接成为叶节点。你可以根据自己的数据集调整这两个参数。
阅读全文