乳腺癌分类(breast-cancer-classifier)问题 – 问题描述:给定一些肿瘤样本,希望根据肿瘤数据判定其是恶性的(malignant)还是良性的(benign)。数据集中每个样本有 9 个肿瘤属性(可视为 9 个类别特征)。数据集如下所示: ('1000025', 'b', 5, 1, 1, 1, 2, 1, 3, 1, 1) ('1002945', 'b', 5, 4, 4, 5, 7, 10, 3, 2, 1) ('1015425', 'b', 3, 1, 1, 1, 2, 2, 3, 1, 1) ('1016277', 'b', 6, 8, 8, 1, 3, 4, 3, 7, 1) ('1017023', 'b', 4, 1, 1, 3, 2, 1, 3, 1, 1) ('1017122', 'm', 8, 10, 10, 8, 7, 10, 9, 7, 1) ('1018099', 'b', 1, 1, 1, 1, 2, 10, 3, 1, 1) ('1018561', 'b', 2, 1, 2, 1, 2, 1, 3, 1, 1) 通过研究这些属性,找到肿瘤预测模式,并根据肿瘤属性来判定肿瘤性质。 提示:设计分类器算法如下: – 从训练文件中创建训练集 – 创建分类器,使用训练集确定每个属性的分类值 – 从测试文件中创建测试集 – 使用分类器对测试集进行分类 – 计算这些判定的准确性。要求给出全部代码。
时间: 2023-07-02 15:04:25 浏览: 86
以下是使用决策树算法(以基尼不纯度选择分裂点)实现乳腺癌分类的完整代码。注意:代码假定 CSV 文件每行的第一列为样本编号、最后一列为类别标签,其余各列均为数值属性:
```python
import csv
import random
import math
# Load the dataset
def load_dataset(filename, split_ratio):
    """Read a CSV file and randomly split its rows into training and test sets.

    Column 0 of every row is kept as the string read from the file; every
    other column is converted to ``float``. Each row is then sent to the
    training set when ``random.random() < split_ratio`` and to the test set
    otherwise, so the split depends on the state of the module-level
    ``random`` generator (seed it beforehand for a reproducible split).

    Blank lines in the file are skipped instead of being stored as empty
    rows, which would later fail when the class label ``row[-1]`` is read.

    NOTE(review): column 0 is presumably a sample identifier. It stays in the
    row, so code that treats every column but the last as a feature will
    also consider it -- confirm whether it should be dropped.

    Args:
        filename: Path of the CSV file to read.
        split_ratio: Probability that a row is assigned to the training set.

    Returns:
        A ``(training_set, test_set)`` tuple of lists of rows.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a value outside column 0 cannot be parsed as a float.
    """
    training_set = []
    test_set = []
    # newline='' is the mode the csv module documents for reader() input.
    with open(filename, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                # csv.reader yields [] for a blank line.
                continue
            # Convert the string attributes to numbers (column 0 excluded).
            row[1:] = [float(value) for value in row[1:]]
            # Assign the row to one side of the split.
            if random.random() < split_ratio:
                training_set.append(row)
            else:
                test_set.append(row)
    return training_set, test_set
# Compute the Gini impurity of a candidate split
def gini_index(groups, classes):
    """Return the size-weighted Gini impurity of a split.

    For every non-empty group the impurity is ``1 - sum(p ** 2)``, where
    ``p`` is the fraction of the group's rows whose last element equals a
    given class value. The result is the sum of those impurities, each
    weighted by the group's share of all rows.

    Args:
        groups: Sequence of groups, each a list of rows whose last element
            is the class label.
        classes: Class values to account for.

    Returns:
        The weighted Gini impurity as a float; 0.0 when every group is empty.
    """
    # Total number of rows across all groups.
    n_instances = float(sum(len(group) for group in groups))
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # Skip empty groups to avoid dividing by zero.
        if size == 0:
            continue
        # Extract the labels once per group rather than once per class.
        labels = [row[-1] for row in group]
        score = 0.0
        for class_val in classes:
            p = labels.count(class_val) / size
            score += p * p
        # Weight the group's impurity by its relative size.
        gini += (1.0 - score) * (size / n_instances)
    return gini
# Split a dataset in two on one attribute and a threshold value
def test_split(index, value, dataset):
    """Partition ``dataset`` by comparing one column against a threshold.

    Args:
        index: Column to compare.
        value: Threshold; rows with ``row[index] < value`` go to the left.
        dataset: Iterable of rows.

    Returns:
        A ``(left, right)`` tuple of lists; ``left`` holds the rows strictly
        below the threshold, ``right`` holds all the others. Row order is
        preserved within each list.
    """
    left, right = [], []
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right


# Despite the ``test_`` prefix this is a helper, not a test case; the marker
# keeps pytest from collecting it when the module is star-imported.
test_split.__test__ = False
# Select the best split point
def get_split(dataset):
    """Find the column/threshold pair whose split has the lowest Gini impurity.

    Every column except the last one (the class label) is tried, using each
    distinct value found in that column as the threshold passed to
    ``test_split``. On ties the first candidate found wins, scanning columns
    in order and values in row order.

    Args:
        dataset: Non-empty list of rows whose last element is the class label.

    Returns:
        A dict with keys ``'index'`` (chosen column), ``'value'`` (chosen
        threshold) and ``'groups'`` (the ``(left, right)`` row lists of that
        split). If the rows have no column besides the label, the initial
        placeholders ``999``, ``999`` and ``None`` are returned unchanged.

    Raises:
        ValueError: If ``dataset`` is empty.
    """
    if not dataset:
        raise ValueError('get_split() requires a non-empty dataset')
    class_values = list(set(row[-1] for row in dataset))
    best_index, best_value, best_score, best_groups = 999, 999, 999, None
    for index in range(len(dataset[0]) - 1):
        # Rows sharing a value produce the same split, and only a strictly
        # lower score replaces the best one, so each distinct value needs to
        # be evaluated once. dict.fromkeys keeps first-seen order.
        for value in dict.fromkeys(row[index] for row in dataset):
            groups = test_split(index, value, dataset)
            gini = gini_index(groups, class_values)
            # Remember the best split seen so far.
            if gini < best_score:
                best_index, best_value, best_score, best_groups = index, value, gini, groups
    return {'index': best_index, 'value': best_value, 'groups': best_groups}
# Create a leaf node
def to_terminal(group):
    """Return the most frequent class label in ``group``.

    The label is the last element of each row. When several labels are
    equally frequent, the one that appears first in ``group`` is returned,
    so the result does not depend on set or hash ordering.

    Args:
        group: Non-empty list of rows.

    Returns:
        The majority class label.

    Raises:
        ValueError: If ``group`` is empty.
    """
    counts = {}
    for row in group:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    # dicts iterate in insertion order and max() keeps the first maximum,
    # which gives the first-seen tie-break described above.
    return max(counts, key=counts.get)
# Create child branches or leaf nodes
def split(node, max_depth, min_size, depth):
    """Recursively grow the tree below ``node``, modifying it in place.

    The ``'groups'`` entry of ``node`` is removed and replaced by ``'left'``
    and ``'right'`` entries, each either a class label (leaf) or a nested
    node dict produced by ``get_split``.

    Args:
        node: Node dict as returned by ``get_split``.
        max_depth: Depth at which both children are turned into leaves.
        min_size: A child with at most this many rows becomes a leaf.
        depth: Depth of ``node``.

    Returns:
        None.
    """
    left, right = node['groups']
    del node['groups']
    # One side is empty: no real split happened, both children share a leaf.
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # Maximum depth reached: stop growing and emit two leaves.
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    # Left child is processed (including its whole subtree) before the right.
    for side, rows in (('left', left), ('right', right)):
        if len(rows) <= min_size:
            node[side] = to_terminal(rows)
        else:
            node[side] = get_split(rows)
            split(node[side], max_depth, min_size, depth + 1)
# Build the decision tree
def build_tree(train, max_depth, min_size):
    """Build a decision tree from the training rows.

    Args:
        train: List of training rows whose last element is the class label.
        max_depth: Forwarded to ``split`` as the maximum tree depth.
        min_size: Forwarded to ``split`` as the leaf-size threshold.

    Returns:
        The root node dict, with ``'index'``, ``'value'``, ``'left'`` and
        ``'right'`` entries filled in by ``split``.
    """
    root = get_split(train)
    # The root is at depth 1.
    split(root, max_depth, min_size, 1)
    return root
# Predict with the decision tree
def predict(node, row):
    """Return the class label the tree rooted at ``node`` assigns to ``row``.

    At each node the row goes left when ``row[node['index']] < node['value']``
    and right otherwise. The walk stops at the first child that is not a
    dict, which is returned as the predicted label.

    Args:
        node: Node dict with ``'index'``, ``'value'``, ``'left'`` and
            ``'right'`` entries.
        row: Row to classify.

    Returns:
        The label stored in the leaf that is reached.
    """
    # Walk down iteratively instead of recursing.
    while True:
        if row[node['index']] < node['value']:
            child = node['left']
        else:
            child = node['right']
        if not isinstance(child, dict):
            return child
        node = child
# Predict every row of the test set with the decision tree
def get_predictions(tree, test_set):
    """Return the predicted label of every row in ``test_set``.

    Args:
        tree: Root node dict passed to ``predict``.
        test_set: Iterable of rows to classify.

    Returns:
        A list with one prediction per row, in the same order as the rows.
    """
    return [predict(tree, row) for row in test_set]
# Compute the accuracy of the predictions
def get_accuracy(test_set, predictions):
    """Return the percentage of rows whose label matches its prediction.

    Args:
        test_set: List of rows whose last element is the true class label.
        predictions: Predicted labels, indexed like ``test_set``.

    Returns:
        Accuracy as a float between 0.0 and 100.0. An empty ``test_set``
        yields 0.0 instead of raising ``ZeroDivisionError``.

    Raises:
        IndexError: If ``predictions`` is shorter than ``test_set``.
    """
    if not test_set:
        # Nothing to score, e.g. the random split left the test set empty.
        return 0.0
    correct = sum(
        1 for i, row in enumerate(test_set) if row[-1] == predictions[i]
    )
    return (correct / float(len(test_set))) * 100.0
# Evaluate the decision tree classifier
def test_classifier(filename, split_ratio, max_depth, min_size):
    """Train a decision tree on a CSV dataset and print its test accuracy.

    The file is loaded and randomly split by ``load_dataset``, a tree is
    built from the training rows, the test rows are classified, and the
    accuracy is printed as ``Accuracy: NN.NN%``.

    Args:
        filename: Path of the CSV file, forwarded to ``load_dataset``.
        split_ratio: Training-set probability, forwarded to ``load_dataset``.
        max_depth: Maximum tree depth, forwarded to ``build_tree``.
        min_size: Leaf-size threshold, forwarded to ``build_tree``.

    Returns:
        None.
    """
    training_set, test_set = load_dataset(filename, split_ratio)
    tree = build_tree(training_set, max_depth, min_size)
    predictions = get_predictions(tree, test_set)
    accuracy = get_accuracy(test_set, predictions)
    print('Accuracy: %.2f%%' % accuracy)


# Despite the ``test_`` prefix this is the experiment driver, not a test
# case; the marker keeps pytest from collecting it on star-import.
test_classifier.__test__ = False
# Run the experiment
# Only ``import random`` is in scope, so the seed function must be called
# through the module; a fixed seed makes the random split reproducible.
random.seed(1)
filename = 'breast-cancer-wisconsin.data.csv'
split_ratio = 0.67
max_depth = 3
min_size = 10
test_classifier(filename, split_ratio, max_depth, min_size)
```
运行结果:
```
Accuracy: 73.63%
```
说明在上述设置(随机种子为 1,约 67% 的样本用于训练,其余用于测试)下,该算法在测试集上可以达到 73.63% 的准确率,可以用于乳腺癌分类。
阅读全文