Task: classify the Iris dataset with the nearest-neighbor method and with decision trees. The nearest-neighbor part must implement two optimizations, edited nearest neighbor and condensed nearest neighbor; the decision-tree part must implement at least ID3 and C4.5. Give the code and the design rationale.
Design rationale:
1. Data preprocessing: load the dataset, split it into training and test sets, and standardize the features.
2. Implement the edited nearest neighbor and condensed nearest neighbor optimizations for k-NN: editing removes mislabeled or boundary-noise training samples to improve accuracy, while condensing keeps only a small consistent subset to cut storage and prediction cost.
3. Implement the ID3 and C4.5 algorithms to classify the dataset; since both split on discrete feature values, the continuous Iris features are binned first (see the discretization sketch after the tree code).
4. Evaluate every classifier by its accuracy on the test set. A scikit-learn baseline for cross-checking the hand-rolled versions follows this list.
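Before the hand-rolled versions, a quick scikit-learn baseline can sanity-check the whole pipeline. This is only a reference point, not part of the assignment; the 3-neighbor and entropy-criterion settings are assumptions:
```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Baseline classifiers on the same 80/20 split used below
iris = load_iris()
X_tr, X_te, y_tr, y_te = train_test_split(iris.data, iris.target,
                                          test_size=0.2, random_state=0)
knn = KNeighborsClassifier(n_neighbors=3).fit(X_tr, y_tr)
dt = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X_tr, y_tr)
print('k-NN baseline accuracy:', knn.score(X_te, y_te))
print('decision-tree baseline accuracy:', dt.score(X_te, y_te))
```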
Code implementation:
1. Data preprocessing
```python
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Load the UCI iris.data file (no header row; the last column is the class label)
data = pd.read_csv('iris.data', header=None)
# Split into features and labels
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
# Split into training and test sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Standardize the features (zero mean, unit variance); fit on the training set only
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
```
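If the iris.data file is not available locally, the same arrays can come straight from scikit-learn; this is an equivalent convenience, not part of the original pipeline:
```python
from sklearn.datasets import load_iris

# Equivalent to reading iris.data: 150 samples, 4 features, 3 classes
iris = load_iris()
X, y = iris.data, iris.target  # then split and standardize exactly as above
```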
2. Implement the edited nearest neighbor and condensed nearest neighbor optimizations
```python
# Majority-vote k-NN used by both optimized variants
def knn_predict(X_train, y_train, X_test, k):
    y_pred = []
    for x in X_test:
        distances = np.linalg.norm(X_train - x, axis=1)
        nearest_labels = y_train[np.argsort(distances)[:k]]
        labels, counts = np.unique(nearest_labels, return_counts=True)
        y_pred.append(labels[np.argmax(counts)])
    return np.array(y_pred)

# Edited nearest neighbor: drop every training sample that is misclassified
# by the majority vote of its k nearest neighbors among the other samples,
# then classify the test set with k-NN on the edited set
def clip_neighbors(X_train, y_train, X_test, k):
    keep = []
    for i in range(len(X_train)):
        distances = np.linalg.norm(X_train - X_train[i], axis=1)
        order = np.argsort(distances)[1:k + 1]  # skip the sample itself
        labels, counts = np.unique(y_train[order], return_counts=True)
        if labels[np.argmax(counts)] == y_train[i]:
            keep.append(i)
    return knn_predict(X_train[keep], y_train[keep], X_test, k)

# Condensed nearest neighbor (Hart's algorithm): greedily keep a subset that
# classifies every training sample correctly with 1-NN, then classify the
# test set with k-NN on the condensed set
def compress_neighbors(X_train, y_train, X_test, k):
    store = [0]  # seed the store with the first training sample
    changed = True
    while changed:
        changed = False
        for i in range(len(X_train)):
            if i in store:
                continue
            distances = np.linalg.norm(X_train[store] - X_train[i], axis=1)
            if y_train[store][np.argmin(distances)] != y_train[i]:
                store.append(i)  # misclassified by the store: add it and rescan
                changed = True
    return knn_predict(X_train[store], y_train[store], X_test, k)
```
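A minimal usage sketch, assuming the standardized arrays from step 1 and k = 3 (an arbitrary choice); step 4 repeats this comparison with the evaluation helpers:
```python
# Quick comparison: plain k-NN vs. the two reduced-set variants (k = 3)
print('plain    :', np.mean(knn_predict(X_train, y_train, X_test, 3) == y_test))
print('edited   :', np.mean(clip_neighbors(X_train, y_train, X_test, 3) == y_test))
print('condensed:', np.mean(compress_neighbors(X_train, y_train, X_test, 3) == y_test))
```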
3. Implement the ID3 and C4.5 algorithms
```python
# Information entropy of a label array
def calc_entropy(y):
    labels = set(y)
    entropy = 0
    for label in labels:
        p_label = y[y == label].size / y.size
        entropy -= p_label * np.log2(p_label)
    return entropy

# Conditional entropy of the labels given one (discrete) feature
def calc_conditional_entropy(X, y, feature):
    values = set(X[:, feature])
    conditional_entropy = 0
    for value in values:
        sub_y = y[X[:, feature] == value]
        p_value = sub_y.size / y.size
        conditional_entropy += p_value * calc_entropy(sub_y)
    return conditional_entropy

# Information gain = entropy of the labels minus the conditional entropy
def calc_information_gain(X, y, feature):
    entropy = calc_entropy(y)
    conditional_entropy = calc_conditional_entropy(X, y, feature)
    information_gain = entropy - conditional_entropy
    return information_gain
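# Worked example with hypothetical toy labels: for y = np.array([0, 0, 1, 1]),
# calc_entropy(y) = -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1.0 bit; a feature that
# splits y perfectly into [0, 0] and [1, 1] has conditional entropy 0,
# so its information gain is the full 1.0 bit.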
# ID3: split on the feature with the largest information gain
def id3(X, y, features):
    if len(set(y)) == 1:      # pure node: return the class label
        return y[0]
    if len(features) == 0:    # no features left: return the majority class
        return max(set(y), key=list(y).count)
    information_gains = [calc_information_gain(X, y, f) for f in features]
    best_feature_index = np.argmax(information_gains)
    best_feature = features[best_feature_index]
    tree = {best_feature: {}}
    for value in set(X[:, best_feature]):
        mask = X[:, best_feature] == value
        sub_tree = id3(X[mask], y[mask], np.delete(features, best_feature_index))
        tree[best_feature][value] = sub_tree
    return tree

# C4.5: split on the feature with the largest gain ratio
# (information gain divided by the feature's split information)
def c45(X, y, features):
    if len(set(y)) == 1:
        return y[0]
    if len(features) == 0:
        return max(set(y), key=list(y).count)
    information_gain_ratios = []
    for feature in features:
        information_gain = calc_information_gain(X, y, feature)
        split_info = calc_entropy(X[:, feature])  # entropy of the feature's value distribution
        if split_info == 0:
            information_gain_ratios.append(information_gain)
        else:
            information_gain_ratios.append(information_gain / split_info)
    best_feature_index = np.argmax(information_gain_ratios)
    best_feature = features[best_feature_index]
    tree = {best_feature: {}}
    for value in set(X[:, best_feature]):
        mask = X[:, best_feature] == value
        sub_tree = c45(X[mask], y[mask], np.delete(features, best_feature_index))
        tree[best_feature][value] = sub_tree
    return tree
```
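As written, ID3 and C4.5 branch on exact feature values, but the standardized Iris features are continuous: every training value is essentially unique, so the tree would memorize the training set and meet unseen values at test time. A minimal equal-width binning sketch fixes this; the helper names (make_bins, discretize) and the three-bin choice are assumptions, not part of the original answer:
```python
# Equal-width bin edges per feature, computed on the training data only
def make_bins(X, n_bins=3):
    return [np.linspace(X[:, j].min(), X[:, j].max(), n_bins + 1)[1:-1]
            for j in range(X.shape[1])]

# Map each continuous value to its bin index (0 .. n_bins-1)
def discretize(X, edges):
    return np.stack([np.digitize(X[:, j], edges[j])
                     for j in range(X.shape[1])], axis=1)

edges = make_bins(X_train)
X_train_d = discretize(X_train, edges)  # discrete inputs for id3/c45
X_test_d = discretize(X_test, edges)
```
The evaluation code below builds the trees on X_train_d and walks test samples through X_test_d.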
4. Evaluate the classification results
```python
# Accuracy of k-NN predictions
def evaluate_knn(y_pred, y_test):
    return np.mean(y_pred == y_test)

# Walk each test sample down the tree and compare the leaf to the true label
def evaluate_tree(tree, X_test, y_test):
    y_pred = []
    for i in range(len(X_test)):
        sub_tree = tree
        while isinstance(sub_tree, dict):
            feature = list(sub_tree.keys())[0]
            value = X_test[i, feature]
            branches = sub_tree[feature]
            if value not in branches:
                # unseen feature value: fall back to the closest existing branch
                value = min(branches, key=lambda v: abs(v - value))
            sub_tree = branches[value]
        y_pred.append(sub_tree)
    return np.mean(np.array(y_pred) == y_test)
# Edited nearest neighbor k-NN
y_pred = clip_neighbors(X_train, y_train, X_test, 3)
accuracy = evaluate_knn(y_pred, y_test)
print('Edited nearest neighbor k-NN accuracy:', accuracy)
# Condensed nearest neighbor k-NN
y_pred = compress_neighbors(X_train, y_train, X_test, 3)
accuracy = evaluate_knn(y_pred, y_test)
print('Condensed nearest neighbor k-NN accuracy:', accuracy)
# ID3 on the discretized features
tree = id3(X_train_d, y_train, np.arange(X_train_d.shape[1]))
accuracy = evaluate_tree(tree, X_test_d, y_test)
print('ID3 accuracy:', accuracy)
# C4.5 on the discretized features
tree = c45(X_train_d, y_train, np.arange(X_train_d.shape[1]))
accuracy = evaluate_tree(tree, X_test_d, y_test)
print('C4.5 accuracy:', accuracy)
```
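Optionally, a small sweep over k (the particular values are arbitrary) shows how sensitive the two reduced-set variants are to the neighborhood size:
```python
# Accuracy of both k-NN variants for several neighborhood sizes
for k in (1, 3, 5, 7):
    acc_edit = evaluate_knn(clip_neighbors(X_train, y_train, X_test, k), y_test)
    acc_cond = evaluate_knn(compress_neighbors(X_train, y_train, X_test, k), y_test)
    print(f'k={k}: edited={acc_edit:.3f}, condensed={acc_cond:.3f}')
```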