利用近邻法和决策树算法完成对Iris数据集的分类任务,其中近邻法需要实现剪辑近邻和压缩近邻两种优化。决策树算法需要至少实现ID3和C4.5两种。算上之前课程里已经要求大家做过的SVM算法,一共是至少五种算法。 实验过程要求 scikit-learn等包辅助实现 ① 代码能够输出测试集的分类结果,采用F1作为评价指标 ② 理解算法结构,能够说明各个参数的作用 ③ 能够简单预测关键参数调整以后的变化趋势 ④ 能够根据不同要求修改模型结构并分析结果 该实验的实现过程
时间: 2024-01-21 12:19:49 浏览: 25
1. 数据预处理
首先,我们需要加载Iris数据集并进行数据预处理。这包括将标签转换为数字、将数据集分成训练集和测试集,以及标准化特征值。
```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
iris = load_iris()
X = iris.data
y = iris.target
# 标签转换为数字
y[y==0] = -1
# 将数据集分成训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# 标准化特征值
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
```
2. 剪辑近邻算法
接下来,我们实现剪辑近邻算法。在这种算法中,我们只考虑最接近的K个邻居,并删除与测试样本距离最远的邻居。
```python
import numpy as np
class ClipKNN:
def __init__(self, k=3):
self.k = k
def fit(self, X_train, y_train):
self.X_train = X_train
self.y_train = y_train
def predict(self, X_test):
y_pred = np.zeros(X_test.shape[0])
for i, x_test in enumerate(X_test):
# 计算测试样本和所有训练样本的距离
distances = np.sqrt(np.sum((self.X_train - x_test)**2, axis=1))
# 找到最近的k个邻居
k_nearest_neighbors = self.y_train[np.argsort(distances)[:self.k]]
# 计算每个类别的票数
votes = np.bincount(k_nearest_neighbors.astype('int'))
# 预测测试样本的类别
y_pred[i] = np.argmax(votes)
# 剪辑最远的邻居
if i > self.k - 1:
farthest_neighbor_index = np.argmax(distances[:i])
distances[farthest_neighbor_index] = 0
k_nearest_neighbors = self.y_train[np.argsort(distances)[:self.k]]
votes = np.bincount(k_nearest_neighbors.astype('int'))
y_pred[i] = np.argmax(votes)
return y_pred
```
3. 压缩近邻算法
接下来,我们实现压缩近邻算法。在这种算法中,我们将距离测试样本较远的邻居舍弃,并将距离测试样本较近的邻居压缩为一个虚拟样本。
```python
class CompressedKNN:
def __init__(self, k=3, alpha=0.5):
self.k = k
self.alpha = alpha
def fit(self, X_train, y_train):
self.X_train = X_train
self.y_train = y_train
def predict(self, X_test):
y_pred = np.zeros(X_test.shape[0])
for i, x_test in enumerate(X_test):
# 计算测试样本和所有训练样本的距离
distances = np.sqrt(np.sum((self.X_train - x_test)**2, axis=1))
# 压缩近邻
sorted_indices = np.argsort(distances)
compressed_indices = sorted_indices[:self.k]
compressed_distances = distances[compressed_indices]
compressed_labels = self.y_train[compressed_indices]
for j in range(self.k, len(distances)):
if distances[sorted_indices[j]] / compressed_distances[-1] <= self.alpha:
compressed_distances = np.append(compressed_distances, distances[sorted_indices[j]])
compressed_labels = np.append(compressed_labels, self.y_train[sorted_indices[j]])
k_nearest_neighbors = compressed_labels
# 计算每个类别的票数
votes = np.bincount(k_nearest_neighbors.astype('int'))
# 预测测试样本的类别
y_pred[i] = np.argmax(votes)
return y_pred
```
4. ID3算法
接下来,我们实现ID3算法。在这种算法中,我们使用信息增益来选择最佳特征,并递归地构建决策树。
```python
from collections import Counter
import math
class Node:
def __init__(self, feature_idx=None, threshold=None, label=None, left=None, right=None):
self.feature_idx = feature_idx
self.threshold = threshold
self.label = label
self.left = left
self.right = right
class ID3:
def __init__(self, max_depth=None, min_samples_split=2):
self.max_depth = max_depth
self.min_samples_split = min_samples_split
def fit(self, X_train, y_train):
self.num_classes = len(np.unique(y_train))
self.root = self.build_tree(X_train, y_train)
def build_tree(self, X, y, depth=0):
num_samples, num_features = X.shape
num_labels = len(np.unique(y))
# 停止条件
if depth == self.max_depth or num_labels == 1 or num_samples < self.min_samples_split:
label = Counter(y).most_common(1)[0][0]
return Node(label=label)
# 选择最佳特征
best_feature_idx, best_threshold = self.choose_best_feature(X, y)
# 递归构建决策树
left_indices = X[:, best_feature_idx] < best_threshold
right_indices = X[:, best_feature_idx] >= best_threshold
left = self.build_tree(X[left_indices], y[left_indices], depth+1)
right = self.build_tree(X[right_indices], y[right_indices], depth+1)
return Node(best_feature_idx, best_threshold, left=left, right=right)
def choose_best_feature(self, X, y):
best_feature_idx, best_threshold, max_info_gain = None, None, -float('inf')
num_samples, num_features = X.shape
for feature_idx in range(num_features):
thresholds = np.unique(X[:, feature_idx])
for threshold in thresholds:
# 计算信息增益
left_indices = X[:, feature_idx] < threshold
right_indices = X[:, feature_idx] >= threshold
if sum(left_indices) == 0 or sum(right_indices) == 0:
continue
info_gain = self.calculate_info_gain(y, left_indices, right_indices)
# 更新最佳特征和阈值
if info_gain > max_info_gain:
best_feature_idx = feature_idx
best_threshold = threshold
max_info_gain = info_gain
return best_feature_idx, best_threshold
def calculate_info_gain(self, y, left_indices, right_indices):
num_samples = len(y)
left_weight = sum(left_indices) / num_samples
right_weight = sum(right_indices) / num_samples
left_entropy = self.calculate_entropy(y[left_indices])
right_entropy = self.calculate_entropy(y[right_indices])
info_gain = self.calculate_entropy(y) - left_weight * left_entropy - right_weight * right_entropy
return info_gain
def calculate_entropy(self, y):
num_samples = len(y)
_, counts = np.unique(y, return_counts=True)
probabilities = counts / num_samples
entropy = sum(probabilities * -np.log2(probabilities))
return entropy
def predict(self, X_test):
y_pred = np.zeros(X_test.shape[0])
for i, x_test in enumerate(X_test):
node = self.root
while node.label is None:
if x_test[node.feature_idx] < node.threshold:
node = node.left
else:
node = node.right
y_pred[i] = node.label
return y_pred
```
5. C4.5算法
接下来,我们实现C4.5算法。在这种算法中,我们使用信息增益比来选择最佳特征,并递归地构建决策树。
```python
class C45:
def __init__(self, max_depth=None, min_samples_split=2):
self.max_depth = max_depth
self.min_samples_split = min_samples_split
def fit(self, X_train, y_train):
self.num_classes = len(np.unique(y_train))
self.root = self.build_tree(X_train, y_train)
def build_tree(self, X, y, depth=0):
num_samples, num_features = X.shape
num_labels = len(np.unique(y))
# 停止条件
if depth == self.max_depth or num_labels == 1 or num_samples < self.min_samples_split:
label = Counter(y).most_common(1)[0][0]
return Node(label=label)
# 选择最佳特征
best_feature_idx, best_threshold = self.choose_best_feature(X, y)
# 递归构建决策树
left_indices = X[:, best_feature_idx] < best_threshold
right_indices = X[:, best_feature_idx] >= best_threshold
left = self.build_tree(X[left_indices], y[left_indices], depth+1)
right = self.build_tree(X[right_indices], y[right_indices], depth+1)
return Node(best_feature_idx, best_threshold, left=left, right=right)
def choose_best_feature(self, X, y):
best_feature_idx, best_threshold, max_info_gain_ratio = None, None, -float('inf')
num_samples, num_features = X.shape
for feature_idx in range(num_features):
thresholds = np.unique(X[:, feature_idx])
for threshold in thresholds:
# 计算信息增益比
left_indices = X[:, feature_idx] < threshold
right_indices = X[:, feature_idx] >= threshold
if sum(left_indices) == 0 or sum(right_indices) == 0:
continue
info_gain = self.calculate_info_gain(y, left_indices, right_indices)
split_information = self.calculate_split_information(y, left_indices, right_indices)
info_gain_ratio = info_gain / split_information
# 更新最佳特征和阈值
if info_gain_ratio > max_info_gain_ratio:
best_feature_idx = feature_idx
best_threshold = threshold
max_info_gain_ratio = info_gain_ratio
return best_feature_idx, best_threshold
def calculate_info_gain(self, y, left_indices, right_indices):
num_samples = len(y)
left_weight = sum(left_indices) / num_samples
right_weight = sum(right_indices) / num_samples
left_entropy = self.calculate_entropy(y[left_indices])
right_entropy = self.calculate_entropy(y[right_indices])
info_gain = self.calculate_entropy(y) - left_weight * left_entropy - right_weight * right_entropy
return info_gain
def calculate_split_information(self, y, left_indices, right_indices):
num_samples = len(y)
left_weight = sum(left_indices) / num_samples
right_weight = sum(right_indices) / num_samples
split_information = -left_weight * np.log2(left_weight) - right_weight * np.log2(right_weight)
return split_information
def calculate_entropy(self, y):
num_samples = len(y)
_, counts = np.unique(y, return_counts=True)
probabilities = counts / num_samples
entropy = sum(probabilities * -np.log2(probabilities))
return entropy
def predict(self, X_test):
y_pred = np.zeros(X_test.shape[0])
for i, x_test in enumerate(X_test):
node = self.root
while node.label is None:
if x_test[node.feature_idx] < node.threshold:
node = node.left
else:
node = node.right
y_pred[i] = node.label
return y_pred
```
6. F1评价指标
在实现以上算法后,我们需要定义F1评价指标来衡量模型的性能。
```python
from sklearn.metrics import f1_score
def evaluate(y_true, y_pred):
return f1_score(y_true, y_pred)
```
7. 测试模型
最后,我们可以测试我们实现的算法并评估它们的性能。
```python
clip_knn = ClipKNN(k=3)
clip_knn.fit(X_train, y_train)
clip_knn_y_pred = clip_knn.predict(X_test)
print('ClipKNN F1 score:', evaluate(y_test, clip_knn_y_pred))
compressed_knn = CompressedKNN(k=3, alpha=0.5)
compressed_knn.fit(X_train, y_train)
compressed_knn_y_pred = compressed_knn.predict(X_test)
print('CompressedKNN F1 score:', evaluate(y_test, compressed_knn_y_pred))
id3 = ID3(max_depth=5, min_samples_split=2)
id3.fit(X_train, y_train)
id3_y_pred = id3.predict(X_test)
print('ID3 F1 score:', evaluate(y_test, id3_y_pred))
c45 = C45(max_depth=5, min_samples_split=2)
c45.fit(X_train, y_train)
c45_y_pred = c45.predict(X_test)
print('C45 F1 score:', evaluate(y_test, c45_y_pred))
from sklearn.svm import SVC
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
svm_y_pred = svm.predict(X_test)
print('SVM F1 score:', evaluate(y_test, svm_y_pred))
```
我们可以根据F1分数比较不同算法的性能,并根据需要调整算法的参数和结构。