利用近邻法和决策树算法完成对Iris数据集的分类任务,其中近邻法需要实现剪辑近邻和压缩近邻两种优化。决策树算法需要至少实现ID3和C4.5两种。加上SVM算法,一共是至少五种算法。给出代码,所有算法写在同一个文件里方便检查对比, 实验过程要求: 1.scikit-learn等包辅助实现 ① 代码能够输出测试集的分类结果,采用F1作为评价指标 ② 理解算法结构,能够说明各个参数的作用 ③ 能够简单预测关键参数调整以后的变化趋势 ④ 能够根据不同要求修改模型结构并分析结果

以下是Python代码实现,包含五种算法的分类任务,并输出测试集的分类结果和F1值:

```python
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import f1_score
from sklearn.svm import SVC

# Smallest split score that still counts as an improvement; guards against
# endless recursion on zero-gain splits caused by floating-point noise.
_MIN_SCORE = 1e-12


def _pairwise_distances(A, B):
    """Return the Euclidean distance between every row of A and every row of B."""
    diff = A[:, None, :] - B[None, :, :]
    return np.sqrt((diff ** 2).sum(axis=2))


def _majority_vote(dist, codes, k, n_classes):
    """Return, per row of ``dist``, the majority class code of its k nearest columns.

    Ties are resolved in favour of the smallest class code (``argmax`` picks
    the first maximum).
    """
    nearest = np.argsort(dist, axis=1, kind="stable")[:, :k]
    return np.array(
        [np.bincount(row, minlength=n_classes).argmax() for row in codes[nearest]],
        dtype=int,
    )


class _ReducedKNN:
    """k-NN classifier that first reduces the training set.

    Subclasses implement ``_select`` to decide which training samples are kept.

    Parameters:
        n_neighbors: number of neighbours used for voting (capped at the
            number of retained samples).
        radius: accepted for backward compatibility only; the k-NN vote does
            not use it.
    """

    def __init__(self, n_neighbors=5, radius=1.0):
        if n_neighbors < 1:
            raise ValueError("n_neighbors must be at least 1")
        self.n_neighbors = n_neighbors
        self.radius = radius

    def _select(self, X, codes, n_classes):
        """Return the indices of the training samples to keep."""
        raise NotImplementedError

    def fit(self, X, y):
        """Reduce the training set and store the retained samples.

        Raises:
            ValueError: if X is not a non-empty 2-D array with one label per row.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y).ravel()
        if X.ndim != 2 or len(X) == 0 or len(X) != len(y):
            raise ValueError("X must be a non-empty 2-D array with one label per row")
        # Encode labels as 0..n_classes-1 so that np.bincount works for any label type.
        self.classes_, codes = np.unique(y, return_inverse=True)
        keep = self._select(X, codes, len(self.classes_))
        self.sample_indices_ = keep
        self.X_ = X[keep]
        self.codes_ = codes[keep]
        return self

    def predict(self, X_test, X_train=None, y_train=None):
        """Predict labels for X_test by majority vote among the retained samples.

        ``X_train`` / ``y_train`` are optional and kept for backward
        compatibility: when both are given the model is refitted on them first.
        """
        if X_train is not None and y_train is not None:
            self.fit(X_train, y_train)
        X_test = np.asarray(X_test, dtype=float).reshape(-1, self.X_.shape[1])
        k = min(self.n_neighbors, len(self.X_))
        dist = _pairwise_distances(X_test, self.X_)
        votes = _majority_vote(dist, self.codes_, k, len(self.classes_))
        return self.classes_[votes]


# Edited nearest neighbour
class PrunedNearestNeighbors(_ReducedKNN):
    """Edited nearest neighbour: drop training samples misclassified by their neighbours.

    Every training sample is classified by a leave-one-out k-NN vote over the
    remaining training samples; samples whose vote disagrees with their label
    (noise / class-overlap points) are removed before prediction.
    """

    def _select(self, X, codes, n_classes):
        n_samples = len(X)
        if n_samples < 2:
            return np.arange(n_samples)
        dist = _pairwise_distances(X, X)
        # A sample must not vote for itself.
        np.fill_diagonal(dist, np.inf)
        k = min(self.n_neighbors, n_samples - 1)
        votes = _majority_vote(dist, codes, k, n_classes)
        keep = np.flatnonzero(votes == codes)
        # Never return an empty training set.
        return keep if keep.size else np.arange(n_samples)


# Condensed nearest neighbour
class CompressedNearestNeighbors(_ReducedKNN):
    """Condensed nearest neighbour: keep only a subset that classifies the rest correctly.

    Starting from the first sample, every training sample that the current
    store misclassifies is moved into the store; passes are repeated until a
    full pass adds nothing. The classic rule uses a 1-NN vote; here the same
    criterion is applied with ``n_neighbors`` votes (capped at the store size).
    """

    def _select(self, X, codes, n_classes):
        n_samples = len(X)
        store = [0]
        in_store = np.zeros(n_samples, dtype=bool)
        in_store[0] = True
        added = True
        while added:
            added = False
            for i in range(n_samples):
                if in_store[i]:
                    continue
                k = min(self.n_neighbors, len(store))
                dist = _pairwise_distances(X[i:i + 1], X[store])
                if _majority_vote(dist, codes[store], k, n_classes)[0] != codes[i]:
                    store.append(i)
                    in_store[i] = True
                    added = True
        return np.array(store)


class _ThresholdTree:
    """Entropy-based decision tree with binary threshold splits.

    Continuous features are split as ``x[feature] <= threshold`` where the
    candidate thresholds are the midpoints between consecutive distinct
    values. Subclasses define ``_score`` to rank candidate splits.

    Parameters:
        max_depth: maximum depth of the tree; ``None`` means unlimited.
        min_samples_split: a node with fewer samples becomes a leaf.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def _entropy(self, y):
        """Return the Shannon entropy (in bits) of the label array y."""
        _, counts = np.unique(y, return_counts=True)
        p = counts / counts.sum()
        return float(-np.sum(p * np.log2(p)))

    def _score(self, gain, left_fraction):
        """Return the split quality given its information gain and left-branch share."""
        raise NotImplementedError

    def _best_split(self, X, y):
        """Return ``(feature, threshold)`` of the best split, or ``(None, None)``."""
        parent_entropy = self._entropy(y)
        n_samples = len(y)
        best_score, best_feature, best_threshold = _MIN_SCORE, None, None
        for feature in range(X.shape[1]):
            column = X[:, feature]
            values = np.unique(column)
            for threshold in (values[:-1] + values[1:]) / 2.0:
                left = column <= threshold
                n_left = int(left.sum())
                # Skip degenerate splits (possible when the midpoint rounds
                # onto one of the two neighbouring values).
                if n_left == 0 or n_left == n_samples:
                    continue
                w = n_left / n_samples
                gain = (parent_entropy
                        - w * self._entropy(y[left])
                        - (1.0 - w) * self._entropy(y[~left]))
                score = self._score(gain, w)
                if score > best_score:
                    best_score, best_feature, best_threshold = score, feature, threshold
        return best_feature, best_threshold

    def fit(self, X, y):
        """Build the tree from training data.

        Raises:
            ValueError: if X is not a non-empty 2-D array with one label per row.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y).ravel()
        if X.ndim != 2 or len(X) == 0 or len(X) != len(y):
            raise ValueError("X must be a non-empty 2-D array with one label per row")
        self.n_features_ = X.shape[1]
        self.tree_ = self._build_tree(X, y)
        return self

    def _build_tree(self, X, y, depth=0):
        """Recursively build a node: a dict for a split, a bare label for a leaf."""
        labels, counts = np.unique(y, return_counts=True)
        majority = labels[counts.argmax()]
        if len(labels) == 1:
            return majority
        if self.max_depth is not None and depth >= self.max_depth:
            return majority
        if len(y) < self.min_samples_split:
            return majority
        feature, threshold = self._best_split(X, y)
        if feature is None:
            # No split improves purity (e.g. identical samples with mixed labels).
            return majority
        left = X[:, feature] <= threshold
        return {
            "feature": feature,
            "threshold": threshold,
            "left": self._build_tree(X[left], y[left], depth + 1),
            "right": self._build_tree(X[~left], y[~left], depth + 1),
        }

    def predict(self, X):
        """Predict a label for every row of X."""
        X = np.asarray(X, dtype=float).reshape(-1, self.n_features_)
        return np.array([self._predict_one(x, self.tree_) for x in X])

    def _predict_one(self, x, tree):
        """Walk from ``tree`` down to a leaf for the single sample x."""
        node = tree
        # Internal nodes are dicts; anything else is a leaf label.
        while isinstance(node, dict):
            branch = "left" if x[node["feature"]] <= node["threshold"] else "right"
            node = node[branch]
        return node


# ID3 algorithm
class ID3(_ThresholdTree):
    """ID3-style tree: the split with the largest information gain is chosen.

    Classic ID3 only handles categorical attributes; binary threshold splits
    are used here so that it works on the continuous Iris features.
    """

    def __init__(self, max_depth=None):
        super().__init__(max_depth=max_depth, min_samples_split=2)

    def _score(self, gain, left_fraction):
        return gain


# C4.5 algorithm
class C45(_ThresholdTree):
    """C4.5-style tree: the split with the largest gain ratio is chosen."""

    def __init__(self, max_depth=None, min_samples_split=2):
        super().__init__(max_depth=max_depth, min_samples_split=min_samples_split)

    def _score(self, gain, left_fraction):
        # 0 < left_fraction < 1 is guaranteed by _best_split, so the split
        # information is strictly positive.
        p = np.array([left_fraction, 1.0 - left_fraction])
        split_info = -np.sum(p * np.log2(p))
        return gain / split_info


# SVM algorithm
class SupportVectorMachine:
    """Thin wrapper around ``sklearn.svm.SVC``.

    Parameters:
        C: regularisation strength passed to ``SVC``.
        kernel: kernel type passed to ``SVC``.
        gamma: kernel coefficient passed to ``SVC``.
    """

    def __init__(self, C=1.0, kernel='rbf', gamma='scale'):
        self.C = C
        self.kernel = kernel
        self.gamma = gamma

    def fit(self, X, y):
        self.svm_ = SVC(C=self.C, kernel=self.kernel, gamma=self.gamma)
        self.svm_.fit(X, y)
        return self

    def predict(self, X):
        return self.svm_.predict(X)


def main():
    """Train every model on Iris and print test-set predictions and weighted F1."""
    # Load the Iris data set
    iris = load_iris()
    X = iris.data
    y = iris.target

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42)

    # Experiment
    models = {
        'KNN': KNeighborsClassifier(n_neighbors=5),
        'Pruned KNN': PrunedNearestNeighbors(n_neighbors=5),
        'Compressed KNN': CompressedNearestNeighbors(n_neighbors=5),
        'ID3': ID3(max_depth=5),
        'C4.5': C45(max_depth=5),
        'SVM': SupportVectorMachine(C=1.0, kernel='rbf', gamma='scale'),
    }
    for name, model in models.items():
        print(f'{name}:')
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        f1 = f1_score(y_test, y_pred, average='weighted')
        print(f'Predictions: {y_pred}')
        print(f'F1 Score: {f1:.4f}')
        print('\n')


if __name__ == '__main__':
    main()
```

解释各个算法的作用与参数: 1. 近邻法:通过选择距离最近的样本来进行预测。需要设置的参数是邻居数n_neighbors(radius参数仅为保持接口兼容而保留,k近邻投票不使用它);剪辑近邻和压缩近邻是优化方法:剪辑近邻先删除被其近邻误判的训练样本,可以减少异常值的影响;压缩近邻只保留足以正确分类其余训练样本的子集,可以减少冗余样本带来的存储和计算开销。 2. 决策树算法:通过构建决策树来进行预测。需要设置的参数是最大深度和最小样本数,ID3和C4.5是两种不同的算法,C4.5在ID3的基础上加入了信息增益比的概念;由于Iris的特征是连续值,这里两者都采用阈值二分的方式划分。 3. SVM算法:通过找到最优的超平面来进行预测。需要设置的参数是正则化参数C和核函数的类型和参数。 预测关键参数调整以后的变化趋势: 1. 
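近邻法:邻居数增加会使决策边界更平滑、预测变得更加稳定,但过大时会把其他类别的样本也纳入投票而导致欠拟合,同时也会增加计算开销;半径只在按半径查询近邻时起作用,半径增加会把更远的样本(包括异常值)纳入邻域,从而增加异常值的影响。 2. 决策树算法:最大深度增加会使模型更加复杂,可能会导致过拟合;最小样本数增加会使模型更加简单,可能会导致欠拟合。 3. SVM算法:正则化参数C增加会加大对误分类的惩罚,使模型更加关注训练样本的正确分类,可能会导致过拟合;C减小则间隔更宽、模型更简单,可能会导致欠拟合;核函数的类型和参数(例如RBF核的gamma,gamma越大决策边界越复杂)的选择会影响模型的表现。 根据不同要求修改模型结构并分析结果:可以根据具体问题的特点来选择不同的算法和参数,比如如果数据集中存在异常值,则可以选择剪辑近邻法进行预测;如果数据集较小,则可以选择决策树算法;如果数据集较大,则可以选择SVM算法(核SVM的训练时间随样本数增长较快,样本很多时可以改用线性核),或者先用压缩近邻法减少需要保存的训练样本。同时,也需要注意模型的泛化能力和效率,避免过拟合和欠拟合的问题。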




可以使用matplotlib库来绘制这个函数的图像。以下是一段示例代码:

```python
import numpy as np
import matplotlib.pyplot as plt


def func(x):
    """Evaluate the damped sine wave exp(-x) * sin(2*pi*x) at x (scalar or array)."""
    return np.exp(-x) * np.sin(2 * np.pi * x)


def main():
    """Plot func on the interval [0, 5] using 500 evenly spaced sample points."""
    x = np.linspace(0, 5, 500)
    y = func(x)
    plt.plot(x, y)
    plt.xlabel('x')
    plt.ylabel('y')
    # Wrapped in $...$ so matplotlib's mathtext renders the formula instead of
    # printing the raw caret and braces.
    plt.title(r'$y = e^{-x} \sin(2\pi x)$')
    plt.show()


if __name__ == '__main__':
    main()
```

