Use the nearest-neighbor method and decision-tree algorithms to classify the Iris dataset. The nearest-neighbor part must implement two optimizations: edited nearest neighbor (剪辑近邻) and condensed nearest neighbor (压缩近邻). The decision-tree part must implement at least ID3 and C4.5. Together with SVM, that makes at least five algorithms. Provide the code, with all algorithms in a single file so they are easy to inspect and compare. Requirements for the experiment: 1. scikit-learn and similar packages may be used to assist the implementation. ① The code must output the classification results on the test set, using F1 as the evaluation metric. ② Understand the structure of each algorithm and be able to explain the role of each parameter. ③ Be able to roughly predict how the results change when key parameters are adjusted. ④ Be able to modify the model structure according to different requirements and analyze the results.
Below is a Python implementation that puts all five algorithms in one file and prints the weighted F1 score of each on the test set:
```python
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import f1_score
from sklearn.svm import SVC
# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Edited nearest neighbors (剪辑近邻): remove training samples that disagree with
# the majority vote of their k nearest neighbors, then classify new samples
# with an ordinary kNN on the edited training set.
class PrunedNearestNeighbors:
    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors

    def fit(self, X_train, y_train):
        nn = NearestNeighbors(n_neighbors=self.n_neighbors + 1).fit(X_train)
        _, indices = nn.kneighbors(X_train)
        indices = indices[:, 1:]  # drop each sample itself
        keep = np.array([np.bincount(y_train[indices[i]]).argmax() == y_train[i]
                         for i in range(len(X_train))])
        self.X_train_, self.y_train_ = X_train[keep], y_train[keep]
        self.knn_ = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        self.knn_.fit(self.X_train_, self.y_train_)
        return self

    def predict(self, X_test):
        return self.knn_.predict(X_test)

# Condensed nearest neighbors (压缩近邻, Hart's algorithm): greedily keep a small
# prototype subset that still classifies every training sample correctly with
# 1-NN, then classify new samples with kNN on that condensed subset.
class CompressedNearestNeighbors:
    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors

    def fit(self, X_train, y_train):
        store = [0]  # start from a single prototype
        changed = True
        while changed:
            changed = False
            for i in range(len(X_train)):
                if i in store:
                    continue
                nn = NearestNeighbors(n_neighbors=1).fit(X_train[store])
                _, idx = nn.kneighbors(X_train[i:i + 1])
                if y_train[store][idx[0, 0]] != y_train[i]:
                    store.append(i)  # misclassified samples become prototypes
                    changed = True
        self.X_train_, self.y_train_ = X_train[store], y_train[store]
        k = min(self.n_neighbors, len(store))
        self.knn_ = KNeighborsClassifier(n_neighbors=k)
        self.knn_.fit(self.X_train_, self.y_train_)
        return self

    def predict(self, X_test):
        return self.knn_.predict(X_test)
# ID3: builds a decision tree by choosing, at each node, the attribute with the
# largest information gain. Attribute values are treated as categorical, so the
# continuous Iris features work best after discretization.
class ID3:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def _entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        p = counts / len(y)
        return -np.sum(p * np.log2(p))

    def _information_gain(self, X, y, feature):
        vals, counts = np.unique(X[:, feature], return_counts=True)
        p = counts / len(X)
        ent = np.array([self._entropy(y[X[:, feature] == v]) for v in vals])
        return self._entropy(y) - np.sum(p * ent)

    def fit(self, X, y):
        self.n_features_ = X.shape[1]
        self.tree_ = self._build_tree(X, y)
        return self

    def _build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        if len(np.unique(y)) == 1:              # pure node -> leaf
            return y[0]
        if self.max_depth is not None and depth >= self.max_depth:
            return np.bincount(y).argmax()      # depth limit -> majority leaf
        gains = np.array([self._information_gain(X, y, f) for f in range(n_features)])
        best_feature = int(np.argmax(gains))
        branches = {'__default__': np.bincount(y).argmax()}  # fallback for unseen values
        for val in np.unique(X[:, best_feature]):
            mask = X[:, best_feature] == val
            branches[val] = self._build_tree(X[mask], y[mask], depth + 1)
        return {best_feature: branches}

    def predict(self, X):
        return np.array([self._predict_one(x, self.tree_) for x in X])

    def _predict_one(self, x, tree):
        if not isinstance(tree, dict):          # leaf node
            return tree
        feature = list(tree.keys())[0]
        branches = tree[feature]
        # unseen attribute value -> fall back to the node's majority class
        sub_tree = branches.get(x[feature], branches['__default__'])
        return self._predict_one(x, sub_tree)
# C4.5: same tree-growing procedure as ID3, but split attributes are chosen by
# the information gain ratio (gain / intrinsic value), which penalizes
# attributes with many distinct values; also supports a minimum split size.
class C45:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def _entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        p = counts / len(y)
        return -np.sum(p * np.log2(p))

    def _information_gain_ratio(self, X, y, feature):
        vals, counts = np.unique(X[:, feature], return_counts=True)
        p = counts / len(X)
        ent = np.array([self._entropy(y[X[:, feature] == v]) for v in vals])
        iv = -np.sum(p * np.log2(p))            # intrinsic value of the split
        if iv == 0:                             # single-valued attribute
            return 0.0
        return (self._entropy(y) - np.sum(p * ent)) / iv

    def fit(self, X, y):
        self.n_features_ = X.shape[1]
        self.tree_ = self._build_tree(X, y)
        return self

    def _build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        if len(np.unique(y)) == 1:              # pure node -> leaf
            return y[0]
        if self.max_depth is not None and depth >= self.max_depth:
            return np.bincount(y).argmax()
        if n_samples < self.min_samples_split:
            return np.bincount(y).argmax()
        gains = np.array([self._information_gain_ratio(X, y, f) for f in range(n_features)])
        best_feature = int(np.argmax(gains))
        branches = {'__default__': np.bincount(y).argmax()}  # fallback for unseen values
        for val in np.unique(X[:, best_feature]):
            mask = X[:, best_feature] == val
            branches[val] = self._build_tree(X[mask], y[mask], depth + 1)
        return {best_feature: branches}

    def predict(self, X):
        return np.array([self._predict_one(x, self.tree_) for x in X])

    def _predict_one(self, x, tree):
        if not isinstance(tree, dict):          # leaf node
            return tree
        feature = list(tree.keys())[0]
        branches = tree[feature]
        sub_tree = branches.get(x[feature], branches['__default__'])
        return self._predict_one(x, sub_tree)
# SVM: a thin wrapper around sklearn's SVC so it shares the same fit/predict
# interface as the other models above.
class SupportVectorMachine:
    def __init__(self, C=1.0, kernel='rbf', gamma='scale'):
        self.C = C                # penalty for misclassified training samples
        self.kernel = kernel      # kernel type: 'linear', 'poly', 'rbf', ...
        self.gamma = gamma        # kernel coefficient for 'rbf'/'poly'/'sigmoid'

    def fit(self, X, y):
        self.svm_ = SVC(C=self.C, kernel=self.kernel, gamma=self.gamma)
        self.svm_.fit(X, y)
        return self

    def predict(self, X):
        return self.svm_.predict(X)
# Experiment: fit each model on the training set and report the weighted F1
# score on the test set.
models = {'KNN': KNeighborsClassifier(n_neighbors=5),
          'Pruned KNN': PrunedNearestNeighbors(n_neighbors=5),
          'Compressed KNN': CompressedNearestNeighbors(n_neighbors=5),
          'ID3': ID3(max_depth=5),
          'C4.5': C45(max_depth=5),
          'SVM': SupportVectorMachine(C=1.0, kernel='rbf', gamma='scale')}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f'{name}: F1 Score = {f1:.4f}')
```
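The ID3 and C4.5 classes above treat every distinct attribute value as a separate branch, which suits categorical data; since the Iris features are continuous, the trees generalize better if the features are discretized first. A minimal sketch of one way to do this, assuming the `ID3`/`C45` classes, imports, and train/test split defined above, using scikit-learn's `KBinsDiscretizer`:

```python
from sklearn.preprocessing import KBinsDiscretizer

# Bin each continuous feature into a few ordinal categories so that the
# categorical splits of ID3/C4.5 become meaningful on Iris.
disc = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='quantile')
X_train_disc = disc.fit_transform(X_train)   # fit the bin edges on training data only
X_test_disc = disc.transform(X_test)

for name, tree_model in [('ID3 (discretized)', ID3(max_depth=5)),
                         ('C4.5 (discretized)', C45(max_depth=5))]:
    tree_model.fit(X_train_disc, y_train)
    y_pred = tree_model.predict(X_test_disc)
    print(f'{name}: F1 Score = {f1_score(y_test, y_pred, average="weighted"):.4f}')
```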
What each algorithm does and what its parameters mean:
1. Nearest neighbors: predicts a sample's class by majority vote among the closest training samples; the key parameter is the number of neighbors k. Edited nearest neighbor removes training samples that conflict with their neighbors, reducing the influence of noisy or outlying points, while condensed nearest neighbor keeps only a small prototype subset, removing redundant samples. (A short sweep over k follows this list.)
2. Decision trees: predict by walking a tree built from the training data. The key parameters are the maximum depth and the minimum number of samples required to split a node. ID3 and C4.5 are two different algorithms; C4.5 builds on ID3 by selecting split attributes with the information gain ratio instead of the raw information gain.
3. SVM: finds the maximum-margin separating hyperplane (with a kernel for non-linear boundaries). The key parameters are the regularization parameter C and the kernel type together with its parameters (e.g. gamma for the RBF kernel).
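As a quick illustration of the role of k mentioned in item 1, the following sketch (assuming the same train/test split and imports as above) sweeps the number of neighbors for the plain scikit-learn kNN and prints the resulting F1 scores:

```python
# Sweep the number of neighbors k and watch how the weighted F1 changes;
# very small k is sensitive to noise, very large k oversmooths the boundary.
for k in [1, 3, 5, 9, 15, 25]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    f1 = f1_score(y_test, knn.predict(X_test), average='weighted')
    print(f'k={k:2d}  F1={f1:.4f}')
```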
Predicted trends when key parameters are adjusted:
1. Nearest neighbors: increasing the number of neighbors makes predictions more stable (smoother decision boundaries) but raises the prediction cost; making the neighborhood too large lets distant, possibly outlying samples influence the vote.
2. Decision trees: increasing the maximum depth makes the model more complex and can lead to overfitting; increasing the minimum samples per split makes the model simpler and can lead to underfitting.
3. SVM: increasing C penalizes training errors more heavily, so the model fits the training data more closely and may overfit (a small C means stronger regularization); the choice of kernel and its parameters also strongly affects performance. A short parameter sweep after this list illustrates the depth and C trends.
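To check these predictions empirically, one can sweep the key parameters and watch the F1 trend. A small sketch, assuming the split and imports defined above and using scikit-learn's `DecisionTreeClassifier` as a stand-in for the hand-written trees:

```python
# Effect of tree depth: very shallow trees underfit, very deep trees can overfit.
for depth in [1, 2, 3, 5, 10]:
    tree = DecisionTreeClassifier(max_depth=depth, criterion='entropy', random_state=42)
    tree.fit(X_train, y_train)
    f1 = f1_score(y_test, tree.predict(X_test), average='weighted')
    print(f'max_depth={depth:2d}  F1={f1:.4f}')

# Effect of the SVM penalty C: small C = stronger regularization (smoother
# boundary), large C = fit the training data more closely.
for C in [0.01, 0.1, 1, 10, 100]:
    svm = SVC(C=C, kernel='rbf', gamma='scale')
    svm.fit(X_train, y_train)
    f1 = f1_score(y_test, svm.predict(X_test), average='weighted')
    print(f'C={C:<6}  F1={f1:.4f}')
```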
Modifying the model structure for different requirements and analyzing the results: the algorithm and its parameters should be chosen to match the problem. For example, if the dataset contains noisy or outlying samples, edited nearest neighbor is a good choice; for small datasets a decision tree is easy to train and interpret; for larger datasets an SVM may be preferable (see the kernel-comparison sketch below). In every case, pay attention to generalization ability and efficiency, and guard against both overfitting and underfitting.
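As one concrete example of adapting the model structure, the SVM kernel can be swapped and the results compared. A minimal sketch, assuming the same data split and imports as above:

```python
# Compare kernels: a linear kernel gives a simpler model (often enough for Iris),
# while poly/rbf kernels give more flexible, non-linear decision boundaries.
for kernel in ['linear', 'poly', 'rbf']:
    svm = SVC(C=1.0, kernel=kernel, gamma='scale')
    svm.fit(X_train, y_train)
    f1 = f1_score(y_test, svm.predict(X_test), average='weighted')
    print(f'kernel={kernel:6s}  F1={f1:.4f}')
```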