Use the nearest-neighbor method and decision-tree algorithms to classify the Iris dataset. The nearest-neighbor part must implement two optimizations: edited nearest neighbor (剪辑近邻) and condensed nearest neighbor (压缩近邻). The decision-tree part must implement at least ID3 and C4.5. Together with SVM, that makes at least five algorithms. Provide the code, with all algorithms in a single file so they are easy to inspect and compare. Requirements for the experiment: 1. scikit-learn and similar packages may be used to assist the implementation. ① The code must output the classification results on the test set, using F1 as the evaluation metric. ② Understand the structure of each algorithm and be able to explain the role of each parameter. ③ Be able to roughly predict how the results change when key parameters are adjusted. ④ Be able to modify the model structure according to different requirements and analyze the results.
Below is a Python implementation that puts all five algorithms in one file and prints the weighted F1 score of each on the test set:
```python
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import f1_score
from sklearn.svm import SVC
# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Edited nearest neighbors (剪辑近邻): remove training samples that disagree with
# the majority vote of their k nearest neighbors, then classify new samples
# with an ordinary kNN on the edited training set.
class PrunedNearestNeighbors:
    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors

    def fit(self, X_train, y_train):
        nn = NearestNeighbors(n_neighbors=self.n_neighbors + 1).fit(X_train)
        _, indices = nn.kneighbors(X_train)
        indices = indices[:, 1:]  # drop each sample itself
        keep = np.array([np.bincount(y_train[indices[i]]).argmax() == y_train[i]
                         for i in range(len(X_train))])
        self.X_train_, self.y_train_ = X_train[keep], y_train[keep]
        self.knn_ = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        self.knn_.fit(self.X_train_, self.y_train_)
        return self

    def predict(self, X_test):
        return self.knn_.predict(X_test)

# Condensed nearest neighbors (压缩近邻, Hart's algorithm): greedily keep a small
# prototype subset that still classifies every training sample correctly with
# 1-NN, then classify new samples with kNN on that condensed subset.
class CompressedNearestNeighbors:
    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors

    def fit(self, X_train, y_train):
        store = [0]  # start from a single prototype
        changed = True
        while changed:
            changed = False
            for i in range(len(X_train)):
                if i in store:
                    continue
                nn = NearestNeighbors(n_neighbors=1).fit(X_train[store])
                _, idx = nn.kneighbors(X_train[i:i + 1])
                if y_train[store][idx[0, 0]] != y_train[i]:
                    store.append(i)  # misclassified samples become prototypes
                    changed = True
        self.X_train_, self.y_train_ = X_train[store], y_train[store]
        k = min(self.n_neighbors, len(store))
        self.knn_ = KNeighborsClassifier(n_neighbors=k)
        self.knn_.fit(self.X_train_, self.y_train_)
        return self

    def predict(self, X_test):
        return self.knn_.predict(X_test)
# ID3: builds a decision tree by choosing, at each node, the attribute with the
# largest information gain. Attribute values are treated as categorical, so the
# continuous Iris features work best after discretization.
class ID3:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def _entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        p = counts / len(y)
        return -np.sum(p * np.log2(p))

    def _information_gain(self, X, y, feature):
        vals, counts = np.unique(X[:, feature], return_counts=True)
        p = counts / len(X)
        ent = np.array([self._entropy(y[X[:, feature] == v]) for v in vals])
        return self._entropy(y) - np.sum(p * ent)

    def fit(self, X, y):
        self.n_features_ = X.shape[1]
        self.tree_ = self._build_tree(X, y)
        return self

    def _build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        if len(np.unique(y)) == 1:              # pure node -> leaf
            return y[0]
        if self.max_depth is not None and depth >= self.max_depth:
            return np.bincount(y).argmax()      # depth limit -> majority leaf
        gains = np.array([self._information_gain(X, y, f) for f in range(n_features)])
        best_feature = int(np.argmax(gains))
        branches = {'__default__': np.bincount(y).argmax()}  # fallback for unseen values
        for val in np.unique(X[:, best_feature]):
            mask = X[:, best_feature] == val
            branches[val] = self._build_tree(X[mask], y[mask], depth + 1)
        return {best_feature: branches}

    def predict(self, X):
        return np.array([self._predict_one(x, self.tree_) for x in X])

    def _predict_one(self, x, tree):
        if not isinstance(tree, dict):          # leaf node
            return tree
        feature = list(tree.keys())[0]
        branches = tree[feature]
        # unseen attribute value -> fall back to the node's majority class
        sub_tree = branches.get(x[feature], branches['__default__'])
        return self._predict_one(x, sub_tree)
# C4.5: same tree-growing procedure as ID3, but split attributes are chosen by
# the information gain ratio (gain / intrinsic value), which penalizes
# attributes with many distinct values; also supports a minimum split size.
class C45:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def _entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        p = counts / len(y)
        return -np.sum(p * np.log2(p))

    def _information_gain_ratio(self, X, y, feature):
        vals, counts = np.unique(X[:, feature], return_counts=True)
        p = counts / len(X)
        ent = np.array([self._entropy(y[X[:, feature] == v]) for v in vals])
        iv = -np.sum(p * np.log2(p))            # intrinsic value of the split
        if iv == 0:                             # single-valued attribute
            return 0.0
        return (self._entropy(y) - np.sum(p * ent)) / iv

    def fit(self, X, y):
        self.n_features_ = X.shape[1]
        self.tree_ = self._build_tree(X, y)
        return self

    def _build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        if len(np.unique(y)) == 1:              # pure node -> leaf
            return y[0]
        if self.max_depth is not None and depth >= self.max_depth:
            return np.bincount(y).argmax()
        if n_samples < self.min_samples_split:
            return np.bincount(y).argmax()
        gains = np.array([self._information_gain_ratio(X, y, f) for f in range(n_features)])
        best_feature = int(np.argmax(gains))
        branches = {'__default__': np.bincount(y).argmax()}  # fallback for unseen values
        for val in np.unique(X[:, best_feature]):
            mask = X[:, best_feature] == val
            branches[val] = self._build_tree(X[mask], y[mask], depth + 1)
        return {best_feature: branches}

    def predict(self, X):
        return np.array([self._predict_one(x, self.tree_) for x in X])

    def _predict_one(self, x, tree):
        if not isinstance(tree, dict):          # leaf node
            return tree
        feature = list(tree.keys())[0]
        branches = tree[feature]
        sub_tree = branches.get(x[feature], branches['__default__'])
        return self._predict_one(x, sub_tree)
# SVM: a thin wrapper around sklearn's SVC so it shares the same fit/predict
# interface as the other models above.
class SupportVectorMachine:
    def __init__(self, C=1.0, kernel='rbf', gamma='scale'):
        self.C = C                # penalty for misclassified training samples
        self.kernel = kernel      # kernel type: 'linear', 'poly', 'rbf', ...
        self.gamma = gamma        # kernel coefficient for 'rbf'/'poly'/'sigmoid'

    def fit(self, X, y):
        self.svm_ = SVC(C=self.C, kernel=self.kernel, gamma=self.gamma)
        self.svm_.fit(X, y)
        return self

    def predict(self, X):
        return self.svm_.predict(X)
# Experiment: fit each model on the training set and report the weighted F1
# score on the test set.
models = {'KNN': KNeighborsClassifier(n_neighbors=5),
          'Pruned KNN': PrunedNearestNeighbors(n_neighbors=5),
          'Compressed KNN': CompressedNearestNeighbors(n_neighbors=5),
          'ID3': ID3(max_depth=5),
          'C4.5': C45(max_depth=5),
          'SVM': SupportVectorMachine(C=1.0, kernel='rbf', gamma='scale')}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f'{name}: F1 Score = {f1:.4f}')
```
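The ID3 and C4.5 classes above treat every distinct attribute value as a separate branch, which suits categorical data; since the Iris features are continuous, the trees generalize better if the features are discretized first. A minimal sketch of one way to do this, assuming the `ID3`/`C45` classes, imports, and train/test split defined above, using scikit-learn's `KBinsDiscretizer`:

```python
from sklearn.preprocessing import KBinsDiscretizer

# Bin each continuous feature into a few ordinal categories so that the
# categorical splits of ID3/C4.5 become meaningful on Iris.
disc = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='quantile')
X_train_disc = disc.fit_transform(X_train)   # fit the bin edges on training data only
X_test_disc = disc.transform(X_test)

for name, tree_model in [('ID3 (discretized)', ID3(max_depth=5)),
                         ('C4.5 (discretized)', C45(max_depth=5))]:
    tree_model.fit(X_train_disc, y_train)
    y_pred = tree_model.predict(X_test_disc)
    print(f'{name}: F1 Score = {f1_score(y_test, y_pred, average="weighted"):.4f}')
```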
What each algorithm does and what its parameters mean:
1. Nearest neighbors: predicts a sample's class by majority vote among the closest training samples; the key parameter is the number of neighbors k. Edited nearest neighbor removes training samples that conflict with their neighbors, reducing the influence of noisy or outlying points, while condensed nearest neighbor keeps only a small prototype subset, removing redundant samples. (A short sweep over k follows this list.)
2. Decision trees: predict by walking a tree built from the training data. The key parameters are the maximum depth and the minimum number of samples required to split a node. ID3 and C4.5 are two different algorithms; C4.5 builds on ID3 by selecting split attributes with the information gain ratio instead of the raw information gain.
3. SVM: finds the maximum-margin separating hyperplane (with a kernel for non-linear boundaries). The key parameters are the regularization parameter C and the kernel type together with its parameters (e.g. gamma for the RBF kernel).
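As a quick illustration of the role of k mentioned in item 1, the following sketch (assuming the same train/test split and imports as above) sweeps the number of neighbors for the plain scikit-learn kNN and prints the resulting F1 scores:

```python
# Sweep the number of neighbors k and watch how the weighted F1 changes;
# very small k is sensitive to noise, very large k oversmooths the boundary.
for k in [1, 3, 5, 9, 15, 25]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    f1 = f1_score(y_test, knn.predict(X_test), average='weighted')
    print(f'k={k:2d}  F1={f1:.4f}')
```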
Predicted trends when key parameters are adjusted:
1. Nearest neighbors: increasing the number of neighbors makes predictions more stable (smoother decision boundaries) but raises the prediction cost; making the neighborhood too large lets distant, possibly outlying samples influence the vote.
2. Decision trees: increasing the maximum depth makes the model more complex and can lead to overfitting; increasing the minimum samples per split makes the model simpler and can lead to underfitting.
3. SVM: increasing C penalizes training errors more heavily, so the model fits the training data more closely and may overfit (a small C means stronger regularization); the choice of kernel and its parameters also strongly affects performance. A short parameter sweep after this list illustrates the depth and C trends.
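To check these predictions empirically, one can sweep the key parameters and watch the F1 trend. A small sketch, assuming the split and imports defined above and using scikit-learn's `DecisionTreeClassifier` as a stand-in for the hand-written trees:

```python
# Effect of tree depth: very shallow trees underfit, very deep trees can overfit.
for depth in [1, 2, 3, 5, 10]:
    tree = DecisionTreeClassifier(max_depth=depth, criterion='entropy', random_state=42)
    tree.fit(X_train, y_train)
    f1 = f1_score(y_test, tree.predict(X_test), average='weighted')
    print(f'max_depth={depth:2d}  F1={f1:.4f}')

# Effect of the SVM penalty C: small C = stronger regularization (smoother
# boundary), large C = fit the training data more closely.
for C in [0.01, 0.1, 1, 10, 100]:
    svm = SVC(C=C, kernel='rbf', gamma='scale')
    svm.fit(X_train, y_train)
    f1 = f1_score(y_test, svm.predict(X_test), average='weighted')
    print(f'C={C:<6}  F1={f1:.4f}')
```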
Modifying the model structure for different requirements and analyzing the results: the algorithm and its parameters should be chosen to match the problem. For example, if the dataset contains noisy or outlying samples, edited nearest neighbor is a good choice; for small datasets a decision tree is easy to train and interpret; for larger datasets an SVM may be preferable (see the kernel-comparison sketch below). In every case, pay attention to generalization ability and efficiency, and guard against both overfitting and underfitting.
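As one concrete example of adapting the model structure, the SVM kernel can be swapped and the results compared. A minimal sketch, assuming the same data split and imports as above:

```python
# Compare kernels: a linear kernel gives a simpler model (often enough for Iris),
# while poly/rbf kernels give more flexible, non-linear decision boundaries.
for kernel in ['linear', 'poly', 'rbf']:
    svm = SVC(C=1.0, kernel=kernel, gamma='scale')
    svm.fit(X_train, y_train)
    f1 = f1_score(y_test, svm.predict(X_test), average='weighted')
    print(f'kernel={kernel:6s}  F1={f1:.4f}')
```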