Assisted by scikit-learn and other packages
Yes, machine-learning libraries such as scikit-learn can serve as supporting tools for this kind of task. scikit-learn is an open-source Python machine-learning library that provides a broad set of standard algorithms and utilities, covering classification, regression, clustering, and dimensionality reduction. With scikit-learn, feature extraction, data preprocessing, and model training can all be done in a few lines of code; a minimal sketch of the basic workflow is shown below.
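This sketch is illustrative only (the dataset and model choices here are assumptions for demonstration, not part of the assignment); it shows the load → split → fit → predict → score pattern that every scikit-learn estimator follows:
```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

# load a toy dataset and split it
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# every sklearn estimator exposes the same fit/predict interface
clf = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
print(f1_score(y_test, clf.predict(X_test), average='weighted'))
```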
Beyond scikit-learn, other widely used libraries such as TensorFlow, Keras, and PyTorch provide higher-level capabilities like deep learning and neural networks, and can be used for more complex machine-learning problems.
Related question
Use nearest-neighbour and decision-tree algorithms to classify the Iris dataset. The nearest-neighbour part must implement two optimizations, edited (剪辑近邻) and condensed (压缩近邻) nearest neighbours; the decision-tree part must implement at least ID3 and C4.5. Together with the SVM algorithm already required in an earlier assignment, that makes at least five algorithms. Experimental requirements: implement with the aid of scikit-learn and similar packages. ① The code must output classification results on the test set, using F1 as the evaluation metric. ② Understand the structure of each algorithm and be able to explain what each parameter does. ③ Be able to roughly predict how results change when key parameters are adjusted. ④ Be able to modify the model structure for different requirements and analyse the results. The implementation of this experiment proceeds as follows.
1. Data preprocessing
First, we load the Iris dataset and preprocess it: split the data into a training set and a test set, and standardize the feature values. (The Iris labels come already encoded as the integers 0, 1, 2, so no extra label conversion is needed.)
```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
iris = load_iris()
X = iris.data
y = iris.target
# Iris labels are already integer-coded (0, 1, 2); keeping them non-negative
# lets us vote with np.bincount later on
# split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# standardize the feature values
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
```
2. Clipped nearest neighbours
Next, we implement a simplified "clipped" k-NN: for each test sample we take the k nearest neighbours, clip (drop) the farthest of them, and take a majority vote over the rest. (Strictly speaking, the classical edited-nearest-neighbour method removes misclassified samples from the training set; the version below is a lightweight per-query approximation of that idea.)
```python
import numpy as np

class ClipKNN:
    def __init__(self, k=3):
        self.k = k  # number of neighbours considered before clipping

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        y_pred = np.zeros(X_test.shape[0])
        for i, x_test in enumerate(X_test):
            # Euclidean distances from the test sample to every training sample
            distances = np.sqrt(np.sum((self.X_train - x_test) ** 2, axis=1))
            # take the k nearest neighbours, then clip (drop) the farthest of them
            nearest = np.argsort(distances)[:self.k]
            clipped = nearest[:-1] if self.k > 1 else nearest
            # majority vote over the remaining neighbours
            votes = np.bincount(self.y_train[clipped].astype(int))
            y_pred[i] = np.argmax(votes)
        return y_pred
```
3. Condensed nearest neighbours
Next, we implement a simplified "condensed" k-NN: starting from the k nearest neighbours of a test sample, we also admit any further training point whose distance is within a factor of 1/alpha of the k-th nearest distance, then vote over the enlarged set. (The classical condensed-nearest-neighbour method instead builds a reduced prototype set from the training data; the second implementation further below takes that route.)
```python
class CompressedKNN:
    def __init__(self, k=3, alpha=0.5):
        self.k = k          # minimum number of neighbours in the vote
        self.alpha = alpha  # admits extra neighbours out to distance d_k / alpha

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        y_pred = np.zeros(X_test.shape[0])
        for i, x_test in enumerate(X_test):
            # Euclidean distances from the test sample to every training sample
            distances = np.sqrt(np.sum((self.X_train - x_test) ** 2, axis=1))
            sorted_indices = np.argsort(distances)
            # start from the k nearest neighbours ...
            neighbors = list(sorted_indices[:self.k])
            d_k = distances[sorted_indices[self.k - 1]]
            # ... and admit further points while they stay within d_k / alpha
            for j in sorted_indices[self.k:]:
                if distances[j] * self.alpha > d_k:
                    break  # distances are sorted, so no later point qualifies
                neighbors.append(j)
            votes = np.bincount(self.y_train[neighbors].astype(int))
            y_pred[i] = np.argmax(votes)
        return y_pred
```
4. ID3
Next, we implement ID3, which selects the split that maximizes information gain and builds the tree recursively. Because the Iris features are continuous, we use binary threshold splits here (classical ID3 performs multiway splits on categorical attributes; thresholding is borrowed from C4.5/CART).
```python
from collections import Counter

class Node:
    def __init__(self, feature_idx=None, threshold=None, label=None, left=None, right=None):
        self.feature_idx = feature_idx  # index of the split feature (internal nodes)
        self.threshold = threshold      # split threshold (internal nodes)
        self.label = label              # predicted class (leaf nodes only)
        self.left = left
        self.right = right

class ID3:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def fit(self, X_train, y_train):
        self.num_classes = len(np.unique(y_train))
        self.root = self.build_tree(X_train, y_train)

    def build_tree(self, X, y, depth=0):
        num_samples, num_features = X.shape
        num_labels = len(np.unique(y))
        # stopping conditions
        if depth == self.max_depth or num_labels == 1 or num_samples < self.min_samples_split:
            label = Counter(y).most_common(1)[0][0]
            return Node(label=label)
        # choose the best feature and threshold
        best_feature_idx, best_threshold = self.choose_best_feature(X, y)
        if best_feature_idx is None:  # no valid split found: return a majority leaf
            return Node(label=Counter(y).most_common(1)[0][0])
        # recursively build the subtrees
        left_indices = X[:, best_feature_idx] < best_threshold
        right_indices = X[:, best_feature_idx] >= best_threshold
        left = self.build_tree(X[left_indices], y[left_indices], depth + 1)
        right = self.build_tree(X[right_indices], y[right_indices], depth + 1)
        return Node(best_feature_idx, best_threshold, left=left, right=right)

    def choose_best_feature(self, X, y):
        best_feature_idx, best_threshold, max_info_gain = None, None, -float('inf')
        num_samples, num_features = X.shape
        for feature_idx in range(num_features):
            thresholds = np.unique(X[:, feature_idx])
            for threshold in thresholds:
                # candidate binary split
                left_indices = X[:, feature_idx] < threshold
                right_indices = X[:, feature_idx] >= threshold
                if sum(left_indices) == 0 or sum(right_indices) == 0:
                    continue
                info_gain = self.calculate_info_gain(y, left_indices, right_indices)
                # keep the split with the largest information gain
                if info_gain > max_info_gain:
                    best_feature_idx = feature_idx
                    best_threshold = threshold
                    max_info_gain = info_gain
        return best_feature_idx, best_threshold

    def calculate_info_gain(self, y, left_indices, right_indices):
        num_samples = len(y)
        left_weight = sum(left_indices) / num_samples
        right_weight = sum(right_indices) / num_samples
        left_entropy = self.calculate_entropy(y[left_indices])
        right_entropy = self.calculate_entropy(y[right_indices])
        # information gain = parent entropy minus weighted child entropies
        return self.calculate_entropy(y) - left_weight * left_entropy - right_weight * right_entropy

    def calculate_entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return np.sum(probabilities * -np.log2(probabilities))

    def predict(self, X_test):
        y_pred = np.zeros(X_test.shape[0])
        for i, x_test in enumerate(X_test):
            node = self.root
            while node.label is None:
                node = node.left if x_test[node.feature_idx] < node.threshold else node.right
            y_pred[i] = node.label
        return y_pred
```
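As an optional sanity check (permitted by the assignment's "assisted by scikit-learn" requirement, though the exact setup here is my assumption), scikit-learn's DecisionTreeClassifier with criterion='entropy' also chooses splits by information gain, so its F1 score should land in the same ballpark as the hand-written ID3 above:
```python
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier

# reference tree trained on the same standardized split
sk_tree = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=42)
sk_tree.fit(X_train, y_train)
print('sklearn entropy tree F1:', f1_score(y_test, sk_tree.predict(X_test), average='weighted'))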
5. C4.5
Next, we implement C4.5. It chooses splits by the information gain ratio instead of raw information gain, which reduces ID3's bias toward splits that fragment the data into many small subsets.
```python
class C45:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def fit(self, X_train, y_train):
        self.num_classes = len(np.unique(y_train))
        self.root = self.build_tree(X_train, y_train)

    def build_tree(self, X, y, depth=0):
        num_samples, num_features = X.shape
        num_labels = len(np.unique(y))
        # stopping conditions
        if depth == self.max_depth or num_labels == 1 or num_samples < self.min_samples_split:
            return Node(label=Counter(y).most_common(1)[0][0])
        # choose the best feature and threshold by gain ratio
        best_feature_idx, best_threshold = self.choose_best_feature(X, y)
        if best_feature_idx is None:  # no valid split found: return a majority leaf
            return Node(label=Counter(y).most_common(1)[0][0])
        # recursively build the subtrees
        left_indices = X[:, best_feature_idx] < best_threshold
        right_indices = X[:, best_feature_idx] >= best_threshold
        left = self.build_tree(X[left_indices], y[left_indices], depth + 1)
        right = self.build_tree(X[right_indices], y[right_indices], depth + 1)
        return Node(best_feature_idx, best_threshold, left=left, right=right)

    def choose_best_feature(self, X, y):
        best_feature_idx, best_threshold, max_info_gain_ratio = None, None, -float('inf')
        num_samples, num_features = X.shape
        for feature_idx in range(num_features):
            for threshold in np.unique(X[:, feature_idx]):
                left_indices = X[:, feature_idx] < threshold
                right_indices = X[:, feature_idx] >= threshold
                if sum(left_indices) == 0 or sum(right_indices) == 0:
                    continue
                # gain ratio = information gain / split information
                info_gain = self.calculate_info_gain(y, left_indices, right_indices)
                split_information = self.calculate_split_information(y, left_indices, right_indices)
                info_gain_ratio = info_gain / split_information
                if info_gain_ratio > max_info_gain_ratio:
                    best_feature_idx = feature_idx
                    best_threshold = threshold
                    max_info_gain_ratio = info_gain_ratio
        return best_feature_idx, best_threshold

    def calculate_info_gain(self, y, left_indices, right_indices):
        num_samples = len(y)
        left_weight = sum(left_indices) / num_samples
        right_weight = sum(right_indices) / num_samples
        left_entropy = self.calculate_entropy(y[left_indices])
        right_entropy = self.calculate_entropy(y[right_indices])
        return self.calculate_entropy(y) - left_weight * left_entropy - right_weight * right_entropy

    def calculate_split_information(self, y, left_indices, right_indices):
        num_samples = len(y)
        left_weight = sum(left_indices) / num_samples
        right_weight = sum(right_indices) / num_samples
        # both weights are strictly between 0 and 1 here, so the logs are well defined
        return -left_weight * np.log2(left_weight) - right_weight * np.log2(right_weight)

    def calculate_entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return np.sum(probabilities * -np.log2(probabilities))

    def predict(self, X_test):
        y_pred = np.zeros(X_test.shape[0])
        for i, x_test in enumerate(X_test):
            node = self.root
            while node.label is None:
                node = node.left if x_test[node.feature_idx] < node.threshold else node.right
            y_pred[i] = node.label
        return y_pred
```
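For requirement ② (explaining the algorithm structure), it can also help to print a readable view of a learned tree. A sketch using sklearn's export_text on a reference tree (this reference model is my own aid for inspection, not a replacement for the implementations above):
```python
from sklearn.tree import DecisionTreeClassifier, export_text

ref_tree = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=42)
ref_tree.fit(X_train, y_train)
# an if/else view of the learned splits; note the features were standardized,
# so the thresholds are in standard-deviation units
print(export_text(ref_tree, feature_names=list(iris.feature_names)))
```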
6. F1 metric
With the algorithms in place, we define the F1 metric used to evaluate every model. Iris is a three-class problem, so a multi-class average must be specified explicitly (scikit-learn's default, average='binary', would raise an error here).
```python
from sklearn.metrics import f1_score

def evaluate(y_true, y_pred):
    # weighted average of per-class F1 scores (Iris has three classes)
    return f1_score(y_true, y_pred, average='weighted')
```
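The choice of average matters: 'macro' averages the per-class F1 scores equally, while 'weighted' weights them by class support. A tiny illustration (the toy labels are made up for demonstration):
```python
# illustrative toy labels, not from the Iris experiment
y_true = [0, 1, 2, 2]
y_pred = [0, 1, 2, 1]
print(f1_score(y_true, y_pred, average='macro'))     # unweighted mean of per-class F1
print(f1_score(y_true, y_pred, average='weighted'))  # mean weighted by class frequency
```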
7. Testing the models
Finally, we train every implemented algorithm and evaluate its performance on the test set.
```python
clip_knn = ClipKNN(k=3)
clip_knn.fit(X_train, y_train)
clip_knn_y_pred = clip_knn.predict(X_test)
print('ClipKNN F1 score:', evaluate(y_test, clip_knn_y_pred))
compressed_knn = CompressedKNN(k=3, alpha=0.5)
compressed_knn.fit(X_train, y_train)
compressed_knn_y_pred = compressed_knn.predict(X_test)
print('CompressedKNN F1 score:', evaluate(y_test, compressed_knn_y_pred))
id3 = ID3(max_depth=5, min_samples_split=2)
id3.fit(X_train, y_train)
id3_y_pred = id3.predict(X_test)
print('ID3 F1 score:', evaluate(y_test, id3_y_pred))
c45 = C45(max_depth=5, min_samples_split=2)
c45.fit(X_train, y_train)
c45_y_pred = c45.predict(X_test)
print('C45 F1 score:', evaluate(y_test, c45_y_pred))
from sklearn.svm import SVC
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
svm_y_pred = svm.predict(X_test)
print('SVM F1 score:', evaluate(y_test, svm_y_pred))
```
We can compare the algorithms by their F1 scores and adjust parameters and model structure as needed; the sketch below shows a more systematic way to tune a parameter with cross-validation.
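For systematic tuning, sklearn's GridSearchCV cross-validates every combination in a parameter grid; the grid below is a sketch with an illustrative range:
```python
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

param_grid = {'n_neighbors': [1, 3, 5, 7, 9]}  # illustrative range
search = GridSearchCV(KNeighborsClassifier(), param_grid, scoring='f1_weighted', cv=5)
search.fit(X_train, y_train)
print('best params:', search.best_params_, 'CV F1:', search.best_score_)
```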
Use nearest-neighbour and decision-tree algorithms to classify the Iris dataset; the nearest-neighbour part must implement the edited and condensed optimizations, the decision-tree part at least ID3 and C4.5, and together with SVM that makes at least five algorithms. Provide the code, with all algorithms in a single file for easy inspection and comparison. Experimental requirements: 1. Implement with the aid of scikit-learn and similar packages. ① The code must output classification results on the test set, using F1 as the evaluation metric. ② Understand the structure of each algorithm and explain what each parameter does. ③ Roughly predict how results change when key parameters are adjusted. ④ Modify the model structure for different requirements and analyse the results.
Below is a Python implementation covering the five algorithms and printing each one's F1 score on the test set:
```python
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.svm import SVC
# load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target
# split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Edited ("pruned") nearest neighbours: the classical Wilson-editing formulation,
# which removes training points that disagree with the majority of their own
# k nearest neighbours, then classifies new samples with k-NN on the edited set
class PrunedNearestNeighbors(NearestNeighbors):
    def __init__(self, n_neighbors=5, radius=1.0):
        super().__init__(n_neighbors=n_neighbors, radius=radius)

    def fit(self, X, y=None):
        X, y = np.asarray(X), np.asarray(y)
        nn = NearestNeighbors(n_neighbors=self.n_neighbors + 1).fit(X)
        _, indices = nn.kneighbors(X)
        indices = indices[:, 1:]  # drop each point itself
        votes = np.array([np.bincount(y[row]).argmax() for row in indices])
        keep = votes == y  # keep only points their neighbourhood agrees with
        self.X_edited_, self.y_edited_ = X[keep], y[keep]
        return super().fit(self.X_edited_)

    def predict(self, X_test):
        k = min(self.n_neighbors, len(self.X_edited_))
        _, indices = self.kneighbors(X_test, n_neighbors=k)
        return np.array([np.bincount(self.y_edited_[row]).argmax() for row in indices])
# Condensed ("compressed") nearest neighbours: Hart's condensing, which keeps a
# reduced prototype set that still classifies every training point correctly with 1-NN
class CompressedNearestNeighbors(NearestNeighbors):
    def __init__(self, n_neighbors=5, radius=1.0):
        super().__init__(n_neighbors=n_neighbors, radius=radius)

    def fit(self, X, y=None):
        X, y = np.asarray(X), np.asarray(y)
        keep = [0]  # start the prototype set with an arbitrary point
        changed = True
        while changed:
            changed = False
            for i in range(len(X)):
                if i in keep:
                    continue
                d = np.linalg.norm(X[keep] - X[i], axis=1)
                if y[keep][np.argmin(d)] != y[i]:  # misclassified by the prototypes
                    keep.append(i)
                    changed = True
        self.X_proto_, self.y_proto_ = X[keep], y[keep]
        return super().fit(self.X_proto_)

    def predict(self, X_test):
        k = min(self.n_neighbors, len(self.X_proto_))
        _, indices = self.kneighbors(X_test, n_neighbors=k)
        return np.array([np.bincount(self.y_proto_[row]).argmax() for row in indices])
# ID3 (treats every attribute as categorical with multiway splits, so continuous
# features should be discretized before fitting; see the experiment section below)
class ID3:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def _entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        p = counts / len(y)
        return -np.sum(p * np.log2(p))

    def _information_gain(self, X, y, feature):
        vals, counts = np.unique(X[:, feature], return_counts=True)
        p = counts / len(X)
        ent = np.array([self._entropy(y[X[:, feature] == v]) for v in vals])
        return self._entropy(y) - np.sum(p * ent)

    def fit(self, X, y):
        self.n_features_ = X.shape[1]
        self.majority_ = np.bincount(y).argmax()  # fallback for unseen attribute values
        self.tree_ = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        if len(np.unique(y)) == 1:
            return y[0]
        if self.max_depth is not None and depth >= self.max_depth:
            return np.bincount(y).argmax()
        gains = np.array([self._information_gain(X, y, f) for f in range(n_features)])
        if gains.max() <= 0:  # no informative split left
            return np.bincount(y).argmax()
        best_feature = np.argmax(gains)
        tree = {best_feature: {}}
        # one branch per observed value of the best feature (multiway split)
        for val in np.unique(X[:, best_feature]):
            mask = X[:, best_feature] == val
            tree[best_feature][val] = self._build_tree(X[mask], y[mask], depth + 1)
        return tree

    def predict(self, X):
        return np.array([self._predict_one(x, self.tree_) for x in X])

    def _predict_one(self, x, tree):
        if not isinstance(tree, dict):  # leaf (labels may be numpy ints, not Python ints)
            return tree
        feature = list(tree.keys())[0]
        val = x[feature]
        if val not in tree[feature]:  # value never seen during training
            return self.majority_
        return self._predict_one(x, tree[feature][val])
# C4.5 (like ID3 but selects splits by gain ratio; attributes are still categorical)
class C45:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def _entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        p = counts / len(y)
        return -np.sum(p * np.log2(p))

    def _information_gain_ratio(self, X, y, feature):
        vals, counts = np.unique(X[:, feature], return_counts=True)
        p = counts / len(X)
        ent = np.array([self._entropy(y[X[:, feature] == v]) for v in vals])
        iv = -np.sum(p * np.log2(p))  # split information (intrinsic value)
        if iv == 0:  # the feature has a single value: useless as a split
            return 0.0
        return (self._entropy(y) - np.sum(p * ent)) / iv

    def fit(self, X, y):
        self.n_features_ = X.shape[1]
        self.majority_ = np.bincount(y).argmax()  # fallback for unseen attribute values
        self.tree_ = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        if len(np.unique(y)) == 1:
            return y[0]
        if self.max_depth is not None and depth >= self.max_depth:
            return np.bincount(y).argmax()
        if n_samples < self.min_samples_split:
            return np.bincount(y).argmax()
        gains = np.array([self._information_gain_ratio(X, y, f) for f in range(n_features)])
        if gains.max() <= 0:  # no informative split left
            return np.bincount(y).argmax()
        best_feature = np.argmax(gains)
        tree = {best_feature: {}}
        for val in np.unique(X[:, best_feature]):
            mask = X[:, best_feature] == val
            tree[best_feature][val] = self._build_tree(X[mask], y[mask], depth + 1)
        return tree

    def predict(self, X):
        return np.array([self._predict_one(x, self.tree_) for x in X])

    def _predict_one(self, x, tree):
        if not isinstance(tree, dict):
            return tree
        feature = list(tree.keys())[0]
        val = x[feature]
        if val not in tree[feature]:
            return self.majority_
        return self._predict_one(x, tree[feature][val])
# SVM (a thin wrapper around scikit-learn's SVC)
class SupportVectorMachine:
    def __init__(self, C=1.0, kernel='rbf', gamma='scale'):
        self.C = C
        self.kernel = kernel
        self.gamma = gamma

    def fit(self, X, y):
        self.svm_ = SVC(C=self.C, kernel=self.kernel, gamma=self.gamma)
        self.svm_.fit(X, y)

    def predict(self, X):
        return self.svm_.predict(X)
# Experiment
# ID3/C4.5 above treat attributes as categorical, so discretize the continuous
# Iris features into a few equal-width bins before feeding them to the trees
edges = [np.histogram_bin_edges(X[:, j], bins=4)[1:-1] for j in range(X.shape[1])]
def discretize(A):
    return np.stack([np.digitize(A[:, j], edges[j]) for j in range(A.shape[1])], axis=1)
X_train_d, X_test_d = discretize(X_train), discretize(X_test)

models = {'KNN': KNeighborsClassifier(n_neighbors=5),
          'Pruned KNN': PrunedNearestNeighbors(n_neighbors=5),
          'Compressed KNN': CompressedNearestNeighbors(n_neighbors=5),
          'ID3': ID3(max_depth=5),
          'C4.5': C45(max_depth=5),
          'SVM': SupportVectorMachine(C=1.0, kernel='rbf', gamma='scale')}
for name, model in models.items():
    # the hand-written trees get the discretized features, the rest use the raw ones
    Xtr, Xte = (X_train_d, X_test_d) if name in ('ID3', 'C4.5') else (X_train, X_test)
    model.fit(Xtr, y_train)
    y_pred = model.predict(Xte)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f'{name}: F1 Score = {f1:.4f}')
```
An explanation of what each algorithm does and what its parameters control:
1. Nearest neighbours: predict by a majority vote among the closest training samples. The key parameter is the number of neighbours n_neighbors; the radius parameter only affects radius-based queries and is unused in the predictions above. Editing (pruning) removes mislabelled or outlying training points, and condensing removes redundant ones, so the two optimizations reduce those samples' influence (see the n_neighbors sweep after this list).
2. Decision trees: predict by walking a tree of attribute tests. The key parameters are the maximum depth and the minimum number of samples required to split a node. ID3 and C4.5 differ in the split criterion: C4.5 replaces ID3's information gain with the gain ratio, reducing the bias toward features with many distinct values.
3. SVM: predicts with a maximum-margin separating hyperplane. The key parameters are the regularization constant C and the kernel type together with its parameters (e.g. gamma for the RBF kernel).
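A quick empirical check of the neighbour-count claim (a sketch; the exact scores depend on the train/test split):
```python
# sweep the neighbour count and watch the test F1
for k in (1, 3, 5, 9, 15):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    f1 = f1_score(y_test, knn.predict(X_test), average='weighted')
    print(f'k={k}: F1={f1:.4f}')
```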
Expected trends when key parameters change:
1. Nearest neighbours: increasing the number of neighbours smooths the decision boundary and stabilizes predictions, but too large a value blurs class boundaries and raises the prediction cost.
2. Decision trees: increasing the maximum depth makes the model more expressive and risks overfitting; raising the minimum samples per split simplifies the model and risks underfitting (see the depth sweep after this list).
3. SVM: increasing C penalizes training errors more heavily, which can lead to overfitting; decreasing it widens the margin, which can underfit. The kernel type and its parameters shape the decision boundary in the same way.
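The depth trend can be checked the same way (a sketch on a reference sklearn tree; small depths should underfit, large ones may overfit):
```python
# sweep the maximum depth and compare train vs. test F1 to spot over/underfitting
for depth in (1, 2, 3, 5, None):
    tree = DecisionTreeClassifier(max_depth=depth, random_state=0).fit(X_train, y_train)
    f1_train = f1_score(y_train, tree.predict(X_train), average='weighted')
    f1_test = f1_score(y_test, tree.predict(X_test), average='weighted')
    print(f'max_depth={depth}: train F1={f1_train:.4f}, test F1={f1_test:.4f}')
```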
Modifying the model structure for different requirements: choose the algorithm and parameters to match the data. If the data contain mislabelled or outlying points, edited nearest neighbours helps; if interpretability matters, a decision tree is a natural choice; for data with clear class margins, an SVM with a suitable kernel often works well (note that kernel SVMs scale poorly as the number of samples grows). In every case, compare training and test F1 to watch for over- and underfitting. As a simple structural change, the sketch below swaps the SVM kernel and compares the resulting scores.
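Sketch of the kernel swap just mentioned (the kernels and settings are illustrative choices):
```python
# change the model structure by swapping the SVM kernel
for kernel in ('linear', 'poly', 'rbf'):
    svm = SVC(kernel=kernel, gamma='scale').fit(X_train, y_train)
    f1 = f1_score(y_test, svm.predict(X_test), average='weighted')
    print(f'kernel={kernel}: F1={f1:.4f}')
```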