The models may not call sklearn methods directly; the model implementation code must be written by hand. Evaluation methods may call sklearn.
OK. Here is an answer that does not use sklearn's SVM and decision tree classes, and instead implements both models from scratch.
First, we define an SVM class and a DecisionTree class. Their interfaces mirror the sklearn implementations, but all of the model code is written by hand: the SVM solves the dual quadratic program with cvxopt, and the decision tree greedily splits on the feature/threshold pair with the highest information gain.
The SVM class is implemented as follows:
```python
import numpy as np
import cvxopt
import cvxopt.solvers


class SVM:
    def __init__(self, C=1, kernel='linear', degree=3, gamma='auto', coef0=0):
        self.C = C
        self.kernel = kernel
        self.degree = degree
        self.gamma = gamma
        self.coef0 = coef0

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.X = X
        self.y = y
        # 'auto' mirrors sklearn's old default: gamma = 1 / n_features
        if self.gamma == 'auto':
            self.gamma = 1.0 / n_features
        # Compute the Gram matrix
        K = self._gram_matrix(X)
        # Set up the dual problem as a quadratic program for cvxopt:
        # minimize (1/2) a^T P a + q^T a  s.t.  G a <= h, A a = b
        P = cvxopt.matrix(np.outer(y, y) * K)
        q = cvxopt.matrix(np.ones(n_samples) * -1)
        A = cvxopt.matrix(y.astype(np.double), (1, n_samples))
        b = cvxopt.matrix(0.0)
        if self.C is None:
            # Hard margin: only a_i >= 0
            G = cvxopt.matrix(np.diag(np.ones(n_samples) * -1))
            h = cvxopt.matrix(np.zeros(n_samples))
        else:
            # Soft margin: 0 <= a_i <= C
            G_max = np.diag(np.ones(n_samples) * -1)
            G_min = np.identity(n_samples)
            G = cvxopt.matrix(np.vstack((G_max, G_min)))
            h_max = np.zeros(n_samples)
            h_min = np.ones(n_samples) * self.C
            h = cvxopt.matrix(np.hstack((h_max, h_min)))
        # Solve the quadratic program
        solution = cvxopt.solvers.qp(P, q, G, h, A, b)
        # Extract the Lagrange multipliers and keep only the support vectors
        a = np.ravel(solution['x'])
        sv = a > 1e-5
        ind = np.arange(len(a))[sv]
        self.a = a[sv]
        self.sv = X[sv]
        self.sv_y = y[sv]
        # Intercept: average over all support vectors
        self.b = 0
        for n in range(len(self.a)):
            self.b += self.sv_y[n]
            self.b -= np.sum(self.a * self.sv_y * K[ind[n], sv])
        self.b /= len(self.a)
        # Weight vector (only explicit for the linear kernel)
        if self.kernel == 'linear':
            self.w = np.zeros(n_features)
            for n in range(len(self.a)):
                self.w += self.a[n] * self.sv_y[n] * self.sv[n]
        else:
            self.w = None

    def predict(self, X):
        if self.w is not None:
            y_pred = np.dot(X, self.w) + self.b
        else:
            y_pred = np.zeros(len(X))
            for i in range(len(X)):
                s = 0
                for a, sv_y, sv in zip(self.a, self.sv_y, self.sv):
                    s += a * sv_y * self._kernel(X[i], sv)
                y_pred[i] = s
            y_pred += self.b
        return np.sign(y_pred)

    def _kernel(self, x1, x2):
        if self.kernel == 'linear':
            return np.dot(x1, x2)
        elif self.kernel == 'poly':
            return (self.gamma * np.dot(x1, x2) + self.coef0) ** self.degree
        elif self.kernel == 'rbf':
            return np.exp(-self.gamma * np.linalg.norm(x1 - x2) ** 2)
        else:
            raise ValueError(f'unknown kernel: {self.kernel}')

    def _gram_matrix(self, X):
        n_samples, n_features = X.shape
        K = np.zeros((n_samples, n_samples))
        for i in range(n_samples):
            for j in range(n_samples):
                K[i, j] = self._kernel(X[i], X[j])
        return K
```
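As a quick sanity check, here is a minimal sketch of fitting the SVM on a tiny, made-up, linearly separable toy problem. The data below is invented purely for illustration; note that labels must be in {-1, +1}, since `predict` returns `np.sign`:
```python
import numpy as np

# Toy 2D data: two well-separated clusters, labels in {-1, +1}
X_toy = np.array([[1.0, 1.0], [1.5, 1.2], [1.2, 1.6],
                  [4.0, 4.0], [4.5, 4.2], [4.2, 3.8]])
y_toy = np.array([-1.0, -1.0, -1.0, 1.0, 1.0, 1.0])

toy_svm = SVM(C=1, kernel='linear')
toy_svm.fit(X_toy, y_toy)
print(toy_svm.predict(X_toy))  # expected: [-1. -1. -1.  1.  1.  1.]
```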
The decision tree class (together with its Node helper) is implemented as follows:
```python
from collections import Counter

import numpy as np


class Node:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index_ = feature_index
        self.threshold_ = threshold
        self.left_ = left
        self.right_ = right
        self.value_ = value

    def is_leaf(self):
        return self.value_ is not None


class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        self.n_classes_ = len(set(y))
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, y)

    def predict(self, X):
        return [self._predict(inputs) for inputs in X]

    def _predict(self, inputs):
        # Walk from the root to a leaf, following the threshold tests
        node = self.tree_
        while not node.is_leaf():
            if inputs[node.feature_index_] <= node.threshold_:
                node = node.left_
            else:
                node = node.right_
        return node.value_

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(set(y))
        # Stop growing if the node is pure or the maximum depth is reached
        if n_labels == 1 or depth == self.max_depth:
            return Node(value=self._most_common_label(y))
        # Search all features for the best split (a single decision tree
        # should not subsample features with replacement)
        feature_indices = np.arange(n_features)
        best_feature, best_threshold, best_gain = self._best_criteria(X, y, feature_indices)
        # If no split reduces the entropy, make this node a leaf
        # (this also prevents infinite recursion on degenerate splits)
        if best_gain <= 0:
            return Node(value=self._most_common_label(y))
        # Recursively grow the left and right subtrees
        left_indices, right_indices = self._split(X[:, best_feature], best_threshold)
        left = self._grow_tree(X[left_indices, :], y[left_indices], depth + 1)
        right = self._grow_tree(X[right_indices, :], y[right_indices], depth + 1)
        return Node(best_feature, best_threshold, left, right)

    def _best_criteria(self, X, y, feature_indices):
        best_gain = -1
        split_index, split_threshold = None, None
        for feature_index in feature_indices:
            column = X[:, feature_index]
            thresholds = np.unique(column)
            for threshold in thresholds:
                gain = self._information_gain(y, column, threshold)
                if gain > best_gain:
                    best_gain = gain
                    split_index = feature_index
                    split_threshold = threshold
        return split_index, split_threshold, best_gain

    def _information_gain(self, y, X, split_threshold):
        parent_entropy = self._entropy(y)
        left_indices, right_indices = self._split(X, split_threshold)
        if len(left_indices) == 0 or len(right_indices) == 0:
            return 0
        n = len(y)
        n_l, n_r = len(left_indices), len(right_indices)
        e_l, e_r = self._entropy(y[left_indices]), self._entropy(y[right_indices])
        # Weighted average entropy of the two children
        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r
        return parent_entropy - child_entropy

    def _split(self, X, split_threshold):
        left_indices = np.argwhere(X <= split_threshold).flatten()
        right_indices = np.argwhere(X > split_threshold).flatten()
        return left_indices, right_indices

    def _entropy(self, y):
        n = len(y)
        if n == 0:
            return 0
        # Count each distinct label (unlike np.bincount, this also
        # works for negative labels such as -1/+1)
        _, counts = np.unique(y, return_counts=True)
        probs = counts / n
        return -np.sum(probs * np.log2(probs))

    def _most_common_label(self, y):
        counter = Counter(y)
        return counter.most_common(1)[0][0]
```
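A minimal sketch of the same kind of sanity check for the decision tree, again on made-up toy data, follows. A single threshold at 2.5 separates the classes, so a depth-2 tree should classify the training points perfectly:
```python
import numpy as np

# Toy one-feature data: class 0 below 2.5, class 1 above
X_toy = np.array([[1.0], [1.5], [2.0], [3.0], [3.5], [4.0]])
y_toy = np.array([0, 0, 0, 1, 1, 1])

toy_dt = DecisionTree(max_depth=2)
toy_dt.fit(X_toy, y_toy)
print(toy_dt.predict(X_toy))  # expected: [0, 0, 0, 1, 1, 1]
```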
Next, we use these two classes to train models and make predictions:
```python
# Note: this SVM is a binary classifier (predict returns np.sign),
# so the labels in Y_train/Y_test are assumed to be in {-1, +1}
svm = SVM(kernel='linear')
svm.fit(X_train, Y_train)
Y_pred_svm = svm.predict(X_test)

dt = DecisionTree(max_depth=3)
dt.fit(X_train, Y_train)
Y_pred_dt = dt.predict(X_test)
```
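The snippet above assumes `X_train`, `Y_train`, `X_test`, and `Y_test` already exist. One way to produce them is sketched below, assuming the "Wine" data in the plot title refers to sklearn's `load_wine` (loading, splitting, and scaling data with sklearn does not use any sklearn model). Since this SVM is binary, the sketch keeps only two of the three wine classes and recodes the labels as ±1, which both classes above accept:
```python
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Keep only wine classes 0 and 1 so the binary SVM applies,
# and recode the labels to {-1, +1}
data = load_wine()
mask = data.target < 2
X = data.data[mask]
Y = np.where(data.target[mask] == 0, -1, 1)

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42)

# Feature scaling helps the QP solver behave numerically,
# since raw wine features differ by orders of magnitude
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
```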
Finally, we evaluate both models with sklearn's accuracy_score and visualize the results:
```python
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

svm_accuracy = accuracy_score(Y_test, Y_pred_svm)
dt_accuracy = accuracy_score(Y_test, Y_pred_dt)

plt.bar(['SVM', 'Decision Tree'], [svm_accuracy, dt_accuracy])
plt.ylim([0.85, 1.0])
plt.ylabel('Accuracy')
plt.title('Wine Classification Accuracy')
plt.show()
```
As we can see, the two hand-written models achieve accuracy comparable to the sklearn implementations, exceeding 90% on this task.