class KnnRegressorCV: def __init__(self, ks=list(range(1, 21)), cv=LFold(5)): self.ks = ks self.cv = cv # YOUR CODE HERE def fit(self, x, y): df = pd.DataFrame(columns=["KValue", "LFoldID", "MSE_train", "MSE_test"]) count = 0 for k in self.ks: for index, each in enumerate(self.cv.split(x, y)): mse_train, mse_test = runknn(k, each[0], each[1], each[2].reshape(-1), each[3].reshape(-1)) df.at[count, "KValue"] = k df.at[count, "LFoldID"] = index df.at[count, "MSE_train"] = mse_train df.at[count, "MSE_test"] = mse_test count += 1 self.bestK = find_best_k(df, self.ks) self.y_train_ = y self.x_train_kdtree_ = KDTree(x) print("bestK internal:", self.bestK) return self # YOUR CODE HERE def predict(self, x): _, neighbours = self.x_train_kdtree_.query(x, k=self.bestK) neighbours = neighbours.reshape(len(x), self.k) neighbour_labels = self.y_train_[neighbours] pred = np.mean(neighbour_labels, axis=1) return pred给这段代码加注释
时间: 2023-11-26 18:03:19 浏览: 77
Python RuntimeError: thread.__init__() not called解决方法
```
# 导入需要的库
import numpy as np
import pandas as pd
from sklearn.neighbors import KDTree
# 定义 KNN 回归交叉验证类
class KnnRegressorCV:
    """k-nearest-neighbour regressor that selects k by cross-validation.

    ``fit`` evaluates every candidate k on every split produced by ``cv``,
    stores the value returned by ``find_best_k`` in ``bestK`` and indexes the
    training inputs in a KDTree. ``predict`` averages the training targets of
    the ``bestK`` nearest training points.
    """

    def __init__(self, ks=None, cv=None):
        """Store the candidate k values and the cross-validation splitter.

        Args:
            ks: Candidate neighbour counts. Defaults to ``list(range(1, 21))``.
            cv: Object whose ``split(x, y)`` yields one indexable item per
                fold. Defaults to ``LFold(5)``.
        """
        # Defaults are created per instance: a list/splitter placed directly
        # in the signature would be built once and shared by every instance.
        self.ks = list(range(1, 21)) if ks is None else ks
        self.cv = LFold(5) if cv is None else cv

    def fit(self, x, y):
        """Pick the best k via cross-validation and index the training data.

        Args:
            x: Training inputs; passed to ``cv.split`` and to ``KDTree``.
            y: Training targets; passed to ``cv.split`` and later indexed
                with neighbour indices in ``predict``.

        Returns:
            self, so calls can be chained.
        """
        records = []
        for k in self.ks:
            for fold_id, fold in enumerate(self.cv.split(x, y)):
                # NOTE(review): each fold is presumably
                # (x_train, x_test, y_train, y_test) -- confirm against
                # LFold.split. Items 2 and 3 are flattened to 1-D before
                # being handed to runknn.
                mse_train, mse_test = runknn(
                    k,
                    fold[0],
                    fold[1],
                    fold[2].reshape(-1),
                    fold[3].reshape(-1),
                )
                records.append(
                    {
                        "KValue": k,
                        "LFoldID": fold_id,
                        "MSE_train": mse_train,
                        "MSE_test": mse_test,
                    }
                )

        # Build the frame once instead of enlarging it cell by cell; this
        # avoids repeated reallocation and yields numeric column dtypes.
        df = pd.DataFrame(
            records, columns=["KValue", "LFoldID", "MSE_train", "MSE_test"]
        )

        # find_best_k is defined elsewhere; presumably it returns the k with
        # the lowest cross-validated test MSE -- verify against its definition.
        self.bestK = find_best_k(df, self.ks)
        self.y_train_ = y
        self.x_train_kdtree_ = KDTree(x)
        print("bestK internal:", self.bestK)
        return self

    def predict(self, x):
        """Predict targets as the mean over the ``bestK`` nearest neighbours.

        Args:
            x: Query inputs, one row per sample.

        Returns:
            Array with one prediction per row of ``x``.
        """
        _, neighbours = self.x_train_kdtree_.query(x, k=self.bestK)
        # Force a (n_queries, bestK) index matrix. The original reshaped with
        # ``self.k``, an attribute that is never set, which raised
        # AttributeError on every call.
        neighbours = neighbours.reshape(len(x), self.bestK)
        neighbour_labels = self.y_train_[neighbours]
        return np.mean(neighbour_labels, axis=1)
```
阅读全文