用C++写一个GDBT
时间: 2023-12-27 08:43:02 浏览: 22
GBDT(Gradient Boosting Decision Tree,梯度提升决策树)是一种机器学习算法(下文代码中的类名沿用了原文的 GDBT 拼写)。下面是一个用C++实现的简单的GBDT:
```cpp
#include <iostream>
#include <vector>
#include <cmath>
using namespace std;
// One node of a regression tree.  Internal nodes route a sample left or
// right by comparing x[feature_index] against threshold; leaf nodes
// (marked by feature_index == -1) carry the prediction in `value`.
class TreeNode {
public:
    int feature_index;   // splitting feature, or -1 for a leaf
    double threshold;    // split point (unused on leaves)
    double value;        // leaf prediction: mean target of the leaf's samples
    TreeNode* left;      // subtree for x[feature_index] <  threshold
    TreeNode* right;     // subtree for x[feature_index] >= threshold
    TreeNode(int feature_index_, double threshold_, double value_)
        : feature_index(feature_index_), threshold(threshold_), value(value_),
          left(nullptr), right(nullptr) {}
};

// Gradient-boosted regression trees under squared-error loss (the usual
// abbreviation is GBDT; the class name keeps the original spelling).
// Each boosting round fits a CART-style regression tree to the current
// residuals y - y_pred and adds it to the ensemble scaled by learning_rate.
class GDBT {
public:
    int n_trees;                      // maximum number of boosting rounds
    int max_depth;                    // depth limit for each tree
    double learning_rate;             // shrinkage applied to every tree
    std::vector<TreeNode*> trees;     // fitted trees, owned by this object
    std::vector<double> weights;      // per-tree weight (always learning_rate)

    GDBT(int n_trees_, int max_depth_, double learning_rate_)
        : n_trees(n_trees_), max_depth(max_depth_), learning_rate(learning_rate_) {}

    // The ensemble owns raw TreeNode pointers: release them on destruction
    // (the original leaked every tree) and forbid copying, which would
    // otherwise double-free the nodes.
    ~GDBT() {
        for (TreeNode* t : trees) free_tree(t);
    }
    GDBT(const GDBT&) = delete;
    GDBT& operator=(const GDBT&) = delete;

    // Train the ensemble on X (n_samples x n_features) and targets y.
    // Fixes vs. the original:
    //  * each tree is fitted to the residuals, not to raw y, so successive
    //    trees actually correct the previous ones (true boosting);
    //  * the weight is pushed together with its tree, so trees and weights
    //    can never get out of sync (the original's early break left one
    //    more tree than weights, making predict() index past the end);
    //  * y_pred is updated by evaluating the fitted tree on each sample,
    //    replacing the original's sign-flipped single-value update.
    void fit(std::vector<std::vector<double>> X, std::vector<double> y) {
        int n_samples = X.size();
        if (n_samples == 0) return;   // nothing to learn from
        std::vector<double> y_pred(n_samples, 0.0);
        std::vector<double> residual = y;
        const double eps = 1e-8;
        for (int t = 0; t < n_trees; t++) {
            TreeNode* root = build_tree(X, residual, 0);
            trees.push_back(root);
            weights.push_back(learning_rate);
            double error = 0.0;
            for (int j = 0; j < n_samples; j++) {
                y_pred[j] += learning_rate * predict_tree(root, X[j]);
                residual[j] = y[j] - y_pred[j];
                error += residual[j] * residual[j];
            }
            if (error < eps) break;   // training targets are fully fitted
        }
    }

    // Predict one sample: weighted sum of the individual tree outputs.
    double predict(std::vector<double> x) {
        double y_pred = 0.0;
        for (std::size_t i = 0; i < trees.size(); i++) {
            y_pred += weights[i] * predict_tree(trees[i], x);
        }
        return y_pred;
    }

private:
    // Recursively grow a regression tree on (X, y).  The chosen split
    // maximizes the variance reduction
    //     var(y) - (nL*var(yL) + nR*var(yR)) / n
    // (the original computed children-minus-parent, so the "gain" was
    // never positive and every tree collapsed to a single leaf).
    TreeNode* build_tree(std::vector<std::vector<double>>& X,
                         std::vector<double>& y, int depth) {
        int n_samples = X.size();
        if (n_samples == 0) return new TreeNode(-1, -1.0, 0.0);
        int n_features = X[0].size();
        const double eps = 1e-8;
        double parent_var = calculate_variance(y);
        double best_gain = 0.0;
        int best_feature_index = -1;
        double best_threshold = 0.0;
        for (int i = 0; i < n_features; i++) {
            for (int j = 0; j < n_samples; j++) {
                double thr = X[j][i];
                std::vector<double> y_left, y_right;
                for (int k = 0; k < n_samples; k++) {
                    (X[k][i] < thr ? y_left : y_right).push_back(y[k]);
                }
                if (y_left.empty() || y_right.empty()) continue;   // degenerate split
                double gain = parent_var
                    - calculate_variance(y_left) * y_left.size() / n_samples
                    - calculate_variance(y_right) * y_right.size() / n_samples;
                if (gain > best_gain + eps) {
                    best_gain = gain;
                    best_feature_index = i;
                    best_threshold = thr;
                }
            }
        }
        // No useful split, or the depth limit was reached: emit a leaf
        // predicting the mean target of the samples reaching this node.
        if (best_feature_index == -1 || depth >= max_depth) {
            return new TreeNode(-1, -1.0, calculate_mean(y));
        }
        std::vector<std::vector<double>> X_left, X_right;
        std::vector<double> y_left, y_right;
        for (int i = 0; i < n_samples; i++) {
            if (X[i][best_feature_index] < best_threshold) {
                X_left.push_back(X[i]);
                y_left.push_back(y[i]);
            } else {
                X_right.push_back(X[i]);
                y_right.push_back(y[i]);
            }
        }
        TreeNode* node = new TreeNode(best_feature_index, best_threshold, 0.0);
        node->left = build_tree(X_left, y_left, depth + 1);
        node->right = build_tree(X_right, y_right, depth + 1);
        return node;
    }

    // Walk x down to its leaf and return the stored leaf value.
    double predict_tree(TreeNode* root, const std::vector<double>& x) {
        if (root->feature_index == -1) {
            return root->value;
        }
        return x[root->feature_index] < root->threshold
                   ? predict_tree(root->left, x)
                   : predict_tree(root->right, x);
    }

    // Post-order deletion of a whole subtree.
    static void free_tree(TreeNode* node) {
        if (node == nullptr) return;
        free_tree(node->left);
        free_tree(node->right);
        delete node;
    }

    // Arithmetic mean; defined as 0 for an empty vector (the original
    // divided by zero).
    static double calculate_mean(const std::vector<double>& v) {
        if (v.empty()) return 0.0;
        double sum = 0.0;
        for (double x : v) sum += x;
        return sum / v.size();
    }

    // Population variance; defined as 0 for an empty vector.
    static double calculate_variance(const std::vector<double>& v) {
        if (v.empty()) return 0.0;
        double mean = calculate_mean(v);
        double var = 0.0;
        for (double x : v) {
            double d = x - mean;
            var += d * d;
        }
        return var / v.size();
    }
};
int main() {
vector<vector<double>> X = {{1.0, 2.0}, {2.0, 1.0}, {3.0, 4.0}, {4.0, 3.0}};
vector<double> y = {1.0, 1.0, -1.0, -1.0};
GDBT model(10, 2, 0.1);
model.fit(X, y);
cout << model.predict({1.5, 2.5}) << endl;
return 0;
}
```
该实现使用了类似于CART的决策树框架:每个内部节点基于某个特征的阈值将数据分为左右两个子集,并在每个节点处选择最优的特征和阈值进行分裂。各棵树的预测结果按固定的学习速率加权求和(而非加权平均,权重也与树的深度无关)得到最终输出;较小的学习速率起到收缩(shrinkage)作用,有助于提高模型的鲁棒性和泛化能力。
相关推荐
![zip](https://img-home.csdnimg.cn/images/20210720083736.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![pdf](https://img-home.csdnimg.cn/images/20210720083512.png)
![zip](https://img-home.csdnimg.cn/images/20210720083736.png)
![ipynb](https://img-home.csdnimg.cn/images/20210720083646.png)
![pdf](https://img-home.csdnimg.cn/images/20210720083512.png)
![rar](https://img-home.csdnimg.cn/images/20210720083606.png)
![zip](https://img-home.csdnimg.cn/images/20210720083736.png)