Sample Classification in C++
Sample classification is the process of dividing a set of data into two or more categories. In C++, you can implement it either with a machine-learning library or by writing the algorithm yourself.
Below is a simple C++ example that classifies samples with a decision-tree algorithm:
```
#include <algorithm>  // std::sort
#include <cmath>      // std::log2
#include <iostream>
#include <map>
#include <vector>
using namespace std;

// One training example: a feature vector plus an integer class label.
class Sample {
public:
    vector<double> features;
    int label;
};

class DecisionTree {
public:
    struct TreeNode {
        int feature_index = -1;        // feature used to split (internal nodes)
        double threshold = 0.0;        // split threshold (internal nodes)
        int label = -1;                // predicted class (leaf nodes)
        TreeNode* left_child = nullptr;
        TreeNode* right_child = nullptr;
    };
    DecisionTree() {
        root = nullptr;
    }
    ~DecisionTree() {
        free_tree(root);  // release all nodes allocated during training
    }
    void train(vector<Sample>& samples) {
        root = build_tree(samples);
    }
    int predict(const vector<double>& features) const {
        TreeNode* node = root;
        if (node == nullptr) {
            return -1;  // tree has not been trained
        }
        // Walk down the tree until we reach a leaf (a node with no children).
        while (node->left_child != nullptr && node->right_child != nullptr) {
            if (features[node->feature_index] < node->threshold) {
                node = node->left_child;
            } else {
                node = node->right_child;
            }
        }
        return node->label;
    }
private:
    TreeNode* build_tree(vector<Sample>& samples) {
        if (samples.empty()) {
            return nullptr;
        }
        int num_features = samples[0].features.size();
        int num_samples = samples.size();
        // If every sample has the same label, return a leaf with that label.
        bool same_label = true;
        int label = samples[0].label;
        for (int i = 1; i < num_samples; i++) {
            if (samples[i].label != label) {
                same_label = false;
                break;
            }
        }
        if (same_label) {
            TreeNode* leaf = new TreeNode;
            leaf->label = label;
            return leaf;
        }
        // Choose the feature/threshold pair with the highest information gain.
        double best_gain = 0.0;
        int best_feature_index = -1;
        double best_threshold = 0.0;
        for (int i = 0; i < num_features; i++) {
            vector<double> feature_values;
            for (int j = 0; j < num_samples; j++) {
                feature_values.push_back(samples[j].features[i]);
            }
            sort(feature_values.begin(), feature_values.end());
            // Candidate thresholds are midpoints between consecutive sorted values.
            for (int j = 0; j < num_samples - 1; j++) {
                double threshold = (feature_values[j] + feature_values[j + 1]) / 2.0;
                double gain = compute_gain(i, threshold, samples);
                if (gain > best_gain) {
                    best_gain = gain;
                    best_feature_index = i;
                    best_threshold = threshold;
                }
            }
        }
        // No split improves purity (e.g. identical feature vectors with
        // different labels): return a majority-label leaf rather than
        // recursing forever.
        if (best_feature_index == -1) {
            map<int, int> counts;
            for (int i = 0; i < num_samples; i++) {
                counts[samples[i].label]++;
            }
            TreeNode* leaf = new TreeNode;
            int best_count = 0;
            for (auto& p : counts) {
                if (p.second > best_count) {
                    best_count = p.second;
                    leaf->label = p.first;
                }
            }
            return leaf;
        }
        // Split the samples on the best feature and threshold.
        vector<Sample> left_samples;
        vector<Sample> right_samples;
        for (int i = 0; i < num_samples; i++) {
            if (samples[i].features[best_feature_index] < best_threshold) {
                left_samples.push_back(samples[i]);
            } else {
                right_samples.push_back(samples[i]);
            }
        }
        // Recursively build the left and right subtrees.
        TreeNode* node = new TreeNode;
        node->feature_index = best_feature_index;
        node->threshold = best_threshold;
        node->left_child = build_tree(left_samples);
        node->right_child = build_tree(right_samples);
        return node;
    }
    // Information gain of splitting `samples` on `feature_index` at `threshold`.
    double compute_gain(int feature_index, double threshold, vector<Sample>& samples) {
        int num_samples = samples.size();
        int num_left = 0;
        int num_right = 0;
        map<int, int> left_label_counts;
        map<int, int> right_label_counts;
        for (int i = 0; i < num_samples; i++) {
            if (samples[i].features[feature_index] < threshold) {
                num_left++;
                left_label_counts[samples[i].label]++;
            } else {
                num_right++;
                right_label_counts[samples[i].label]++;
            }
        }
        double entropy_left = compute_entropy(left_label_counts, num_left);
        double entropy_right = compute_entropy(right_label_counts, num_right);
        double entropy_combined = entropy_left * num_left / num_samples
                                + entropy_right * num_right / num_samples;
        return compute_entropy(samples) - entropy_combined;
    }
    // Shannon entropy of a label distribution, in bits.
    double compute_entropy(map<int, int>& label_counts, int num_samples) {
        double entropy = 0.0;
        for (auto& p : label_counts) {
            double prob = static_cast<double>(p.second) / num_samples;
            entropy -= prob * log2(prob);
        }
        return entropy;
    }
    double compute_entropy(vector<Sample>& samples) {
        int num_samples = samples.size();
        map<int, int> label_counts;
        for (int i = 0; i < num_samples; i++) {
            label_counts[samples[i].label]++;
        }
        return compute_entropy(label_counts, num_samples);
    }
    // Post-order traversal that deletes every node in the subtree.
    void free_tree(TreeNode* node) {
        if (node == nullptr) {
            return;
        }
        free_tree(node->left_child);
        free_tree(node->right_child);
        delete node;
    }
    TreeNode* root;
};

int main() {
    // Create some sample data.
    vector<Sample> samples;
    Sample s;
    s.features = {1.0, 2.0};
    s.label = 0;
    samples.push_back(s);
    s.features = {2.0, 1.0};
    s.label = 1;
    samples.push_back(s);
    s.features = {3.0, 4.0};
    s.label = 0;
    samples.push_back(s);
    s.features = {4.0, 3.0};
    s.label = 1;
    samples.push_back(s);
    // Train a decision tree.
    DecisionTree dt;
    dt.train(samples);
    // Make some predictions.
    vector<double> features = {1.5, 1.5};
    int label = dt.predict(features);
    cout << "Predicted label: " << label << endl;
    features = {3.5, 3.5};
    label = dt.predict(features);
    cout << "Predicted label: " << label << endl;
    return 0;
}
```
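On this toy dataset the learned tree first splits on feature 0 at threshold 1.5, and its right branch then splits on feature 1 at threshold 3.5 (you can verify this by tracing `build_tree` by hand), so the program should print:
```
Predicted label: 1
Predicted label: 0
```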
In this example we classify samples with a simple decision-tree algorithm. We first define a `Sample` class to represent each sample, holding a feature vector and a label, and a `DecisionTree` class that implements the algorithm. The `train` method passes the training data to `build_tree` to construct the tree, and the `predict` method walks the tree to assign a label to a new sample.
The core of the algorithm is the `build_tree` method, which constructs the tree recursively. At each node it selects the best feature and threshold to split the samples on, then recursively builds the left and right subtrees. "Best" is measured by information gain: the larger the gain, the better the split.
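In symbols, if a split at threshold t on feature f divides the sample set S into S_L and S_R, the gain is computed from Shannon entropy:
```
H(S) = -\sum_k p_k \log_2 p_k
\mathrm{Gain}(S, f, t) = H(S) - \frac{|S_L|}{|S|} H(S_L) - \frac{|S_R|}{|S|} H(S_R)
```
As a worked example on the four training samples above: the root labels are {0, 1, 0, 1}, so H(S) = 1 bit. Splitting on feature 0 at t = 1.5 sends one sample (label 0) left and three (labels {1, 0, 1}) right, giving H(S_L) = 0 and H(S_R) ≈ 0.918, hence a gain of 1 − 0.75 × 0.918 ≈ 0.311.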
The above is a simple C++ sample-classification example; real applications may call for more sophisticated algorithms and data structures to handle larger-scale data.
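If you would rather not hand-roll the algorithm, the same task can be done with an off-the-shelf library such as LIBSVM. The following is a minimal sketch of training a C-SVC classifier on the same four samples with LIBSVM's C API; it assumes you compile and link against `svm.h`/`svm.cpp` from the LIBSVM distribution, and the parameter values (`gamma`, `C`) are illustrative choices, not tuned ones:
```
#include <iostream>
#include "svm.h"  // from the LIBSVM distribution

int main() {
    // LIBSVM represents each sample as a sparse array of (index, value)
    // pairs with 1-based indices, terminated by index = -1.
    svm_node x[4][3] = {
        {{1, 1.0}, {2, 2.0}, {-1, 0.0}},
        {{1, 2.0}, {2, 1.0}, {-1, 0.0}},
        {{1, 3.0}, {2, 4.0}, {-1, 0.0}},
        {{1, 4.0}, {2, 3.0}, {-1, 0.0}},
    };
    double y[4] = {0, 1, 0, 1};  // class labels
    svm_node* rows[4] = {x[0], x[1], x[2], x[3]};

    svm_problem prob;
    prob.l = 4;  // number of training samples
    prob.y = y;
    prob.x = rows;

    svm_parameter param = {};    // zero-init, then set the fields we need
    param.svm_type = C_SVC;
    param.kernel_type = RBF;
    param.gamma = 0.5;           // illustrative: 1 / num_features
    param.C = 1.0;               // illustrative regularization constant
    param.cache_size = 100;      // kernel cache size, in MB
    param.eps = 1e-3;            // stopping tolerance

    if (const char* err = svm_check_parameter(&prob, &param)) {
        std::cerr << "LIBSVM parameter error: " << err << std::endl;
        return 1;
    }
    svm_model* model = svm_train(&prob, &param);

    // Classify a new point.
    svm_node query[3] = {{1, 1.5}, {2, 1.5}, {-1, 0.0}};
    std::cout << "Predicted label: " << svm_predict(model, query) << std::endl;

    svm_free_and_destroy_model(&model);
    svm_destroy_param(&param);
    return 0;
}
```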