西瓜数据集决策树c++
时间: 2023-06-23 17:08:57 浏览: 198
基于西瓜数据集的决策树实现.zip
以下是基于西瓜数据集、使用 C++ 实现的决策树算法示例:
```c++
#include <algorithm>
#include <cmath>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>
using namespace std;
// One row of the watermelon dataset: a numeric id, two real-valued
// features, and a class label.
// NOTE(review): in the classic watermelon 3.0a dataset feature1/feature2
// are density and sugar content — TODO confirm against watermelon.csv.
struct Record {
int id;           // row identifier read from the CSV
double feature1;  // first numeric attribute (the only one the tree splits on)
double feature2;  // second numeric attribute
string label;     // class label, e.g. "是" / "否"
};
// A decision-tree node. Internal nodes hold a (feature_index, threshold)
// split; leaves are recognized by left == right == nullptr and carry the
// predicted label.
// FIX: all members now have default initializers. Previously leaf nodes
// were returned with indeterminate left/right pointers, and predict()'s
// leaf test (left == NULL && right == NULL) read uninitialized memory —
// undefined behavior.
struct Node {
    int id = 0;                 // id of the first record that reached this node
    vector<Record> data;        // records routed to this node (unused after build)
    int feature_index = -1;     // -1 marks "no split chosen" (leaf)
    double threshold = 0.0;     // split point: value < threshold goes left
    string label;               // predicted label when this node is a leaf
    Node* left = nullptr;       // subtree for feature value < threshold
    Node* right = nullptr;      // subtree for feature value >= threshold
};
// Reads the dataset from a CSV file with a header row and lines of the
// form "id,feature1,feature2,label". Malformed rows are skipped.
// FIX: the original used
//   sscanf(line.c_str(), "%d,%lf,%lf,%s", ..., &record.label[0])
// which writes through the buffer of an *empty* std::string — undefined
// behavior (no storage, no length update). Fields are now split with
// getline on ',' and converted with stoi/stod.
vector<Record> read_data(string filename) {
    ifstream file(filename);
    vector<Record> data;
    string line;
    getline(file, line); // skip the header row
    while (getline(file, line)) {
        if (line.empty()) {
            continue;
        }
        istringstream fields(line);
        string id_s, f1_s, f2_s, label_s;
        if (!getline(fields, id_s, ',')) continue;
        if (!getline(fields, f1_s, ',')) continue;
        if (!getline(fields, f2_s, ',')) continue;
        getline(fields, label_s); // label is the remainder of the line
        try {
            Record record;
            record.id = stoi(id_s);
            record.feature1 = stod(f1_s);
            record.feature2 = stod(f2_s);
            record.label = label_s;
            data.push_back(record);
        } catch (const exception&) {
            // stoi/stod throw on non-numeric fields: skip the bad row
            // instead of storing garbage.
        }
    }
    return data;
}
// 计算数据集的信息熵
double entropy(vector<Record> data) {
int n = data.size();
if (n == 0) {
return 0.0;
}
int count1 = 0, count2 = 0;
for (int i = 0; i < n; i++) {
if (data[i].label == "是") {
count1++;
} else {
count2++;
}
}
double p1 = (double) count1 / n;
double p2 = (double) count2 / n;
if (p1 == 0.0 || p2 == 0.0) {
return 0.0;
} else {
return -p1 * log2(p1) - p2 * log2(p2);
}
}
// 计算数据集在某个特征上的加权信息熵
double weighted_entropy(vector<Record> data, int feature_index, double threshold, vector<Record>& left_data, vector<Record>& right_data) {
int n = data.size();
int left_count = 0, right_count = 0;
for (int i = 0; i < n; i++) {
if (data[i].feature1 < threshold) {
left_count++;
left_data.push_back(data[i]);
} else {
right_count++;
right_data.push_back(data[i]);
}
}
double p1 = (double) left_count / n;
double p2 = (double) right_count / n;
return p1 * entropy(left_data) + p2 * entropy(right_data);
}
// 选择最优的特征和阈值,返回信息增益和阈值
double choose_best_feature(vector<Record> data, int& feature_index, double& threshold) {
int n = data.size();
double info_gain = 0.0;
for (int i = 0; i < n; i++) {
double t = data[i].feature1;
vector<Record> left_data, right_data;
double w_ent = weighted_entropy(data, 0, t, left_data, right_data);
double ig = entropy(data) - w_ent;
if (ig > info_gain) {
info_gain = ig;
feature_index = 0;
threshold = t;
}
}
return info_gain;
}
// Recursively builds a decision tree over `data` using information gain.
// Returns nullptr for an empty dataset. A node is a leaf (both children
// nullptr) when all records share one label or no split yields gain.
// FIX: every Node member is now explicitly initialized. The original
// returned leaf nodes with indeterminate left/right pointers, which
// predict()'s leaf test then read — undefined behavior.
// NOTE(review): leaf label is the first record's label (original
// behavior kept); a majority vote would be the textbook choice for
// mixed-label leaves. Nodes are allocated with `new` and never freed;
// the caller owns the tree.
Node* create_decision_tree(const vector<Record>& data) {
    if (data.empty()) {
        return nullptr;
    }
    Node* node = new Node;
    node->id = data[0].id;
    node->label = data[0].label;
    node->feature_index = -1;
    node->threshold = 0.0;
    node->left = nullptr;
    node->right = nullptr;
    // Pure leaf: every record carries the same label.
    bool same_label = true;
    for (size_t i = 1; i < data.size(); i++) {
        if (data[i].label != data[0].label) {
            same_label = false;
            break;
        }
    }
    if (same_label) {
        return node;
    }
    int feature_index = 0;
    double threshold = 0.0;
    const double info_gain = choose_best_feature(data, feature_index, threshold);
    if (info_gain == 0.0) {
        return node; // no useful split found: keep this as a leaf
    }
    // A positive gain guarantees both partitions are non-empty, so the
    // recursion terminates.
    node->feature_index = feature_index;
    node->threshold = threshold;
    vector<Record> left_data, right_data;
    weighted_entropy(data, feature_index, threshold, left_data, right_data);
    node->left = create_decision_tree(left_data);
    node->right = create_decision_tree(right_data);
    return node;
}
// Walks the tree to classify `record`; returns the label of the leaf
// reached, or "" for a null (sub)tree.
// FIX: the original dereferenced `node` unconditionally and crashed when
// create_decision_tree returned NULL for an empty dataset.
// FIX: compares the feature selected by node->feature_index instead of
// hard-coding feature1 — identical behavior for trees built by this
// file, which only ever split on index 0.
string predict(Node* node, const Record& record) {
    if (node == nullptr) {
        return "";
    }
    if (node->left == nullptr && node->right == nullptr) {
        return node->label; // leaf
    }
    const double value = (node->feature_index == 0) ? record.feature1 : record.feature2;
    Node* next = (value < node->threshold) ? node->left : node->right;
    return predict(next, record);
}
int main() {
vector<Record> data = read_data("watermelon.csv");
Node* root = create_decision_tree(data);
Record record;
record.id = 18;
record.feature1 = 0.697;
record.feature2 = 0.460;
cout << predict(root, record) << endl;
return 0;
}
```
上述代码中使用了信息熵和信息增益来选择最优的特征和阈值,采用递归方式创建决策树,也实现了预测数据的类别。
阅读全文