Decision Tree C4.5 Algorithm in C
The following is a C implementation of a C4.5-style decision tree (continuous features are split on a numeric threshold chosen by information gain).
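As a point of reference, the split criterion computed by `entropy()` and `information_gain()` below is Shannon entropy and the information gain of a threshold split:

$$
H(S) = -\sum_{k} p_k \log_2 p_k, \qquad
\mathrm{Gain}(S, A, t) = H(S) - \frac{|S_{A < t}|}{|S|}\,H(S_{A < t}) - \frac{|S_{A \ge t}|}{|S|}\,H(S_{A \ge t})
$$

where $p_k$ is the fraction of samples in $S$ belonging to class $k$, and $S_{A<t}$, $S_{A\ge t}$ are the subsets on either side of threshold $t$ for attribute $A$. Full C4.5 goes one step further and divides the gain by the split information to obtain the gain ratio; the listing below stops at plain information gain.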
```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

#define MAX_FEATURES 100
#define MAX_SAMPLES 1000

/* A fixed-size dataset: feature_types[i] == 0 marks feature i as continuous. */
typedef struct {
    int num_samples;
    int num_features;
    int feature_types[MAX_FEATURES];
    double samples[MAX_SAMPLES][MAX_FEATURES];
    int labels[MAX_SAMPLES];
} dataset;

/* A tree node: feature_index == -1 marks a leaf carrying a class label. */
typedef struct node {
    int feature_index;
    double threshold;
    struct node *left;
    struct node *right;
    int label;
} node;
/* Shannon entropy of an array of class labels. */
double entropy(int *labels, int num_samples) {
    if (num_samples == 0) {
        return 0.0;
    }
    int *class_values = (int *)calloc(num_samples, sizeof(int));
    int *class_counts = (int *)calloc(num_samples, sizeof(int));
    int num_classes = 0;
    for (int i = 0; i < num_samples; i++) {
        int found = 0;
        for (int j = 0; j < num_classes; j++) {
            if (labels[i] == class_values[j]) {
                class_counts[j]++;
                found = 1;
                break;
            }
        }
        if (!found) {
            class_values[num_classes] = labels[i];
            class_counts[num_classes++] = 1;
        }
    }
    double e = 0.0;
    for (int i = 0; i < num_classes; i++) {
        double p = (double)class_counts[i] / num_samples;
        e -= p * log2(p);
    }
    free(class_values);
    free(class_counts);
    return e;
}
/* Information gain of splitting the index subset on feature_index at threshold. */
double information_gain(dataset *data, int *indices, int num_indices, int feature_index, double threshold) {
    int *subset_labels = (int *)calloc(num_indices, sizeof(int));
    int *left_labels = (int *)calloc(num_indices, sizeof(int));
    int *right_labels = (int *)calloc(num_indices, sizeof(int));
    int left_count = 0;
    int right_count = 0;
    for (int i = 0; i < num_indices; i++) {
        int index = indices[i];
        subset_labels[i] = data->labels[index];
        if (data->samples[index][feature_index] < threshold) {
            left_labels[left_count++] = data->labels[index];
        } else {
            right_labels[right_count++] = data->labels[index];
        }
    }
    /* Parent entropy must be computed on the current subset, not the full dataset. */
    double e = entropy(subset_labels, num_indices);
    double left_e = entropy(left_labels, left_count);
    double right_e = entropy(right_labels, right_count);
    double gain = e - ((double)left_count / num_indices * left_e + (double)right_count / num_indices * right_e);
    free(subset_labels);
    free(left_labels);
    free(right_labels);
    return gain;
}
/* Return 1 if every label in the array is identical. */
int is_pure(int *labels, int num_samples) {
    int label = labels[0];
    for (int i = 1; i < num_samples; i++) {
        if (labels[i] != label) {
            return 0;
        }
    }
    return 1;
}
/* Return the most frequent label value in the array. */
int majority_vote(int *labels, int num_samples) {
    int *class_values = (int *)calloc(num_samples, sizeof(int));
    int *class_counts = (int *)calloc(num_samples, sizeof(int));
    int num_classes = 0;
    for (int i = 0; i < num_samples; i++) {
        int found = 0;
        for (int j = 0; j < num_classes; j++) {
            if (labels[i] == class_values[j]) {
                class_counts[j]++;
                found = 1;
                break;
            }
        }
        if (!found) {
            class_values[num_classes] = labels[i];
            class_counts[num_classes++] = 1;
        }
    }
    int max_count = 0;
    int max_index = 0;
    for (int i = 0; i < num_classes; i++) {
        if (class_counts[i] > max_count) {
            max_count = class_counts[i];
            max_index = i;
        }
    }
    int majority = class_values[max_index];
    free(class_values);
    free(class_counts);
    return majority;
}
/* Recursively build a decision tree over the samples referenced by indices. */
node *build_tree(dataset *data, int *indices, int num_indices) {
    if (num_indices == 0) {
        return NULL;
    }
    int num_features = data->num_features;
    int *labels = (int *)calloc(num_indices, sizeof(int));
    for (int i = 0; i < num_indices; i++) {
        labels[i] = data->labels[indices[i]];
    }
    /* All samples share one class: emit a leaf. */
    if (is_pure(labels, num_indices)) {
        node *leaf = (node *)malloc(sizeof(node));
        leaf->feature_index = -1;
        leaf->threshold = 0.0;
        leaf->left = NULL;
        leaf->right = NULL;
        leaf->label = labels[0];
        free(labels);
        return leaf;
    }
    if (num_features == 0) {
        node *leaf = (node *)malloc(sizeof(node));
        leaf->feature_index = -1;
        leaf->threshold = 0.0;
        leaf->left = NULL;
        leaf->right = NULL;
        leaf->label = majority_vote(labels, num_indices);
        free(labels);
        return leaf;
    }
    /* Search every feature (and, for continuous ones, 100 candidate thresholds)
       for the split with the highest information gain. */
    int best_feature_index = -1;
    double best_threshold = 0.0;
    double best_gain = 0.0;
    for (int i = 0; i < num_features; i++) {
        if (data->feature_types[i] == 0) {
            double min_value = INFINITY;
            double max_value = -INFINITY;
            for (int j = 0; j < num_indices; j++) {
                double value = data->samples[indices[j]][i];
                if (value < min_value) {
                    min_value = value;
                }
                if (value > max_value) {
                    max_value = value;
                }
            }
            double step = (max_value - min_value) / 100.0;
            if (step <= 0.0) {
                continue;   /* constant feature: nothing to split on */
            }
            for (double threshold = min_value; threshold <= max_value; threshold += step) {
                double gain = information_gain(data, indices, num_indices, i, threshold);
                if (gain > best_gain) {
                    best_feature_index = i;
                    best_threshold = threshold;
                    best_gain = gain;
                }
            }
        } else {
            /* Non-continuous features are compared against a fixed threshold of 0.0. */
            double gain = information_gain(data, indices, num_indices, i, 0.0);
            if (gain > best_gain) {
                best_feature_index = i;
                best_threshold = 0.0;
                best_gain = gain;
            }
        }
    }
    /* No split improves purity: fall back to a majority-vote leaf. */
    if (best_gain == 0.0) {
        node *leaf = (node *)malloc(sizeof(node));
        leaf->feature_index = -1;
        leaf->threshold = 0.0;
        leaf->left = NULL;
        leaf->right = NULL;
        leaf->label = majority_vote(labels, num_indices);
        free(labels);
        return leaf;
    }
    /* Partition the indices by the best split and recurse. */
    int *left_indices = (int *)calloc(num_indices, sizeof(int));
    int *right_indices = (int *)calloc(num_indices, sizeof(int));
    int left_count = 0;
    int right_count = 0;
    for (int i = 0; i < num_indices; i++) {
        int index = indices[i];
        if (data->samples[index][best_feature_index] < best_threshold) {
            left_indices[left_count++] = index;
        } else {
            right_indices[right_count++] = index;
        }
    }
    node *n = (node *)malloc(sizeof(node));
    n->feature_index = best_feature_index;
    n->threshold = best_threshold;
    n->left = build_tree(data, left_indices, left_count);
    n->right = build_tree(data, right_indices, right_count);
    n->label = -1;
    free(labels);
    free(left_indices);
    free(right_indices);
    return n;
}
/* Walk the tree for one sample and return the predicted class label. */
int predict(node *n, double *sample) {
    if (n->feature_index == -1) {
        return n->label;
    }
    if (sample[n->feature_index] < n->threshold) {
        return predict(n->left, sample);
    }
    return predict(n->right, sample);
}
int main(void) {
    /* The dataset struct is roughly 800 KB, so keep it off the stack. */
    static dataset data;
    data.num_samples = 14;
    data.num_features = 4;
    data.feature_types[0] = 0;   /* all four features are continuous */
    data.feature_types[1] = 0;
    data.feature_types[2] = 0;
    data.feature_types[3] = 0;
    memcpy(data.samples[0], (double[]){5.1, 3.5, 1.4, 0.2}, sizeof(double) * 4);
    memcpy(data.samples[1], (double[]){4.9, 3.0, 1.4, 0.2}, sizeof(double) * 4);
    memcpy(data.samples[2], (double[]){4.7, 3.2, 1.3, 0.2}, sizeof(double) * 4);
    memcpy(data.samples[3], (double[]){4.6, 3.1, 1.5, 0.2}, sizeof(double) * 4);
    memcpy(data.samples[4], (double[]){5.0, 3.6, 1.4, 0.2}, sizeof(double) * 4);
    memcpy(data.samples[5], (double[]){5.4, 3.9, 1.7, 0.4}, sizeof(double) * 4);
    memcpy(data.samples[6], (double[]){4.6, 3.4, 1.4, 0.3}, sizeof(double) * 4);
    memcpy(data.samples[7], (double[]){5.0, 3.4, 1.5, 0.2}, sizeof(double) * 4);
    memcpy(data.samples[8], (double[]){4.4, 2.9, 1.4, 0.2}, sizeof(double) * 4);
    memcpy(data.samples[9], (double[]){4.9, 3.1, 1.5, 0.1}, sizeof(double) * 4);
    memcpy(data.samples[10], (double[]){5.4, 3.7, 1.5, 0.2}, sizeof(double) * 4);
    memcpy(data.samples[11], (double[]){4.8, 3.4, 1.6, 0.2}, sizeof(double) * 4);
    memcpy(data.samples[12], (double[]){4.8, 3.0, 1.4, 0.1}, sizeof(double) * 4);
    memcpy(data.samples[13], (double[]){4.3, 3.0, 1.1, 0.1}, sizeof(double) * 4);
    /* Every sample carries class 0, so the resulting tree is a single leaf. */
    memcpy(data.labels, (int[]){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, sizeof(int) * 14);
    int indices[14];
    for (int i = 0; i < 14; i++) {
        indices[i] = i;
    }
    node *root = build_tree(&data, indices, 14);
    double sample[4] = {5.1, 3.5, 1.4, 0.2};
    int label = predict(root, sample);
    printf("Label: %d\n", label);
    return 0;
}
```
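The listing never releases the tree it builds. A minimal cleanup helper, written against the `node` struct above (`free_tree` is not part of the original code, just a sketch), might look like this:

```c
/* Recursively release a tree allocated by build_tree(). */
void free_tree(node *n) {
    if (n == NULL) {
        return;
    }
    free_tree(n->left);
    free_tree(n->right);
    free(n);
}
```

Calling `free_tree(root)` at the end of `main()` would then release all nodes. Since the code uses `log2()` from `<math.h>`, link the math library when compiling, for example `gcc c45.c -o c45 -lm` (the file name `c45.c` is assumed).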