朴素贝叶斯算法c实现
时间: 2023-09-22 13:07:54 浏览: 107
以下是朴素贝叶斯算法的 C 语言实现代码:
```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
/* Capacity limits for the fixed-size arrays below. */
#define MAX_FEATURES 100
#define MAX_SAMPLES 100
/* One labeled observation: a class name (at most 19 chars + NUL) plus
 * integer feature values. The classifier only distinguishes zero vs
 * non-zero feature values (Bernoulli treatment — see predict()). */
typedef struct {
char label[20];
int features[MAX_FEATURES];
} Sample;
/* A fixed-capacity collection of samples that all share the same
 * number of features (enforced by read_csv()). */
typedef struct {
int num_samples;
int num_features;
Sample samples[MAX_SAMPLES];
} Dataset;
/*
 * Load samples from a CSV file into `dataset`.
 * Expected row format: label,f1,f2,...,fn (one sample per line).
 * All rows must have the same feature count; the first row fixes it.
 * Exits the program on I/O errors, capacity overflow, or ragged rows.
 */
void read_csv(const char* filename, Dataset* dataset) {
    FILE* fp = fopen(filename, "r");
    if (fp == NULL) {
        fprintf(stderr, "Error: Failed to open file %s\n", filename);
        exit(EXIT_FAILURE);
    }
    int num_features = 0;
    char line[1024];
    while (fgets(line, sizeof line, fp)) {
        /* Bounds check: the original wrote past samples[] on big files. */
        if (dataset->num_samples >= MAX_SAMPLES) {
            fprintf(stderr, "Error: Too many samples (max %d)\n", MAX_SAMPLES);
            exit(EXIT_FAILURE);
        }
        Sample sample;
        int i = 0;
        /* "\r\n" in the delimiter set so line endings never end up
         * inside a token. */
        char* token = strtok(line, ",\r\n");
        if (token == NULL) {
            continue; /* skip blank lines */
        }
        /* Bounded copy: the original strcpy could overflow label[20]. */
        snprintf(sample.label, sizeof sample.label, "%s", token);
        while ((token = strtok(NULL, ",\r\n")) != NULL) {
            if (i >= MAX_FEATURES) {
                fprintf(stderr, "Error: Too many features (max %d)\n", MAX_FEATURES);
                exit(EXIT_FAILURE);
            }
            sample.features[i++] = atoi(token);
        }
        if (num_features == 0) {
            num_features = i;
        } else if (num_features != i) {
            fprintf(stderr, "Error: All samples should have the same number of features\n");
            exit(EXIT_FAILURE);
        }
        dataset->samples[dataset->num_samples++] = sample;
    }
    dataset->num_features = num_features;
    fclose(fp);
}
/*
 * Fit a naive Bayes model from `dataset`.
 *
 * Outputs:
 *   prior[c]         - P(class c), for each distinct label c.
 *   likelihood[c][f] - Laplace-smoothed P(feature f | class c),
 *                      computed as (sum of feature f over class c + 1)
 *                      divided by (class count + 2). This is only a
 *                      valid probability if features are 0/1
 *                      (Bernoulli); values > 1 make predict()'s
 *                      log(1 - p) produce NaN. Assumes binary
 *                      features — TODO confirm with the data format.
 *
 * Classes are indexed by order of FIRST APPEARANCE in `dataset`.
 * NOTE(review): predict() rediscovers labels from its own dataset; the
 * prior/likelihood rows only line up with predict()'s label indices if
 * both datasets introduce labels in the same order. Sharing one label
 * table between train() and predict() would remove this hazard.
 */
void train(Dataset* dataset, double* prior, double** likelihood) {
    int i, j, k;
    int num_samples = dataset->num_samples;
    int num_features = dataset->num_features;
    int num_labels = 0;
    char labels[MAX_SAMPLES][20];
    /* Collect the distinct labels, preserving first-seen order. */
    for (i = 0; i < num_samples; ++i) {
        int found = 0;
        for (j = 0; j < num_labels; ++j) {
            if (strcmp(dataset->samples[i].label, labels[j]) == 0) {
                found = 1;
                break;
            }
        }
        if (!found) {
            strcpy(labels[num_labels++], dataset->samples[i].label);
        }
    }
    for (i = 0; i < num_labels; ++i) {
        char* label = labels[i];
        int count = 0;
        for (j = 0; j < num_samples; ++j) {
            if (strcmp(dataset->samples[j].label, label) == 0) {
                ++count;
            }
        }
        prior[i] = (double)count / num_samples;
        int* feature_counts = calloc(num_features, sizeof *feature_counts);
        /* Unchecked calloc in the original: NULL here was a crash. */
        if (feature_counts == NULL) {
            fprintf(stderr, "Error: Out of memory\n");
            exit(EXIT_FAILURE);
        }
        for (j = 0; j < num_samples; ++j) {
            if (strcmp(dataset->samples[j].label, label) == 0) {
                for (k = 0; k < num_features; ++k) {
                    feature_counts[k] += dataset->samples[j].features[k];
                }
            }
        }
        /* Laplace (add-one) smoothing so unseen features never get
         * probability 0 (which would make log() blow up). */
        for (j = 0; j < num_features; ++j) {
            likelihood[i][j] = ((double)feature_counts[j] + 1) / (count + 2);
        }
        free(feature_counts);
    }
}
/*
 * Predict a label for each sample in `dataset` using the trained model.
 *
 * predicted_labels[i] must be a caller-owned buffer of at least 20
 * bytes; the chosen label is COPIED into it. (The original assigned
 * pointers into the local `labels` array instead, leaving the caller
 * with dangling stack pointers after return, leaking the buffers main()
 * allocated, and making main()'s free() of them undefined behavior.)
 *
 * Scoring: log P(c) + sum over features of log P(f|c) for non-zero
 * features and log(1 - P(f|c)) for zero features (Bernoulli model).
 *
 * NOTE(review): labels are rediscovered from THIS dataset in first-seen
 * order; they only match train()'s prior/likelihood row order if both
 * datasets introduce labels identically — see the note on train().
 */
void predict(Dataset* dataset, double* prior, double** likelihood, char** predicted_labels) {
    int i, j, k;
    int num_samples = dataset->num_samples;
    int num_features = dataset->num_features;
    int num_labels = 0;
    char labels[MAX_SAMPLES][20];
    /* Collect the distinct labels, preserving first-seen order. */
    for (i = 0; i < num_samples; ++i) {
        int found = 0;
        for (j = 0; j < num_labels; ++j) {
            if (strcmp(dataset->samples[i].label, labels[j]) == 0) {
                found = 1;
                break;
            }
        }
        if (!found) {
            strcpy(labels[num_labels++], dataset->samples[i].label);
        }
    }
    for (i = 0; i < num_samples; ++i) {
        double max_prob = 0.0;
        char* predicted_label = NULL;
        for (j = 0; j < num_labels; ++j) {
            char* label = labels[j];
            /* Work in log space to avoid underflow from products of
             * small probabilities. */
            double prob = log(prior[j]);
            for (k = 0; k < num_features; ++k) {
                if (dataset->samples[i].features[k] != 0) {
                    prob += log(likelihood[j][k]);
                } else {
                    prob += log(1 - likelihood[j][k]);
                }
            }
            /* The NULL check seeds max_prob from the first class. */
            if (predicted_label == NULL || prob > max_prob) {
                predicted_label = label;
                max_prob = prob;
            }
        }
        /* Copy into the caller's buffer — do NOT hand out a pointer
         * into the local `labels` array. */
        if (predicted_label != NULL) {
            strcpy(predicted_labels[i], predicted_label);
        }
    }
}
/*
 * Compare predicted labels against the true labels in `dataset` and
 * print the accuracy as a percentage.
 */
void evaluate(Dataset* dataset, char** predicted_labels) {
    int i, num_samples = dataset->num_samples;
    int num_correct = 0;
    /* Guard the empty case: the original printed NaN for 0 samples. */
    if (num_samples == 0) {
        printf("Accuracy: N/A (no samples)\n");
        return;
    }
    for (i = 0; i < num_samples; ++i) {
        if (strcmp(dataset->samples[i].label, predicted_labels[i]) == 0) {
            ++num_correct;
        }
    }
    double accuracy = (double)num_correct / num_samples;
    printf("Accuracy: %.2f%%\n", accuracy * 100);
}
/*
 * Entry point: train on <train.csv>, then classify <test.csv> and
 * report accuracy.
 */
int main(int argc, char** argv) {
    if (argc != 3) {
        fprintf(stderr, "Usage: %s <train.csv> <test.csv>\n", argv[0]);
        return EXIT_FAILURE;
    }
    Dataset train_dataset = {0};
    read_csv(argv[1], &train_dataset);
    int i;
    double prior[MAX_SAMPLES];
    double* likelihood[MAX_SAMPLES];
    for (i = 0; i < MAX_SAMPLES; ++i) {
        likelihood[i] = calloc(MAX_FEATURES, sizeof(double));
        /* Unchecked calloc in the original: NULL here was a crash. */
        if (likelihood[i] == NULL) {
            fprintf(stderr, "Error: Out of memory\n");
            return EXIT_FAILURE;
        }
    }
    train(&train_dataset, prior, likelihood);
    Dataset test_dataset = {0};
    read_csv(argv[2], &test_dataset);
    /* Caller-owned 20-byte buffers, one per possible sample; predict()
     * fills them in. */
    char* predicted_labels[MAX_SAMPLES];
    for (i = 0; i < MAX_SAMPLES; ++i) {
        predicted_labels[i] = calloc(20, sizeof(char));
        if (predicted_labels[i] == NULL) {
            fprintf(stderr, "Error: Out of memory\n");
            return EXIT_FAILURE;
        }
    }
    predict(&test_dataset, prior, likelihood, predicted_labels);
    evaluate(&test_dataset, predicted_labels);
    for (i = 0; i < MAX_SAMPLES; ++i) {
        free(likelihood[i]);
        free(predicted_labels[i]);
    }
    return EXIT_SUCCESS;
}
```
该实现使用朴素贝叶斯算法对给定的数据集进行分类。实现包括读取 CSV 文件、训练模型、预测标签和评估模型等步骤。
阅读全文