朴素贝叶斯算法实现mnist数据集的读取与显示、数据集划分、算法实现的完整代码
时间: 2023-11-12 11:07:42 浏览: 64
好的,以下是用朴素贝叶斯算法实现MNIST数据集分类的完整代码:
```python
import numpy as np
import struct
import matplotlib.pyplot as plt
# Read an MNIST split (images + labels) from IDX files on disk.
def load_mnist(path, kind='train'):
    """Load MNIST images and labels stored in IDX format under *path*.

    Expects the files ``{kind}-labels-idx1-ubyte`` and
    ``{kind}-images-idx3-ubyte`` inside *path*.

    Returns:
        (images, labels): images is a uint8 array of shape (n, 784),
        labels a uint8 array of shape (n,).
    """
    labels_file = f"{path}/{kind}-labels-idx1-ubyte"
    images_file = f"{path}/{kind}-images-idx3-ubyte"
    with open(labels_file, 'rb') as fh:
        # 8-byte IDX header: magic number and item count (big-endian).
        struct.unpack('>II', fh.read(8))
        labels = np.fromfile(fh, dtype=np.uint8)
    with open(images_file, 'rb') as fh:
        # 16-byte IDX header: magic, image count, rows, cols (big-endian).
        struct.unpack('>IIII', fh.read(16))
        # Flatten each 28x28 image to a 784-vector, one row per label.
        images = np.fromfile(fh, dtype=np.uint8).reshape(len(labels), 784)
    return images, labels
# Display one MNIST digit (flattened 784-vector) as a 28x28 grayscale image.
def show_image(image):
    """Render a single flattened MNIST image with matplotlib."""
    pixels = image.reshape(28, 28)
    plt.imshow(pixels, cmap='gray')
    plt.show()
# Shuffle and split a dataset into train/test partitions.
def split_dataset(dataset, labels, test_ratio=0.2):
    """Randomly partition (dataset, labels) into train and test sets.

    Args:
        dataset: array of shape (n_samples, ...).
        labels: array of shape (n_samples,), row-aligned with *dataset*.
        test_ratio: fraction of samples assigned to the test split.

    Returns:
        (train_dataset, train_labels, test_dataset, test_labels)
    """
    total = dataset.shape[0]
    n_test = int(total * test_ratio)
    # One random permutation keeps each row paired with its label.
    order = np.random.permutation(total)
    test_idx, train_idx = order[:n_test], order[n_test:]
    return (dataset[train_idx], labels[train_idx],
            dataset[test_idx], labels[test_idx])
# Gaussian probability density, elementwise over arrays.
def gaussian_prob(x, mean, var, eps=1e-9):
    """Return the Gaussian pdf N(x; mean, var), elementwise.

    Args:
        x, mean, var: scalars or broadcastable arrays.
        eps: small smoothing constant added to *var*. MNIST has many
            constant pixels (e.g. the image border is always 0), whose
            per-class variance is exactly 0; without smoothing the
            original expression divided by zero and produced NaN.

    Returns:
        The density value(s), same broadcast shape as the inputs.
    """
    var = var + eps  # guard against zero-variance features
    return np.exp(-(x - mean) ** 2 / (2 * var)) / np.sqrt(2 * np.pi * var)
# Fit a Gaussian naive Bayes model: class priors plus per-class,
# per-feature mean and variance.
def train_naive_bayes(X, y):
    """Estimate Gaussian naive Bayes parameters from training data.

    Assumes labels are integers 0..K-1 (true for MNIST digits); class i's
    parameters are stored at row i of each returned array.

    Args:
        X: array of shape (n_samples, n_features).
        y: integer labels of shape (n_samples,).

    Returns:
        (priors, means, variances): shapes (K,), (K, n_features),
        (K, n_features) respectively.
    """
    n_samples, n_features = X.shape
    n_classes = len(np.unique(y))
    priors = np.empty(n_classes)
    means = np.empty((n_classes, n_features))
    variances = np.empty((n_classes, n_features))
    for c in range(n_classes):
        subset = X[y == c]
        priors[c] = subset.shape[0] / float(n_samples)
        means[c] = subset.mean(axis=0)
        variances[c] = subset.var(axis=0)
    return priors, means, variances
# Predict class labels with a trained Gaussian naive Bayes model.
def predict_naive_bayes(X, priors, means, variances, eps=1e-9):
    """Return the most likely class index for each row of *X*.

    Computes the Gaussian log-density directly instead of
    ``log(gaussian_prob(...))``: the pdf underflows to 0 for inputs far
    from a class mean, so the original log-then-exp formulation turned
    every class score into -inf and argmax degenerated to class 0.
    Zero-variance features (constant MNIST pixels) additionally produced
    NaN; *eps* smooths the variance to keep the density finite.

    Args:
        X: array of shape (n_samples, n_features).
        priors: class priors, shape (K,).
        means, variances: per-class feature parameters, shape (K, n_features).
        eps: variance smoothing constant.

    Returns:
        Integer class indices, shape (n_samples,).
    """
    num_samples, num_features = X.shape
    num_classes = len(priors)
    log_probs = np.zeros((num_samples, num_classes))
    for i in range(num_classes):
        var = variances[i] + eps  # guard zero-variance features
        # log N(x; mean, var), summed over (conditionally independent) features.
        ll = -0.5 * np.log(2 * np.pi * var) - (X - means[i]) ** 2 / (2 * var)
        log_probs[:, i] = ll.sum(axis=1) + np.log(priors[i])
    return np.argmax(log_probs, axis=1)
# Load the MNIST dataset from the ./data directory (IDX files).
X_train, y_train = load_mnist('data', kind='train')
X_test, y_test = load_mnist('data', kind='t10k')
# Show the first training image.
show_image(X_train[0])
# Re-split the training data 80/20 into train and test sets.
# NOTE(review): this rebinds X_test/y_test, silently discarding the
# official t10k test set loaded above — confirm this is intended.
X_train, y_train, X_test, y_test = split_dataset(X_train, y_train, test_ratio=0.2)
# Fit the Gaussian naive Bayes model.
priors, means, variances = train_naive_bayes(X_train, y_train)
# Predict labels for the held-out test set.
y_pred = predict_naive_bayes(X_test, priors, means, variances)
# Report accuracy as a percentage.
accuracy = np.mean(y_pred == y_test) * 100
print('Accuracy: %.2f%%' % accuracy)
```
这里采用了高斯分布作为特征的概率分布函数,实现了朴素贝叶斯算法模型的训练和预测,并计算了预测精度。
阅读全文