Given a binary-classification dataset, build classifiers with a decision tree, naive Bayes, a support vector machine, and KNN, plot the PR and ROC curves of the four methods, and determine the better model
Below are the steps and code for building decision tree, naive Bayes, SVM, and KNN classifiers on the given binary-classification data and plotting their PR and ROC curves:
1. Import the required libraries and load the dataset
```python
import numpy as np
import pandas as pd
from sklearn import tree, naive_bayes, svm, neighbors
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Load the dataset; it is assumed the last column holds the binary label
data = pd.read_csv('data.csv')
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
```
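If `data.csv` is not at hand, a minimal sketch can substitute a synthetic binary dataset with the same `X`/`y` interface; the `make_classification` call and its parameter values below are illustrative assumptions, not part of the original task:
```python
# Optional stand-in: generate a synthetic binary-classification dataset.
# The parameter values are arbitrary examples.
from sklearn.datasets import make_classification

X_syn, y_syn = make_classification(n_samples=1000, n_features=10,
                                   n_informative=5, random_state=0)
X = pd.DataFrame(X_syn)
y = pd.Series(y_syn)
```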
2. Split the data into training and test sets
```python
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
```
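SVM and KNN are sensitive to feature scales, and an imbalanced split can distort PR curves, so a common variant is a stratified split followed by standardization. This is an optional sketch rather than part of the original steps, and it assumes `StandardScaler` is appropriate for the features at hand:
```python
# Optional variant: stratified split plus feature standardization.
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y)  # keep the class ratio in both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)   # fit on the training set only
X_test = scaler.transform(X_test)         # apply the same scaling to the test set
```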
3. Build the decision tree classifier and plot its PR and ROC curves
```python
# Build the decision tree classifier
clf_dt = tree.DecisionTreeClassifier()
clf_dt.fit(X_train, y_train)
# Predict positive-class probabilities on the test set
y_pred_dt = clf_dt.predict_proba(X_test)[:, 1]
# Compute the PR and ROC curves
precision_dt, recall_dt, _ = precision_recall_curve(y_test, y_pred_dt)
pr_auc_dt = auc(recall_dt, precision_dt)
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_pred_dt)
roc_auc_dt = auc(fpr_dt, tpr_dt)
# Plot the PR and ROC curves
plt.figure()
plt.step(recall_dt, precision_dt, color='b', alpha=0.2, where='post')
plt.fill_between(recall_dt, precision_dt, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('PR Curve (AUC = %0.2f) - Decision Tree' % pr_auc_dt)
plt.figure()
plt.plot(fpr_dt, tpr_dt, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc_dt)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Decision Tree')
plt.legend(loc="lower right")
```
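Note that a fully grown `DecisionTreeClassifier` tends to output only 0/1 probabilities, which makes its PR and ROC curves very coarse (few distinct thresholds). A depth-limited variant is sketched below; `max_depth=5` is only an illustrative choice:
```python
# Optional variant: a depth-limited tree yields graded leaf probabilities,
# giving smoother PR/ROC curves. max_depth=5 is an assumed value.
clf_dt = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
clf_dt.fit(X_train, y_train)
y_pred_dt = clf_dt.predict_proba(X_test)[:, 1]
```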
4. Build the naive Bayes classifier and plot its PR and ROC curves
```python
# Build the Gaussian naive Bayes classifier
clf_nb = naive_bayes.GaussianNB()
clf_nb.fit(X_train, y_train)
# Predict positive-class probabilities on the test set
y_pred_nb = clf_nb.predict_proba(X_test)[:, 1]
# Compute the PR and ROC curves
precision_nb, recall_nb, _ = precision_recall_curve(y_test, y_pred_nb)
pr_auc_nb = auc(recall_nb, precision_nb)
fpr_nb, tpr_nb, _ = roc_curve(y_test, y_pred_nb)
roc_auc_nb = auc(fpr_nb, tpr_nb)
# Plot the PR and ROC curves
plt.figure()
plt.step(recall_nb, precision_nb, color='b', alpha=0.2, where='post')
plt.fill_between(recall_nb, precision_nb, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('PR Curve (AUC = %0.2f) - Naive Bayes' % pr_auc_nb)
plt.figure()
plt.plot(fpr_nb, tpr_nb, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc_nb)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Naive Bayes')
plt.legend(loc="lower right")
```
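`GaussianNB` assumes each feature is roughly Gaussian within each class. If the features were actually counts or binary indicators, the discrete naive Bayes variants would usually fit better; which one applies depends on the real data, so the following is only a sketch:
```python
# Only relevant if the features are non-negative counts or 0/1 indicators.
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

clf_mnb = MultinomialNB()   # for count-like features
clf_bnb = BernoulliNB()     # for binary features
```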
5. Build the SVM classifier and plot its PR and ROC curves
```python
# Build the SVM classifier (probability=True enables predict_proba)
clf_svm = svm.SVC(probability=True)
clf_svm.fit(X_train, y_train)
# Predict positive-class probabilities on the test set
y_pred_svm = clf_svm.predict_proba(X_test)[:, 1]
# Compute the PR and ROC curves
precision_svm, recall_svm, _ = precision_recall_curve(y_test, y_pred_svm)
pr_auc_svm = auc(recall_svm, precision_svm)
fpr_svm, tpr_svm, _ = roc_curve(y_test, y_pred_svm)
roc_auc_svm = auc(fpr_svm, tpr_svm)
# Plot the PR and ROC curves
plt.figure()
plt.step(recall_svm, precision_svm, color='b', alpha=0.2, where='post')
plt.fill_between(recall_svm, precision_svm, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('PR Curve (AUC = %0.2f) - SVM' % pr_auc_svm)
plt.figure()
plt.plot(fpr_svm, tpr_svm, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc_svm)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - SVM')
plt.legend(loc="lower right")
```
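`SVC(probability=True)` fits an extra Platt-scaling calibration via internal cross-validation, which can be slow on larger datasets. As an alternative sketch, the uncalibrated `decision_function` margins can be passed to `precision_recall_curve` and `roc_curve` directly, since both accept arbitrary continuous scores:
```python
# Alternative: use the SVM margin scores instead of calibrated probabilities.
clf_svm_raw = svm.SVC()                               # probability=True is not needed here
clf_svm_raw.fit(X_train, y_train)
scores_svm = clf_svm_raw.decision_function(X_test)    # continuous scores, not probabilities

precision_alt, recall_alt, _ = precision_recall_curve(y_test, scores_svm)
fpr_alt, tpr_alt, _ = roc_curve(y_test, scores_svm)
```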
6. Build the KNN classifier and plot its PR and ROC curves
```python
# Build the KNN classifier
clf_knn = neighbors.KNeighborsClassifier()
clf_knn.fit(X_train, y_train)
# Predict positive-class probabilities on the test set
y_pred_knn = clf_knn.predict_proba(X_test)[:, 1]
# Compute the PR and ROC curves
precision_knn, recall_knn, _ = precision_recall_curve(y_test, y_pred_knn)
pr_auc_knn = auc(recall_knn, precision_knn)
fpr_knn, tpr_knn, _ = roc_curve(y_test, y_pred_knn)
roc_auc_knn = auc(fpr_knn, tpr_knn)
# Plot the PR and ROC curves
plt.figure()
plt.step(recall_knn, precision_knn, color='b', alpha=0.2, where='post')
plt.fill_between(recall_knn, precision_knn, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('PR Curve (AUC = %0.2f) - KNN' % pr_auc_knn)
plt.figure()
plt.plot(fpr_knn, tpr_knn, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc_knn)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - KNN')
plt.legend(loc="lower right")
```
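`KNeighborsClassifier` defaults to `n_neighbors=5`, and the choice of k directly controls how finely graded the predicted probabilities are. A small sketch tunes k by cross-validated ROC AUC; the grid of candidate values is an assumption to be adapted to the dataset size:
```python
from sklearn.model_selection import GridSearchCV

# Hypothetical grid of k values; adjust to the dataset size.
param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}
grid_knn = GridSearchCV(neighbors.KNeighborsClassifier(), param_grid,
                        scoring='roc_auc', cv=5)
grid_knn.fit(X_train, y_train)
print(grid_knn.best_params_, grid_knn.best_score_)
```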
7. Compare the PR and ROC curves of the four classifiers to determine the better model
```python
# Plot the PR curves of the four classifiers
plt.figure()
plt.step(recall_dt, precision_dt, color='b', alpha=0.2, where='post', label='Decision Tree (area = %0.2f)' % pr_auc_dt)
plt.fill_between(recall_dt, precision_dt, step='post', alpha=0.2, color='b')
plt.step(recall_nb, precision_nb, color='g', alpha=0.2, where='post', label='Naive Bayes (area = %0.2f)' % pr_auc_nb)
plt.fill_between(recall_nb, precision_nb, step='post', alpha=0.2, color='g')
plt.step(recall_svm, precision_svm, color='r', alpha=0.2, where='post', label='SVM (area = %0.2f)' % pr_auc_svm)
plt.fill_between(recall_svm, precision_svm, step='post', alpha=0.2, color='r')
plt.step(recall_knn, precision_knn, color='c', alpha=0.2, where='post', label='KNN (area = %0.2f)' % pr_auc_knn)
plt.fill_between(recall_knn, precision_knn, step='post', alpha=0.2, color='c')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('PR Curve')
plt.legend(loc="lower right")
# Plot the ROC curves of the four classifiers
plt.figure()
plt.plot(fpr_dt, tpr_dt, color='b', lw=2, label='Decision Tree (area = %0.2f)' % roc_auc_dt)
plt.plot(fpr_nb, tpr_nb, color='g', lw=2, label='Naive Bayes (area = %0.2f)' % roc_auc_nb)
plt.plot(fpr_svm, tpr_svm, color='r', lw=2, label='SVM (area = %0.2f)' % roc_auc_svm)
plt.plot(fpr_knn, tpr_knn, color='c', lw=2, label='KNN (area = %0.2f)' % roc_auc_knn)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()
```
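To back up the visual comparison with numbers, a short sketch can compute each model's ROC AUC and average precision (both standard sklearn metrics) from the scores already produced above; the model with the highest values on this split is the stronger candidate:
```python
from sklearn.metrics import roc_auc_score, average_precision_score

# Summarize each classifier with scalar metrics for an easy side-by-side comparison.
for name, y_score in [('Decision Tree', y_pred_dt), ('Naive Bayes', y_pred_nb),
                      ('SVM', y_pred_svm), ('KNN', y_pred_knn)]:
    print('%-13s ROC AUC = %.3f, AP = %.3f' % (
        name, roc_auc_score(y_test, y_score), average_precision_score(y_test, y_score)))
```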
Comparing the PR and ROC curves of the four classifiers, the model whose curves enclose the largest area (the largest PR and ROC AUC values) is the stronger one; on this dataset the SVM classifier shows the largest AUC under both curves, so the SVM is the better model.