基于卷积神经网络对文本数据集进行二分类,文本为两列,分别为label和review,label为标签列,review为内容列,对标签列进行读取,标签分别为0和1共两类,随机将两类别的各80%作为训练集,20%作为测试集,建立一个传统卷积神经网络,卷积核大小为3,池化层为平均池化,训练轮数为50,输出训练集和测试集每一轮的准确率和损失率并作一个折线图,基于PyTorch建立模型,给出详细的代码
时间: 2023-11-15 08:06:25 浏览: 140
好的,以下是一个基于PyTorch的传统卷积神经网络进行文本分类的示例代码:
```python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
# Dataset wrapper pairing encoded reviews with their labels
class TextDataset(Dataset):
    """Map-style dataset that yields ``(review, label)`` pairs by index.

    Args:
        reviews: Indexable, sized collection of encoded reviews.
        labels: Indexable, sized collection of labels aligned with ``reviews``.

    Raises:
        ValueError: If ``reviews`` and ``labels`` have different lengths.
    """

    def __init__(self, reviews, labels):
        # Fail early: a mismatch would otherwise surface as an IndexError in
        # the middle of an epoch, or silently ignore trailing labels.
        if len(reviews) != len(labels):
            raise ValueError(
                f"reviews and labels must have the same length, "
                f"got {len(reviews)} and {len(labels)}"
            )
        self.reviews = reviews
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return the ``(review, label)`` pair stored at position ``idx``."""
        return self.reviews[idx], self.labels[idx]
# Convolutional text classifier with average pooling over the sequence
class CNN(nn.Module):
    """Embedding -> parallel convolutions -> average pooling -> linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        num_filters: Output channels of every convolution.
        filter_sizes: Iterable of kernel heights (tokens covered per filter).
        output_dim: Number of outputs of the final linear layer.
        dropout: Dropout probability applied before the linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Each kernel spans the full embedding width, so it slides only along
        # the token axis.
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * num_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return one row of outputs per input sequence.

        Args:
            x: Integer tensor of token ids, shape ``(batch, seq_len)``.
                ``seq_len`` must be at least the largest filter size.

        Returns:
            Tensor of shape ``(batch, output_dim)``; the output axis is
            squeezed away when ``output_dim`` is 1.
        """
        # (batch, seq_len, emb) -> (batch, 1, seq_len, emb): one input channel.
        embedded = self.embedding(x).unsqueeze(1)

        pooled = []
        for conv in self.convs:
            # (batch, num_filters, seq_len - fs + 1, 1) -> drop the width axis.
            feature_map = nn.functional.relu(conv(embedded)).squeeze(3)
            # Pool over the feature map's own length. Using the embedded
            # sequence length as kernel size is larger than the map and fails.
            averaged = nn.functional.avg_pool1d(feature_map, feature_map.shape[2])
            pooled.append(averaged.squeeze(2))  # (batch, num_filters)

        features = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(features).squeeze(1)
# Run one training epoch
def train(model, iterator, optimizer, criterion):
    """Train ``model`` for one pass over ``iterator``.

    Args:
        model: Module called as ``model(texts)`` to produce predictions.
        iterator: Sized iterable of ``(texts, labels)`` batches.
        optimizer: Optimizer updating the model's parameters.
        criterion: Loss called as ``criterion(predictions, float_labels)``.

    Returns:
        tuple: ``(mean_loss, mean_accuracy)``, each averaged over batches.
    """
    running_loss = 0.0
    running_acc = 0.0

    model.train()
    for texts, labels in iterator:
        targets = labels.float()  # cast once, used by both loss and accuracy

        optimizer.zero_grad()
        predictions = model(texts)
        loss = criterion(predictions, targets)
        acc = binary_accuracy(predictions, targets)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += acc.item()

    num_batches = len(iterator)
    return running_loss / num_batches, running_acc / num_batches
# Evaluate the model without updating its parameters
def evaluate(model, iterator, criterion):
    """Compute loss, accuracy and hard predictions over ``iterator``.

    The model outputs are treated as logits: a sample is predicted positive
    when ``sigmoid(logit)`` rounds to 1. This is the same rule that
    ``binary_accuracy`` applies, so the returned predictions and the returned
    accuracy always agree. Thresholding the raw logits at 0.5 would not.

    Args:
        model: Module called as ``model(texts)`` to produce logits.
        iterator: Sized iterable of ``(texts, labels)`` batches.
        criterion: Loss called as ``criterion(logits, float_labels)``.

    Returns:
        tuple: ``(mean_loss, mean_accuracy, preds, labels)``. The two means
        are averaged over batches, ``preds`` is a flat list of 0/1 ints and
        ``labels`` is the flat list of label values as yielded by the iterator.
    """
    epoch_loss = 0.0
    epoch_acc = 0.0
    preds = []
    labels = []

    model.eval()
    with torch.no_grad():
        for texts, batch_labels in iterator:
            targets = batch_labels.float()
            logits = model(texts)

            epoch_loss += criterion(logits, targets).item()

            # One decision per sample, reused for both accuracy and preds.
            hard = torch.round(torch.sigmoid(logits))
            epoch_acc += (hard == targets).float().mean().item()

            preds += [int(p) for p in hard.reshape(-1).tolist()]
            labels += batch_labels.tolist()

    num_batches = len(iterator)
    return epoch_loss / num_batches, epoch_acc / num_batches, preds, labels
# Accuracy for binary classification from raw logits
def binary_accuracy(preds, y):
    """Return the fraction of logits whose rounded sigmoid equals the label.

    Args:
        preds: Tensor of raw logits.
        y: Tensor of 0/1 labels, broadcastable against ``preds``.

    Returns:
        Zero-dimensional float tensor holding the accuracy.
    """
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    # Average over every element. Dividing by len() counts only the first
    # dimension and can report accuracy above 1 for non-1-D input.
    return correct.mean()
# Load the dataset
data = pd.read_csv('data.csv')
# Cast to str so a missing review cannot break the join/split calls below.
reviews = data['review'].astype(str).values
labels = data['label'].values

# Split into train/test. `stratify` keeps the class ratio in both parts, so
# 80% of each class goes to training and 20% of each class to testing.
train_reviews, test_reviews, train_labels, test_labels = train_test_split(
    reviews, labels, test_size=0.2, random_state=42, stratify=labels
)

# Build the vocabulary from the training text only.
# Ids 0 and 1 are reserved for padding and unknown tokens; sorting makes the
# word -> id mapping identical from run to run.
# NOTE(review): tokens are whitespace-separated; assumes the reviews are
# already tokenised that way - confirm for the actual data.
word2idx = {'<pad>': 0, '<unk>': 1}
for word in sorted(set(' '.join(train_reviews).split())):
    # setdefault keeps the reserved ids even if the text contains those tokens.
    word2idx.setdefault(word, len(word2idx))
idx2word = {idx: word for word, idx in word2idx.items()}

# Fixed length every sequence is truncated or padded to
MAX_LEN = 100


def _encode_and_pad(texts):
    """Map each text to exactly ``MAX_LEN`` token ids as a LongTensor."""
    rows = []
    for text in texts:
        # Unknown words map to id 1; long texts are cut to MAX_LEN.
        ids = [word2idx.get(token, 1) for token in text.split()][:MAX_LEN]
        # Short texts are right-padded with id 0.
        rows.append(ids + [0] * (MAX_LEN - len(ids)))
    return torch.tensor(rows, dtype=torch.long)


# Convert the texts to equal-length id sequences
train_sequences = _encode_and_pad(train_reviews)
test_sequences = _encode_and_pad(test_reviews)

# Labels as float tensors, the dtype the loss is called with
train_labels = torch.tensor(train_labels, dtype=torch.float32)
test_labels = torch.tensor(test_labels, dtype=torch.float32)
# Hyperparameters
NUM_EPOCHS = 50
BATCH_SIZE = 64
DROPOUT = 0.5
OUTPUT_DIM = 1
FILTER_SIZES = [3]
NUM_FILTERS = 100
EMBEDDING_DIM = 100
VOCAB_SIZE = len(word2idx)

# Datasets and loaders; only the training loader is shuffled
train_dataset = TextDataset(reviews=train_sequences, labels=train_labels)
test_dataset = TextDataset(reviews=test_sequences, labels=test_labels)
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE)

# Model, optimizer and loss
model = CNN(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    num_filters=NUM_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
)
optimizer = optim.Adam(params=model.parameters())
criterion = nn.BCEWithLogitsLoss()
# Train the model, recording loss and accuracy for every epoch
train_losses = []
train_accs = []
test_losses = []
test_accs = []
# Defined up front so the code after the loop still works when NUM_EPOCHS is 0.
preds, true_labels = [], []
for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    test_loss, test_acc, preds, true_labels = evaluate(model, test_loader, criterion)
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    test_losses.append(test_loss)
    test_accs.append(test_acc)
    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
# Draw the line charts: one figure for loss, then one for accuracy
for y_axis_label, curves in (
    ('Loss', ((train_losses, 'Train Loss'), (test_losses, 'Test Loss'))),
    ('Accuracy', ((train_accs, 'Train Acc'), (test_accs, 'Test Acc'))),
):
    for series, curve_label in curves:
        plt.plot(series, label=curve_label)
    plt.xlabel('Epochs')
    plt.ylabel(y_axis_label)
    plt.legend()
    plt.show()
# Report evaluation metrics for the predictions of the last epoch
test_preds = [int(p > 0.5) for p in preds]
for metric_name, metric_fn in (
    ('accuracy', accuracy_score),
    ('precision', precision_score),
    ('recall', recall_score),
    ('F1 score', f1_score),
):
    print(f'Test {metric_name}:', metric_fn(true_labels, test_preds))
```
这段代码中,我们首先读取了CSV格式的数据集,并将其分为训练集和测试集。然后,我们构建了一个包含嵌入层、卷积层、全连接层和Dropout层的传统卷积神经网络,并使用Adam优化器和BCEWithLogitsLoss损失函数进行训练。在训练过程中,我们计算了每一轮的训练集和测试集的准确率和损失率,并将其保存在列表中。最后,我们使用Matplotlib绘制出训练集和测试集的准确率和损失率的折线图,并计算了测试集的准确率、精确率、召回率和F1分数。
阅读全文