Use a convolutional neural network to binary-classify a Chinese text dataset. The data has two columns, label and review: label holds the class (0 or 1), review holds the text. Randomly take 80% of each class as the training set and the remaining 20% as the test set. For training, convert the review texts to integer sequences, pad every sequence with zeros to the length of the longest sentence, and build a conventional CNN with kernel size 3 and average pooling. Train for 50 epochs, print the training and test accuracy and loss for every epoch, and plot them as line charts. Build the model in PyTorch and give the complete code.
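The code below assumes data.csv has two columns named label and review; the rows here are made-up examples just to show the expected format:
```
label,review
1,这家酒店的服务非常好,下次还会再来
0,房间又小又脏,完全不值这个价
```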
Below is a PyTorch implementation:
```
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
import jieba
import matplotlib.pyplot as plt

# Read the data: one label column and one review (text) column
df = pd.read_csv('data.csv')
labels = df['label'].values
reviews = df['review'].values

# Segment the Chinese text with jieba, joining tokens with spaces
reviews = [' '.join(jieba.cut(review)) for review in reviews]
# Build a vocabulary mapping each word to an integer index.
# Index 0 is reserved for padding, so real words start at 1.
word2idx = {}
for review in reviews:
    for word in review.split():
        if word not in word2idx:
            word2idx[word] = len(word2idx) + 1
reviews = [[word2idx[word] for word in review.split()] for review in reviews]
labels = [1 if label == 1 else 0 for label in labels]

# Pad every sequence with zeros to the length of the longest sentence
max_len = max(len(review) for review in reviews)
reviews = [review + [0] * (max_len - len(review)) for review in reviews]
# Randomly split each class 80% / 20% into training and test sets
# (stratify keeps the class proportions identical in both splits)
train_reviews, test_reviews, train_labels, test_labels = train_test_split(
    reviews, labels, test_size=0.2, stratify=labels, random_state=42)
# Dataset wrapping the padded sequences and their labels
class TextDataset(Dataset):
    def __init__(self, reviews, labels):
        self.reviews = reviews
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return torch.tensor(self.reviews[idx]), torch.tensor(self.labels[idx])
# Convolutional neural network for text classification
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        # padding_idx=0 keeps the padding embedding fixed at zero
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=num_filters,
                      kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * num_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        # text: [batch, seq_len] (the DataLoader yields batch-first tensors)
        embedded = self.embedding(text)   # [batch, seq_len, emb_dim]
        embedded = embedded.unsqueeze(1)  # [batch, 1, seq_len, emb_dim]
        # Each conv yields [batch, num_filters, seq_len - fs + 1] after squeezing
        conved = [nn.functional.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Average pooling over the full feature-map length, as requested
        pooled = [nn.functional.avg_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)  # [batch, output_dim] raw logits
# One training epoch; returns mean loss and accuracy over all batches
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for text, label in iterator:
        optimizer.zero_grad()
        predictions = model(text).squeeze(1)  # [batch] logits
        loss = criterion(predictions, label.float())
        # A logit > 0 corresponds to a sigmoid probability > 0.5
        acc = ((predictions > 0).float() == label.float()).float().mean()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
# Evaluation pass (no gradient updates); same metrics as train()
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for text, label in iterator:
            predictions = model(text).squeeze(1)
            loss = criterion(predictions, label.float())
            acc = ((predictions > 0).float() == label.float()).float().mean()
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
# Model hyperparameters
VOCAB_SIZE = len(word2idx) + 1  # +1 for the padding index 0
EMBEDDING_DIM = 100
NUM_FILTERS = 100
FILTER_SIZES = [3]  # kernel size 3, as specified
OUTPUT_DIM = 1
DROPOUT = 0.5

# Instantiate the model, loss function, and optimizer
model = CNN(VOCAB_SIZE, EMBEDDING_DIM, NUM_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)
criterion = nn.BCEWithLogitsLoss()  # applies the sigmoid internally, so the model outputs raw logits
optimizer = optim.Adam(model.parameters())
# Build the training and test data loaders
train_dataset = TextDataset(train_reviews, train_labels)
test_dataset = TextDataset(test_reviews, test_labels)
train_iterator = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_iterator = DataLoader(test_dataset, batch_size=64)
# Train for 50 epochs, recording loss and accuracy for both splits
train_losses, train_accs = [], []
test_losses, test_accs = [], []
for epoch in range(50):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    test_losses.append(test_loss)
    test_accs.append(test_acc)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.3f}, '
          f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.3f}')
# Plot the loss and accuracy curves for the training and test sets
plt.plot(train_losses, label='Train Loss')
plt.plot(test_losses, label='Test Loss')
plt.xlabel('Epoch')
plt.legend()
plt.show()

plt.plot(train_accs, label='Train Acc')
plt.plot(test_accs, label='Test Acc')
plt.xlabel('Epoch')
plt.legend()
plt.show()
```
A few notes on the code above: jieba segments the Chinese text, and PyTorch's Dataset and DataLoader classes handle the dataset definition and batch iteration. The CNN class implements the convolutional model. Because the loss is BCEWithLogitsLoss, which applies the sigmoid internally, the model outputs raw logits, and accuracy is computed by thresholding the logits at 0. Adam is the optimizer, and matplotlib draws the loss and accuracy line charts.
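As a quick usage sketch, the trained model can classify a new sentence by reusing word2idx and max_len from the script above. The predict helper and the sample sentence below are illustrative additions, not part of the original code; unknown words are simply skipped, which is one simple way to handle out-of-vocabulary tokens:
```
import torch
import jieba

def predict(model, sentence, word2idx, max_len):
    # Segment, map known words to indices (skipping out-of-vocabulary words), truncate, and pad
    tokens = [word2idx[w] for w in jieba.cut(sentence) if w in word2idx][:max_len]
    tokens = tokens + [0] * (max_len - len(tokens))
    model.eval()
    with torch.no_grad():
        logit = model(torch.tensor([tokens]))  # batch of one: [1, max_len] -> [1, 1]
        prob = torch.sigmoid(logit).item()     # convert the logit to a probability
    return (1 if prob > 0.5 else 0), prob

label, prob = predict(model, '这家店的服务很不错', word2idx, max_len)
print(f'Predicted class: {label} (probability {prob:.3f})')
```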