Natural Language Processing: Neural-Network-Based Text Classification on THUCNews (Code and Results)
THUCNews is a Chinese text-classification dataset that lends itself well to neural-network-based classifiers. Below is a code example that implements THUCNews text classification in Python with the PyTorch framework:
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import numpy as np
import pandas as pd
import jieba
import re
# Read the dataset and preprocess it:
# collapse whitespace, then segment each document with jieba
def read_data(file):
    data = pd.read_csv(file, header=None, sep='\t')
    data.columns = ['label', 'content']
    data = data.dropna()
    data = data.reset_index(drop=True)
    data['content'] = data['content'].apply(lambda x: re.sub(r'\s+', ' ', x))
    data['content'] = data['content'].apply(lambda x: ' '.join(jieba.cut(x)))
    return data
# Build the vocabulary; index 0 is reserved for padding and unknown words
def build_vocab(data):
    word_dict = {}
    for content in data['content']:
        for word in content.split(' '):
            if word not in word_dict:
                word_dict[word] = len(word_dict) + 1
    return word_dict
# Convert texts to fixed-length sequences of word indices.
# Out-of-vocabulary words map to index 0, and every sequence is
# truncated/padded to max_len so the lists can be stacked into one tensor.
def text2vec(data, word_dict, max_len=500):
    content_vec = []
    for content in data['content']:
        vec = [word_dict.get(word, 0) for word in content.split(' ')][:max_len]
        vec += [0] * (max_len - len(vec))
        content_vec.append(vec)
    return content_vec
# Define the model
class TextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, class_num, kernel_num, kernel_sizes, dropout):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # One convolution per kernel size, each spanning the full embedding width
        self.convs = nn.ModuleList([nn.Conv2d(1, kernel_num, (K, embedding_dim)) for K in kernel_sizes])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_sizes) * kernel_num, class_num)
    def forward(self, x):
        x = self.embedding(x)                                    # (batch, seq_len, embedding_dim)
        x = x.unsqueeze(1)                                       # (batch, 1, seq_len, embedding_dim)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]  # each (batch, kernel_num, seq_len-K+1)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]   # each (batch, kernel_num)
        x = torch.cat(x, 1)                                      # (batch, len(kernel_sizes) * kernel_num)
        x = self.dropout(x)
        logit = self.fc(x)
        return logit
# Define the training loop
def train(model, train_iter, val_iter, optimizer, criterion, num_epochs):
    for epoch in range(num_epochs):
        running_loss = 0.0
        running_acc = 0.0
        model.train()
        for x, y in train_iter:  # a DataLoader over a TensorDataset yields (content, label) tuples
            optimizer.zero_grad()
            output = model(x)
            loss = criterion(output, y)
            running_loss += loss.item() * y.size(0)
            _, pred = torch.max(output, 1)
            running_acc += (pred == y).sum().item()
            loss.backward()
            optimizer.step()
        epoch_loss = running_loss / len(train_iter.dataset)
        epoch_acc = running_acc / len(train_iter.dataset)
        val_loss, val_acc = evaluate(model, val_iter, criterion)
        print('Epoch: {}, Training Loss: {:.4f}, Training Acc: {:.4f}, Validation Loss: {:.4f}, Validation Acc: {:.4f}'.format(epoch+1, epoch_loss, epoch_acc, val_loss, val_acc))
# Define the evaluation loop
def evaluate(model, val_iter, criterion):
    running_loss = 0.0
    running_acc = 0.0
    model.eval()
    with torch.no_grad():  # no gradients needed during evaluation
        for x, y in val_iter:
            output = model(x)
            loss = criterion(output, y)
            running_loss += loss.item() * y.size(0)
            _, pred = torch.max(output, 1)
            running_acc += (pred == y).sum().item()
    epoch_loss = running_loss / len(val_iter.dataset)
    epoch_acc = running_acc / len(val_iter.dataset)
    return epoch_loss, epoch_acc
# Hyperparameters (vocab_size is derived from the training vocabulary below)
embedding_dim = 100
class_num = 14
kernel_num = 100
kernel_sizes = [3, 4, 5]
dropout = 0.5
lr = 1e-3
batch_size = 128
num_epochs = 10
# Load the datasets
train_data = read_data('train.txt')
val_data = read_data('val.txt')
test_data = read_data('test.txt')
# Build the vocabulary from the training set only
word_dict = build_vocab(train_data)
vocab_size = len(word_dict) + 1  # +1 for the padding/OOV index 0
# Convert texts to index sequences
train_content_vec = text2vec(train_data, word_dict)
val_content_vec = text2vec(val_data, word_dict)
test_content_vec = text2vec(test_data, word_dict)
# Map label names to integer class ids (THUCNews labels are category
# names; skip this step if your files already store integer labels)
label_dict = {label: i for i, label in enumerate(sorted(train_data['label'].unique()))}
for d in (train_data, val_data, test_data):
    d['label'] = d['label'].map(label_dict)
# Wrap the index sequences and labels as tensors
train_content_tensor = torch.LongTensor(train_content_vec)
train_label_tensor = torch.LongTensor(np.array(train_data['label']))
val_content_tensor = torch.LongTensor(val_content_vec)
val_label_tensor = torch.LongTensor(np.array(val_data['label']))
test_content_tensor = torch.LongTensor(test_content_vec)
test_label_tensor = torch.LongTensor(np.array(test_data['label']))
# Build datasets and data loaders (only the training set is shuffled)
train_dataset = data.TensorDataset(train_content_tensor, train_label_tensor)
val_dataset = data.TensorDataset(val_content_tensor, val_label_tensor)
test_dataset = data.TensorDataset(test_content_tensor, test_label_tensor)
train_iter = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_iter = data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_iter = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# Initialize the model, optimizer, and loss function
model = TextCNN(vocab_size, embedding_dim, class_num, kernel_num, kernel_sizes, dropout)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
# Train the model, then evaluate on the held-out test set
train(model, train_iter, val_iter, optimizer, criterion, num_epochs)
test_loss, test_acc = evaluate(model, test_iter, criterion)
print('Test Loss: {:.4f}, Test Acc: {:.4f}'.format(test_loss, test_acc))
```
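Once training is done, classifying a single raw text follows the same preprocessing pipeline. Below is a minimal sketch, assuming the `model`, `word_dict`, and `label_dict` from the script above; the `predict` helper is not part of the original code and is only for illustration:

```python
import re
import jieba
import torch

def predict(model, text, word_dict, max_len=500):
    """Classify one raw document with the trained TextCNN; returns a class id."""
    model.eval()
    tokens = ' '.join(jieba.cut(re.sub(r'\s+', ' ', text))).split(' ')
    vec = [word_dict.get(w, 0) for w in tokens][:max_len]   # OOV words -> index 0
    vec += [0] * (max_len - len(vec))                       # pad to max_len
    with torch.no_grad():
        logits = model(torch.LongTensor([vec]))             # shape (1, class_num)
    return int(torch.argmax(logits, dim=1))

# Map the id back to a category name via the inverted label_dict, e.g.:
# id2label = {i: label for label, i in label_dict.items()}
# print(id2label[predict(model, some_headline, word_dict)])
```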
The code above uses the TextCNN model, a convolutional-neural-network-based architecture for text classification. Training uses the Adam optimizer and the cross-entropy loss. After training, the model is evaluated on the test set and the test loss and accuracy are printed.
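To make the convolution-and-pooling geometry concrete, here is a small standalone shape check of the `TextCNN` class defined above; the numbers are arbitrary and chosen only for illustration:

```python
import torch

# A toy instance: vocabulary of 1000 words, 14 THUCNews categories
m = TextCNN(vocab_size=1000, embedding_dim=100, class_num=14,
            kernel_num=100, kernel_sizes=[3, 4, 5], dropout=0.5)
dummy = torch.randint(0, 1000, (2, 50))  # batch of 2 sequences, 50 tokens each
print(m(dummy).shape)
# torch.Size([2, 14]): each kernel size contributes a 100-dim max-pooled
# feature, the three are concatenated to 300 dims, then projected to 14 logits
```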