I'd like to run a sentiment classification task on my classical Chinese poetry dataset. Please provide the implementation code, covering everything from word-segmentation preprocessing to building the model architecture and the training code.
Sure. This task can be broken down into the following steps:
1. Data preprocessing: read the dataset, segment the poem texts, and convert them into a format usable for training (the expected CSV layout is sketched right after this list).
2. Model construction: use a pretrained Chinese BERT model as the backbone, with a fully connected classification layer on top that outputs the sentiment class.
3. Model training: train the model on the preprocessed dataset and save the trained weights.
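Before running the script, note that it assumes a file named poetry.csv with a 'text' column holding each poem and a 'label' column holding 'pos' or 'neg'. A minimal sketch of that layout, with placeholder rows only:
```python
import pandas as pd

# Placeholder rows for illustration only -- replace with your real dataset.
# The training script below expects exactly these two columns.
sample = pd.DataFrame({
    'text': ["两个黄鹂鸣翠柳,一行白鹭上青天。", "感时花溅泪,恨别鸟惊心。"],
    'label': ['pos', 'neg'],
})
sample.to_csv("poetry.csv", index=False)
```
If your labels use different names, adjust the label_map in the training script accordingly.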
The implementation is as follows:
```python
import os
import pandas as pd
import numpy as np
import jieba
import torch
from sklearn.model_selection import train_test_split
# Use PyTorch's AdamW; the AdamW shipped with transformers is deprecated in newer releases
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup

# Fix the random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Select the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data preprocessing: wrap raw texts and labels into BERT-ready tensors
class PoetryDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Word-segmentation preprocessing with jieba.
# Note: bert-base-chinese tokenizes at the character level, so this step is kept
# for the requested pipeline but is not strictly required by BERT.
def tokenize(text):
    words = jieba.cut(text)
    return " ".join(words)

# Read the dataset (expects columns 'text' and 'label')
df = pd.read_csv("poetry.csv")

# Segment each poem
df['text'] = df['text'].apply(tokenize)

# Map the sentiment labels to integers
label_map = {'pos': 1, 'neg': 0}
df['label'] = df['label'].map(label_map)

# Split into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].to_numpy(),
    df['label'].to_numpy(),
    test_size=0.2,
    random_state=42)

# Load the pretrained Chinese BERT model with a 2-class classification head
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
bert_model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)
bert_model.to(device)

# Training hyperparameters
batch_size = 8
epochs = 5
learning_rate = 2e-5

# Build the DataLoaders for the training and validation sets
train_dataset = PoetryDataset(train_texts, train_labels, tokenizer, 128)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

val_dataset = PoetryDataset(val_texts, val_labels, tokenizer, 128)
val_sampler = SequentialSampler(val_dataset)
val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch_size)

# Optimizer and linear learning-rate scheduler
optimizer = AdamW(bert_model.parameters(), lr=learning_rate, eps=1e-8)
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop with per-epoch validation
def train(model, train_dataloader, val_dataloader, epochs, optimizer, scheduler, device):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
        avg_train_loss = total_loss / len(train_dataloader)
        print("Epoch {}/{} - Average training loss: {}".format(epoch + 1, epochs, avg_train_loss))

        model.eval()
        total_eval_accuracy = 0
        total_eval_loss = 0
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            with torch.no_grad():
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_eval_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=1).flatten()
            accuracy = (preds == labels.flatten()).float().mean()
            total_eval_accuracy += accuracy.item()
        avg_val_accuracy = total_eval_accuracy / len(val_dataloader)
        avg_val_loss = total_eval_loss / len(val_dataloader)
        print("Epoch {}/{} - Average validation accuracy: {}".format(epoch + 1, epochs, avg_val_accuracy))
        print("Epoch {}/{} - Average validation loss: {}".format(epoch + 1, epochs, avg_val_loss))
        print("=" * 50)
    return model

# Train the model
trained_model = train(bert_model, train_dataloader, val_dataloader, epochs, optimizer, scheduler, device)

# Save the fine-tuned model and tokenizer
output_dir = './model_save/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
trained_model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
```
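After training, the saved model can be reloaded for inference. Below is a minimal sketch, assuming the ./model_save/ directory and the pos=1/neg=0 mapping used above; predict_sentiment and the sample poem line are illustrative only, not part of the original script:
```python
import jieba
import torch
from transformers import BertTokenizer, BertForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reload the fine-tuned model and tokenizer saved by the training script
model_dir = './model_save/'
tokenizer = BertTokenizer.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir).to(device)
model.eval()

def predict_sentiment(poem):
    # Apply the same jieba segmentation used during training
    text = " ".join(jieba.cut(poem))
    encoding = tokenizer(
        text,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    with torch.no_grad():
        logits = model(
            input_ids=encoding['input_ids'].to(device),
            attention_mask=encoding['attention_mask'].to(device),
        ).logits
    pred = torch.argmax(logits, dim=1).item()
    return 'pos' if pred == 1 else 'neg'  # label mapping from the training script

print(predict_sentiment("两个黄鹂鸣翠柳,一行白鹭上青天。"))
```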