An Example of Fine-Tuning an XLM-RoBERTa Model
Sure, here is an example of fine-tuning an XLM-RoBERTa model:
First, import the required libraries and load the dataset. We use Hugging Face's Transformers library for fine-tuning, take the pretrained XLM-RoBERTa model released by Facebook AI, and use the IMDB movie-review dataset (binary sentiment classification) as the example task.
```
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# Load the IMDB dataset with the Hugging Face datasets library
from datasets import load_dataset
raw_datasets = load_dataset("imdb")
# Keep the official test split, and carve a validation set out of the training split
test_dataset = raw_datasets["test"]
test_texts = test_dataset["text"]
test_labels = test_dataset["label"]
train_texts, val_texts, train_labels, val_labels = train_test_split(
    raw_datasets["train"]["text"],
    raw_datasets["train"]["label"],
    test_size=0.2,
    random_state=42,
)
# Load the XLM-RoBERTa tokenizer and model
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=2)
# Hyperparameters
batch_size = 32
epochs = 4
lr = 2e-5
# Tokenize the training, validation, and test sets
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
# Wrap the encoded data in TensorDatasets
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']),
                              torch.tensor(train_encodings['attention_mask']),
                              torch.tensor(train_labels))
val_dataset = TensorDataset(torch.tensor(val_encodings['input_ids']),
                            torch.tensor(val_encodings['attention_mask']),
                            torch.tensor(val_labels))
test_dataset = TensorDataset(torch.tensor(test_encodings['input_ids']),
                             torch.tensor(test_encodings['attention_mask']))
# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))
val_loader = DataLoader(val_dataset, batch_size=batch_size, sampler=SequentialSampler(val_dataset))
test_loader = DataLoader(test_dataset, batch_size=batch_size, sampler=SequentialSampler(test_dataset))
# Move the model to the GPU if one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Define the optimizer and a linear learning-rate scheduler with warmup
optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
# Training and validation loop
def train(model, train_loader, val_loader, optimizer, scheduler, epochs, device):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)
            optimizer.zero_grad()
            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            # Clip gradients to stabilize training
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1} / {epochs}")
        print(f"Average training loss: {avg_train_loss:.2f}")
        # Evaluate on the validation set after each epoch
        model.eval()
        val_preds = []
        val_true = []
        for batch in val_loader:
            input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)
            with torch.no_grad():
                outputs = model(input_ids,
                                attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).flatten()
            val_preds.extend(preds.cpu().numpy())
            val_true.extend(labels.cpu().numpy())
        val_acc = accuracy_score(val_true, val_preds)
        print(f"Validation accuracy: {val_acc:.2f}")
# Train the model
train(model, train_loader, val_loader, optimizer, scheduler, epochs, device)
# Evaluate the model on the test set
model.eval()
test_preds = []
for batch in test_loader:
    input_ids, attention_mask = tuple(t.to(device) for t in batch)
    with torch.no_grad():
        outputs = model(input_ids,
                        attention_mask=attention_mask)
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1).flatten()
    test_preds.extend(preds.cpu().numpy())
test_acc = accuracy_score(test_labels, test_preds)
print(f"Test accuracy: {test_acc:.2f}")
```
In the code above, we first load the IMDB dataset and split it into training, validation, and test sets. We then load the XLM-RoBERTa tokenizer and model, tokenize the three splits, and wrap the encoded data in TensorDatasets. Next, we batch the data with DataLoaders and move the model to the GPU. The model is fine-tuned with the AdamW optimizer and a linear learning-rate scheduler. Finally, a train-and-validate function fine-tunes the model on the training set and evaluates it on the validation set after each epoch using accuracy_score, and the fine-tuned model is evaluated once more on the test set.
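As an optional extension that is not part of the snippet above, the fine-tuned model can be saved and reused for prediction on new text. The sketch below assumes the training code has already run; the output directory name and the sample sentence are arbitrary placeholders.
```
# Save the fine-tuned model and tokenizer (the directory name is an arbitrary placeholder)
model.save_pretrained("xlm-roberta-imdb")
tokenizer.save_pretrained("xlm-roberta-imdb")
# Classify a single new review with the fine-tuned model
sample = "This movie was absolutely wonderful."
inputs = tokenizer(sample, return_tensors="pt", truncation=True, padding=True).to(device)
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits
pred = torch.argmax(logits, dim=1).item()
print("positive" if pred == 1 else "negative")  # IMDB labels: 1 = positive, 0 = negative
```
In the IMDB label scheme used by the datasets library, label 1 corresponds to a positive review and 0 to a negative one.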