Code for text classification with the BERT model from PyTorch and the Transformers library
You can refer to the following code:
```
import torch
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup

# Use a GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the model architecture and move it to the device
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=2,
                                                      output_attentions=False,
                                                      output_hidden_states=False)
model.to(device)
# Define example texts and corresponding labels
texts = ["This is a positive text.", "This is a negative text."]
labels = [1, 0]
# Tokenize the example texts and build input tensors
input_ids = []
attention_masks = []
for text in texts:
    encoded_dict = tokenizer.encode_plus(
        text,                        # Text to encode
        add_special_tokens=True,     # Add '[CLS]' and '[SEP]'
        max_length=64,               # Pad & truncate all sentences
        padding='max_length',
        truncation=True,
        return_attention_mask=True,  # Construct attention masks
        return_tensors='pt',         # Return PyTorch tensors
    )
    # Add the encoded sentence to the list
    input_ids.append(encoded_dict['input_ids'])
    # And its attention mask (distinguishes padding from real tokens)
    attention_masks.append(encoded_dict['attention_mask'])
# Convert lists to tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)
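# At this point input_ids and attention_masks are both tensors of shape
# (number of texts, 64), i.e. one padded 64-token sequence per example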
# Define dataloader
batch_size = 2 # You can set this to any value as per your requirement
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, batch_size=batch_size)
# Define optimizer and training parameters
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)
epochs = 10
total_steps = len(dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)
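# With num_warmup_steps=0 the schedule simply decays the learning rate
# linearly from 2e-5 down to 0 over the total number of training steps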
# Train the model
model.train()  # Put the model in training mode
for epoch in range(epochs):
    for batch in dataloader:
        # Unpack the batch and move it to the device
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # Clear any previously calculated gradients before the backward pass
        model.zero_grad()
        # Perform a forward pass
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs.loss
        # Perform a backward pass to calculate the gradients
        loss.backward()
        # Update the parameters and the learning-rate schedule
        optimizer.step()
        scheduler.step()
# Test the model
model.eval()
# Map label ids to label names; adjust as needed
label_map = {0: "negative", 1: "positive"}
# Define test texts
texts = ["This is a positive test.", "This is a negative test."]
# Tokenize the test texts and build input tensors
input_ids = []
attention_masks = []
for text in texts:
    encoded_dict = tokenizer.encode_plus(
        text,                        # Text to encode
        add_special_tokens=True,     # Add '[CLS]' and '[SEP]'
        max_length=64,               # Pad & truncate all sentences
        padding='max_length',
        truncation=True,
        return_attention_mask=True,  # Construct attention masks
        return_tensors='pt',         # Return PyTorch tensors
    )
    # Add the encoded sentence to the list
    input_ids.append(encoded_dict['input_ids'])
    # And its attention mask (distinguishes padding from real tokens)
    attention_masks.append(encoded_dict['attention_mask'])
# Convert lists to tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
# Load tensors to device
input_ids = input_ids.to(device)
attention_masks = attention_masks.to(device)
# Perform a forward pass to get predictions for the test texts
with torch.no_grad():
    outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks)
    logits = outputs.logits
# Get the predicted class for each test text
_, predicted_labels = torch.max(logits, dim=1)
# Print the predicted labels
for text, label in zip(texts, predicted_labels):
    print("Text: ", text)
    print("Predicted Label: ", label_map[label.item()])
```
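If you want to keep the fine-tuned weights for later use, the sketch below continues from the script above and saves the model and tokenizer with the Transformers `save_pretrained`/`from_pretrained` API; the directory name `./bert-finetuned-sentiment` is only an example, not something required by the library.
```
# A minimal sketch for persisting and reloading the fine-tuned model.
# The output directory name is only an example; choose any path you like.
output_dir = "./bert-finetuned-sentiment"

# Save the fine-tuned weights, config, and tokenizer files to output_dir
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Later, reload them exactly like a hub checkpoint
model = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)
model.to(device)
model.eval()
```
`save_pretrained` writes the configuration, weights, and vocabulary files into the directory, so `from_pretrained` can restore the classifier later without repeating the fine-tuning.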