将一个txt文件变成训练集和测试集后用bert分类的代码
时间: 2024-02-01 11:15:19 浏览: 83
下面是将一个txt文件分割成训练集和测试集,并用BERT进行分类的Python代码示例:
```python
import random
import torch
from transformers import BertTokenizer, BertForSequenceClassification
# --- Data loading, train/test split, and model setup -------------------------
# Read the raw dataset: one example per line, expected format "text<TAB>label".
with open("data.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

# Shuffle in place so the 80/20 split is not biased by file order.
# NOTE(review): no random.seed() is set, so the split differs between runs.
random.shuffle(lines)

# 80% train / 20% test split.
split_at = int(len(lines) * 0.8)
train_lines = lines[:split_at]
test_lines = lines[split_at:]

# Chinese BERT with a 2-class classification head (weights download on first use).
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)
# 定义训练函数
def train(model, train_dataloader, epochs=5, lr=1e-5):
    """Fine-tune *model* on batches from *train_dataloader*.

    Args:
        model: a sequence-classification model whose forward call accepts
            ``(input_ids, attention_mask=..., labels=...)`` and returns an
            object exposing a ``.loss`` tensor.
        train_dataloader: iterable yielding ``(input_ids, attention_mask,
            labels)`` batches.
        epochs: number of passes over the data (default 5, as before).
        lr: Adam learning rate (default 1e-5, as before).
    """
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        for step, batch in enumerate(train_dataloader):
            input_ids, attention_mask, labels = batch
            optimizer.zero_grad()
            # The model computes the loss itself when labels are supplied.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            print("Epoch: %d, Step: %d, Loss: %.4f" % (epoch + 1, step + 1, loss.item()))
# 定义测试函数
def test(model, test_dataloader):
model.eval()
correct = 0
total = 0
with torch.no_grad():
for batch in test_dataloader:
input_ids, attention_mask, labels = batch
outputs = model(input_ids, attention_mask=attention_mask)
logits = outputs.logits
_, predicted = torch.max(logits, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
print("Accuracy: %.2f%%" % (100 * correct / total))
# --- Preprocessing: tokenize lines and build DataLoaders ----------------------
def _encode_lines(data_lines):
    """Tokenize "text<TAB>label" lines into BERT inputs.

    Each line is padded/truncated to 128 tokens. Returns three parallel
    Python lists: (input_ids, attention_masks, labels).
    """
    input_ids, attention_masks, labels = [], [], []
    for line in data_lines:
        text, label = line.strip().split("\t")
        encoded = tokenizer.encode_plus(text, max_length=128, padding='max_length', truncation=True)
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
        labels.append(int(label))
    return input_ids, attention_masks, labels

def _make_dataloader(input_ids, attention_masks, labels, shuffle):
    """Wrap encoded lists in a TensorDataset-backed DataLoader (batch_size=8)."""
    dataset = torch.utils.data.TensorDataset(
        torch.tensor(input_ids),
        torch.tensor(attention_masks),
        torch.tensor(labels),
    )
    return torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=shuffle)

# Encode both splits with the same helper (the original duplicated this loop).
train_input_ids, train_attention_masks, train_labels = _encode_lines(train_lines)
test_input_ids, test_attention_masks, test_labels = _encode_lines(test_lines)

# Shuffle training batches each epoch; keep test order deterministic.
train_dataloader = _make_dataloader(train_input_ids, train_attention_masks, train_labels, shuffle=True)
test_dataloader = _make_dataloader(test_input_ids, test_attention_masks, test_labels, shuffle=False)

# Fine-tune, then report test accuracy.
train(model, train_dataloader)
test(model, test_dataloader)
```
这个示例代码使用了BERT中文模型进行分类,将数据集分成了80%的训练集和20%的测试集。在训练和测试过程中,使用了PyTorch的DataLoader进行批量处理。您需要根据自己的需求修改代码中的参数,如batch_size、lr等。
阅读全文