将一个txt文件变成训练集和测试集后用bert分类
时间: 2023-08-21 19:06:23 浏览: 48
首先,需要将txt文件读取为一个DataFrame对象,其中包含文本数据和标签数据。可以使用pandas库来实现:
```python
import pandas as pd

# Read the raw text file: each line is one sample.
with open('data.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# One-column DataFrame holding the raw lines.
df = pd.DataFrame({'text': lines})

# The label is assumed to be the first two characters of every line
# (TODO confirm against the actual file format).  Split it off, and strip
# the trailing newline from the remaining text so it does not end up in
# the tokenizer input.
df['label'] = df['text'].str.slice(0, 2)
df['text'] = df['text'].str.slice(2).str.rstrip('\n')
```
接下来,可以将DataFrame对象分成训练集和测试集。可以使用sklearn库的train_test_split函数来实现:
```python
from sklearn.model_selection import train_test_split

# Hold out one fifth of the rows for evaluation; the fixed seed makes the
# split reproducible across runs.
_holdout_fraction = 0.2
train_df, test_df = train_test_split(df, test_size=_holdout_fraction, random_state=42)
```
然后,可以使用transformers库中的BertTokenizer和BertForSequenceClassification来对文本进行编码和分类。首先,需要下载预训练的BERT模型:
```python
from transformers import BertTokenizer, BertForSequenceClassification

# Download (or load from cache) the pretrained checkpoint; reuse the same
# name for tokenizer and classifier so they always stay in sync.
# NOTE(review): for Chinese text 'bert-base-chinese' is likely the better
# checkpoint — confirm against the actual data.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
# num_labels sets the size of the classification head (2 classes assumed here).
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
```
num_labels是标签的数量,这里假设有2个标签。注意:标签必须先编码为 0 到 num_labels-1 的整数,字符串标签无法直接转换为 torch.tensor。接下来,需要将文本编码成BERT模型可以处理的格式:
```python
import torch

# Tokenize both splits; truncation/padding yield equal-length sequences so
# they can be batched.
train_encodings = tokenizer(train_df['text'].tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_df['text'].tolist(), truncation=True, padding=True)

# BUG FIX: the labels extracted earlier are *strings* (the first two
# characters of each line), and torch.tensor() cannot hold strings.
# Map each distinct label to an integer id first.  The mapping is built
# from the training split and reused for the test split so ids stay
# consistent between the two.
label_to_id = {lab: i for i, lab in enumerate(sorted(set(train_df['label'])))}
train_labels = torch.tensor([label_to_id[lab] for lab in train_df['label']])
test_labels = torch.tensor([label_to_id[lab] for lab in test_df['label']])


class TextClassificationDataset(torch.utils.data.Dataset):
    """Wraps tokenizer output and integer class ids as a PyTorch dataset."""

    def __init__(self, encodings, labels):
        self.encodings = encodings  # dict of per-field lists from the tokenizer
        self.labels = labels        # 1-D tensor of integer class ids

    def __getitem__(self, idx):
        # Turn each encoding field of this sample into a tensor and attach
        # the label under the key BertForSequenceClassification expects.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = TextClassificationDataset(train_encodings, train_labels)
test_dataset = TextClassificationDataset(test_encodings, test_labels)
```
最后,可以使用PyTorch来训练和评估模型:
```python
from torch.utils.data import DataLoader

# --- Training ---
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
# No separate criterion is needed: BertForSequenceClassification computes
# the cross-entropy loss internally whenever `labels` is passed (the
# original `loss_fn = torch.nn.CrossEntropyLoss()` was never used).

model.train()  # enable dropout for fine-tuning
for epoch in range(3):
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        outputs.loss.backward()
        optimizer.step()

# --- Evaluation ---
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
model.eval()  # disable dropout for deterministic predictions
correct = 0
total = 0
for batch in test_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    predictions = torch.argmax(outputs.logits, dim=1)
    correct += (predictions == labels).sum().item()
    total += labels.size(0)

print(f'Accuracy: {correct / total}')
```