建立transformer模型基于9万+商品信息进行训练和测试对商品品类自动识别,例如以下商品“抗菌密实袋 极货 12pcs 盒”识别为“一次性塑料制品”,python代码示例
时间: 2023-12-26 19:04:06 浏览: 79
好的,以下是一个基于 transformers 库和 PyTorch 框架的 Python 代码示例,可以用于训练和测试 transformer 模型对商品品类进行自动识别:
```python
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
# 定义数据集类,用于加载和处理训练和测试数据
class ProductDataset(Dataset):
    """Dataset of (label, text) pairs read from a tab-separated file.

    Each non-empty line of ``data_file`` has the form ``label<TAB>text``,
    where ``label`` is an integer class id. Tokenization happens lazily in
    ``__getitem__`` so the whole corpus is never encoded up front.
    """

    def __init__(self, data_file, tokenizer):
        # Load all (label, text) pairs into memory; 90k short product
        # titles fit comfortably.
        self.data = []
        with open(data_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Skip blank lines instead of crashing on the unpack.
                    continue
                # maxsplit=1 keeps tabs inside the product text intact;
                # a bare split('\t') would raise ValueError on such lines.
                label, text = line.split('\t', 1)
                self.data.append((label, text))
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        label, text = self.data[idx]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'label': torch.tensor(int(label), dtype=torch.long)
        }
# 定义训练函数
def train(model, train_loader, optimizer, device):
    """Run a single training epoch and return the mean per-batch loss."""
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        # Move every tensor of the batch onto the target device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['label'].to(device)

        # Standard step: clear grads, forward (model computes the loss
        # because `labels` is supplied), backprop, update.
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        labels=labels)
        batch_loss = outputs.loss
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(train_loader)
# 定义评估函数
def evaluate(model, eval_loader, device):
    """Return classification accuracy of `model` over `eval_loader`'s dataset."""
    model.eval()
    correct = 0
    # No gradients needed for scoring; saves memory and time.
    with torch.no_grad():
        for batch in eval_loader:
            labels = batch['label'].to(device)
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                token_type_ids=batch['token_type_ids'].to(device),
                labels=labels
            )
            # Predicted class = highest logit per row.
            predictions = outputs.logits.argmax(dim=1)
            correct += (predictions == labels).sum().item()
    # Divide by the full dataset size, not the number of batches.
    return correct / len(eval_loader.dataset)
# Load the tokenizer and a BERT encoder with a 10-way classification head.
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=10)

# Build the train/eval datasets and loaders (shuffle only during training).
train_dataset = ProductDataset('train.txt', tokenizer)
eval_dataset = ProductDataset('eval.txt', tokenizer)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=32)

# Hyperparameters and optimizer. torch.optim.AdamW replaces
# transformers.AdamW, which is deprecated and removed in recent
# transformers releases.
epochs = 10
learning_rate = 2e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Run on GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Train, then report held-out accuracy after every epoch.
for epoch in range(1, epochs+1):
    train_loss = train(model, train_loader, optimizer, device)
    eval_acc = evaluate(model, eval_loader, device)
    print(f'Epoch {epoch} - train_loss: {train_loss:.4f} - eval_acc: {eval_acc:.4f}')
# Predict the category id for a single product title.
text = '抗菌密实袋 极货 12pcs 盒'
inputs = tokenizer.encode_plus(
    text,
    add_special_tokens=True,
    max_length=128,
    padding='max_length',
    truncation=True,
    return_token_type_ids=True
)
# unsqueeze(0) adds the batch dimension the model expects.
input_ids = torch.tensor(inputs['input_ids'], dtype=torch.long).unsqueeze(0).to(device)
attention_mask = torch.tensor(inputs['attention_mask'], dtype=torch.long).unsqueeze(0).to(device)
token_type_ids = torch.tensor(inputs['token_type_ids'], dtype=torch.long).unsqueeze(0).to(device)

# Inference: switch off dropout and gradient tracking explicitly instead of
# relying on whatever mode the last evaluate() call left the model in.
model.eval()
with torch.no_grad():
    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids
    )
pred = torch.argmax(outputs.logits, dim=1).item()
print(f'预测结果:{pred}')
```
在上面的示例代码中,我们定义了一个 `ProductDataset` 类来加载和处理训练和测试数据,并使用 `AutoTokenizer` 和 `AutoModelForSequenceClassification` 类加载了预训练的 BERT 模型。我们使用 `train` 和 `evaluate` 函数分别进行训练和评估,并使用 `AdamW` 优化器进行优化。最后,我们使用训练好的模型对指定的商品信息进行预测,并输出预测结果。
阅读全文