建立transformer模型根据商品信息对商品进行识别,例如:将“实惠装保鲜袋 芳草地 2030cm+2535cm+30*40cm”识别为一次性塑料制品,python代码示例
时间: 2024-02-13 09:00:03 浏览: 25
可以使用transformer模型进行文本分类,将商品信息作为文本输入,输出为商品类别。以下是一个简单的Python代码示例:
首先,需要安装transformers和torch库:
```bash
pip install transformers torch
```
然后,可以使用Hugging Face提供的预训练模型(如bert-base-chinese)和数据集(如THUCNews)进行微调,也可以自己构建模型和数据集。
在这里,我们使用bert-base-chinese模型和自己构建的商品数据集进行微调。
商品数据集包含两个文件:train.tsv和test.tsv,每个文件包含两列:商品信息和类别。例如:
```
商品信息 类别
实惠装保鲜袋 芳草地 2030cm+2535cm+30*40cm 一次性塑料制品
...
```
接下来,我们加载数据集并进行预处理:
```python
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
class ProductDataset(Dataset):
    """Dataset of (product description, category) pairs read from a TSV file.

    Each line of the file is ``text<TAB>label``; malformed lines are skipped.
    NOTE(review): a header row such as "商品信息\t类别" would also pass the
    two-column check and be ingested as data — confirm the input files have
    no header, or strip it upstream.
    """

    def __init__(self, file_path, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.data = []
        self.labels = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split('\t')
                if len(parts) != 2:
                    continue  # skip blank / malformed lines
                text, label = parts
                self.data.append(text)
                self.labels.append(label)
        # Bug fix: label2id / id2label were referenced (here and by callers)
        # but never defined. Build a deterministic mapping by sorting the
        # unique label strings.
        self.label2id = {label: i
                         for i, label in enumerate(sorted(set(self.labels)))}
        self.id2label = {i: label for label, i in self.label2id.items()}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]
        label = self.labels[idx]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True
        )
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'label': torch.tensor(self.label2id[label], dtype=torch.long)
        }
# Shared sequence-length cap and tokenizer, reused by both splits and by
# predict() below.
max_len = 128
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# Materialise the train/test datasets from the tab-separated label files.
train_dataset = ProductDataset('train.tsv', tokenizer, max_len)
test_dataset = ProductDataset('test.tsv', tokenizer, max_len)
```
接下来,我们定义模型和训练过程:
```python
# Bug fix: `from transformers import AdamW` is deprecated and removed in
# modern transformers releases; torch.optim.AdamW is the drop-in replacement.
from torch.optim import AdamW
from transformers import BertForSequenceClassification
from sklearn.metrics import accuracy_score

# Classification head on top of bert-base-chinese, one logit per category.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-chinese', num_labels=len(train_dataset.label2id))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
optimizer = AdamW(model.parameters(), lr=2e-5)

for epoch in range(5):
    # ---- training pass ----
    model.train()
    train_loss = 0
    train_acc = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['label'].to(device)
        # Passing `labels` makes the model compute cross-entropy internally.
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        # Per-batch accuracy; averaged over batches below (approximate when
        # the last batch is smaller than the rest).
        train_acc += accuracy_score(labels.cpu(), logits.argmax(dim=1).cpu())
    train_loss /= len(train_loader)
    train_acc /= len(train_loader)

    # ---- evaluation pass ----
    model.eval()
    test_loss = 0
    test_acc = 0
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['label'].to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask,
                            token_type_ids=token_type_ids, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            test_loss += loss.item()
            test_acc += accuracy_score(labels.cpu(), logits.argmax(dim=1).cpu())
    test_loss /= len(test_loader)
    test_acc /= len(test_loader)

    print(f'Epoch {epoch + 1}:')
    print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
    print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')
```
最后,我们可以使用模型进行预测:
```python
def predict(text):
    """Classify one product description and return its category string."""
    encoded = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True
    )
    # Add a leading batch dimension and move every tensor to the model device.
    batch = {
        key: torch.tensor(encoded[key], dtype=torch.long).unsqueeze(0).to(device)
        for key in ('input_ids', 'attention_mask', 'token_type_ids')
    }
    with torch.no_grad():
        outputs = model(batch['input_ids'],
                        attention_mask=batch['attention_mask'],
                        token_type_ids=batch['token_type_ids'])
    # Highest-scoring logit index -> human-readable label.
    predicted_id = outputs.logits.argmax(dim=1).item()
    return train_dataset.id2label[predicted_id]

text = '实惠装保鲜袋 芳草地 2030cm+2535cm+30*40cm'
label = predict(text)
print(label)  # expected output: 一次性塑料制品
```
这就是一个简单的基于transformer模型的商品识别系统。