用torch写一个命名实体识别的代码
时间: 2023-04-03 13:00:53 浏览: 128
lstm+crf、bilstm+crf 、LSTM CRF pytorch 命名实体识别代码
5星 · 资源好评率100%
可以使用PyTorch和torchtext库来实现命名实体识别(注意:本示例使用的 Field/BucketIterator 属于旧版 torchtext API,在 torchtext 0.9+ 中位于 torchtext.legacy,更新版本已将其移除)。以下是一个简单的代码示例:
```
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torchtext.datasets import SequenceTaggingDataset
from torchtext.data import Field, BucketIterator
# Define data fields.
# batch_first=True so batches come out shaped (batch, seq_len), matching the
# batch_first LSTM in NERModel below.  The original left the Fields at their
# sequence-first default, which would not match the model's expected layout.
TEXT = Field(lower=True, include_lengths=True, batch_first=True)
# No <unk> token for tags: every tag must be known from the training data.
TAGS = Field(unk_token=None, batch_first=True)

# Load the tab-separated tagging dataset (one "token<TAB>tag" pair per line).
train_data, valid_data, test_data = SequenceTaggingDataset.splits(
    path='data',
    train='train.txt',
    validation='valid.txt',
    test='test.txt',
    fields=[('text', TEXT), ('tags', TAGS)],
    separator='\t'
)

# Build vocabularies from the training split only (no leakage from valid/test).
TEXT.build_vocab(train_data)
TAGS.build_vocab(train_data)
# 定义模型
class NERModel(nn.Module):
    """BiLSTM token classifier for named-entity recognition.

    Embeds each token id, encodes the sentence with a bidirectional LSTM,
    and projects every timestep to per-tag scores (logits).
    """

    def __init__(self, vocab_size, tag_size, embedding_dim, hidden_dim):
        super(NERModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # bidirectional => hidden_dim features per direction at each step
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, tag_size)

    def forward(self, text, text_lengths):
        """Return unnormalized per-token tag scores.

        Args:
            text: LongTensor of token ids, shape (batch, seq_len).
            text_lengths: LongTensor of true lengths, shape (batch,); must be
                sorted descending (pack_padded_sequence's enforce_sorted
                default).

        Returns:
            FloatTensor of logits, shape (batch, seq_len, tag_size).
        """
        embedded = self.embedding(text)
        # Pack so the LSTM skips computation on padded positions; lengths
        # must live on CPU for pack_padded_sequence.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.cpu(), batch_first=True)
        output, _ = self.lstm(packed)
        output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
        return self.fc(output)
# Initialize model and optimizer.
model = NERModel(len(TEXT.vocab), len(TAGS.vocab), 100, 128)
optimizer = optim.Adam(model.parameters())

# Bucket sentences of similar length together to minimize padding.
# sort_within_batch=True also satisfies the descending-lengths requirement
# of pack_padded_sequence inside NERModel.forward.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    datasets=(train_data, valid_data, test_data),
    batch_size=32,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True
)

# Index of the tag-side <pad> token: padded positions must not count
# toward the loss (the original averaged loss over padding too).
PAD_IDX = TAGS.vocab.stoi[TAGS.pad_token]

# Train the model.
model.train()
for epoch in range(10):
    for batch in train_iterator:
        text, text_lengths = batch.text
        tags = batch.tags
        optimizer.zero_grad()
        output = model(text, text_lengths)  # (batch, seq_len, tag_size)
        loss = nn.functional.cross_entropy(
            output.view(-1, len(TAGS.vocab)),
            tags.view(-1),
            ignore_index=PAD_IDX,
        )
        loss.backward()
        optimizer.step()
# 测试模型
def predict(model, sentence):
    """Tag a whitespace-tokenized sentence with a trained model.

    Args:
        model: a trained NERModel.
        sentence: raw string; tokens are split on whitespace and lowercased
            to mirror the TEXT field's lower=True preprocessing.

    Returns:
        List of tag strings, one per input token.
    """
    model.eval()
    tokenized = [tok.lower() for tok in sentence.split()]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # (seq_len,) -> (1, seq_len): the model expects a leading batch dim.
    # Original bug: .unsqueeze() was called without the required dim argument.
    tensor = torch.LongTensor(indexed).unsqueeze(0)
    text_lengths = torch.LongTensor([len(indexed)])
    with torch.no_grad():  # inference only; don't build the autograd graph
        output = model(tensor, text_lengths)
    _, predicted = torch.max(output, 2)  # argmax over the tag dimension
    # Original bug: `predicted[]` is a syntax error; take batch row 0.
    tags = [TAGS.vocab.itos[p.item()] for p in predicted[0]]
    return tags
# Example: tag one sentence and print the predicted label for each token.
sentence = "John lives in New York"
tags = predict(model, sentence)
print(tags)
```
这个代码示例使用了一个双向LSTM来对输入的文本进行编码,并使用一个全连接层来预测每个词的标签。在训练过程中,使用交叉熵损失函数来优化模型。在测试过程中,使用训练好的模型来预测输入文本中每个词的标签。
阅读全文