Implementation principles of the Transformer
The Transformer is a neural network architecture built around the self-attention mechanism, used mainly for natural language processing tasks such as machine translation and text summarization. Its central idea is to treat both the input sequence and the output sequence as sequences of word vectors and to process them with stacked layers of self-attention and feed-forward networks. The main components are as follows:
1. Input embedding layer: maps every token in the input sequence to a vector. These vectors can be pretrained word embeddings or randomly initialized and learned during training.
2. Positional encoding layer: because the Transformer does not use recurrence, it needs another way to capture word order. The positional encoding layer encodes each position as a vector and adds it to the corresponding word embedding.
3. Multi-head self-attention layer: the core of the Transformer. It computes a similarity score between each word and every other word, turns those scores into weights, and forms each word's new representation as the weighted sum of all value vectors. To increase expressive power, the layer computes several attention heads in parallel, each with its own learned linear projections (a minimal sketch of this computation appears right after this list).
4. Feed-forward network layer: applies a position-wise non-linear transformation to each word's representation, further increasing the model's expressive power.
5. Residual connections and layer normalization: to mitigate vanishing and exploding gradients, the Transformer wraps every sub-layer with a residual connection followed by layer normalization.
6. Encoder and decoder: the Transformer consists of an encoder and a decoder. The encoder turns the input sequence into a sequence of context-aware vector representations, and the decoder generates the output sequence from them. The decoder additionally uses an encoder-decoder attention mechanism that takes a weighted sum over the encoder outputs to build a context vector at every decoding step.
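To make the weighting described in step 3 concrete, here is a minimal sketch of scaled dot-product attention for a single head. It is an illustration only; the function name, tensor shapes, and toy inputs are chosen for this sketch and are not part of the full implementation shown further below.
```python
import math

import torch
import torch.nn.functional as F

# Minimal single-head scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.
def scaled_dot_product_attention(Q, K, V, mask=None):
    d_k = Q.size(-1)
    scores = Q @ K.transpose(-2, -1) / math.sqrt(d_k)          # pairwise similarity between positions
    if mask is not None:
        scores = scores.masked_fill(mask == 0, float('-inf'))  # block disallowed positions
    weights = F.softmax(scores, dim=-1)                        # attention weights sum to 1 per query
    return weights @ V                                         # weighted sum of value vectors

# Toy usage with random tensors: 5 positions, 16-dimensional vectors.
Q = K = V = torch.randn(5, 16)
out = scaled_dot_product_attention(Q, K, V)
print(out.shape)  # torch.Size([5, 16])
```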
Below is a simple example implementation of a Transformer:
```python
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
class Transformer(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, num_heads):
        super(Transformer, self).__init__()
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.position_encoding = PositionEncoding(hidden_dim)
        self.encoder_layers = nn.ModuleList([EncoderLayer(hidden_dim, num_heads) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(hidden_dim, num_heads) for _ in range(num_layers)])
        self.fc = nn.Linear(hidden_dim, input_dim)

    def forward(self, src, trg):
        # The encoder may attend to every source position; padding masks are omitted for simplicity.
        src_mask = None
        # Causal mask so each target position only attends to itself and earlier positions.
        trg_mask = self.generate_square_subsequent_mask(trg.size(1)).to(trg.device)
        src_embedded = self.position_encoding(self.embedding(src))
        trg_embedded = self.position_encoding(self.embedding(trg))
        for layer in self.encoder_layers:
            src_embedded = layer(src_embedded, src_mask)
        for layer in self.decoder_layers:
            trg_embedded = layer(trg_embedded, src_embedded, trg_mask, src_mask)
        output = self.fc(trg_embedded)
        return output

    def generate_square_subsequent_mask(self, size):
        # Lower-triangular boolean mask: True where attention is allowed.
        return torch.tril(torch.ones(size, size, dtype=torch.bool))
class PositionEncoding(nn.Module):
    def __init__(self, hidden_dim, max_len=5000):
        super(PositionEncoding, self).__init__()
        self.dropout = nn.Dropout(p=0.1)
        # Precompute the sinusoidal position encodings once.
        pe = torch.zeros(max_len, hidden_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, hidden_dim, 2).float() * (-math.log(10000.0) / hidden_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # shape (1, max_len, hidden_dim) to match batch-first inputs
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (batch, seq_len, hidden_dim); add the encoding for the first seq_len positions.
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)
class EncoderLayer(nn.Module):
    def __init__(self, hidden_dim, num_heads):
        super(EncoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(hidden_dim, num_heads)
        self.feed_forward = FeedForward(hidden_dim)
        self.norm1 = nn.LayerNorm(hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)

    def forward(self, x, mask):
        # Self-attention sub-layer with residual connection and layer normalization.
        x = self.norm1(x + self.self_attention(x, x, x, mask))
        # Feed-forward sub-layer with residual connection and layer normalization.
        x = self.norm2(x + self.feed_forward(x))
        return x
class DecoderLayer(nn.Module):
    def __init__(self, hidden_dim, num_heads):
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(hidden_dim, num_heads)
        self.encoder_attention = MultiHeadAttention(hidden_dim, num_heads)
        self.feed_forward = FeedForward(hidden_dim)
        self.norm1 = nn.LayerNorm(hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)
        self.norm3 = nn.LayerNorm(hidden_dim)

    def forward(self, x, encoder_output, trg_mask, src_mask):
        # Masked self-attention over the target sequence.
        x = self.norm1(x + self.self_attention(x, x, x, trg_mask))
        # Encoder-decoder attention: queries from the decoder, keys/values from the encoder output.
        x = self.norm2(x + self.encoder_attention(x, encoder_output, encoder_output, src_mask))
        # Position-wise feed-forward network.
        x = self.norm3(x + self.feed_forward(x))
        return x
class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_dim, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.head_dim = hidden_dim // num_heads
        self.query = nn.Linear(hidden_dim, hidden_dim)
        self.key = nn.Linear(hidden_dim, hidden_dim)
        self.value = nn.Linear(hidden_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        # Project and split into heads: (batch, num_heads, seq_len, head_dim).
        Q = self.query(query).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.key(key).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.value(value).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        # Scaled dot-product attention scores.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if mask is not None:
            # Disallowed positions get -inf so their softmax weight becomes zero.
            scores = scores.masked_fill(mask == 0, float('-inf'))
        attention = F.softmax(scores, dim=-1)
        x = torch.matmul(attention, V)
        # Concatenate the heads and apply the output projection.
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.head_dim)
        x = self.fc(x)
        return x
class FeedForward(nn.Module):
    def __init__(self, hidden_dim, dropout=0.1):
        super(FeedForward, self).__init__()
        self.fc1 = nn.Linear(hidden_dim, hidden_dim * 4)
        self.fc2 = nn.Linear(hidden_dim * 4, hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x
```
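For completeness, here is one way the model above could be exercised end to end. The vocabulary size, hidden dimension, layer and head counts, and random token IDs are arbitrary illustrative values, not settings taken from the original example.
```python
# Illustrative hyperparameters.
vocab_size, hidden_dim, num_layers, num_heads = 1000, 64, 2, 4
model = Transformer(vocab_size, hidden_dim, num_layers, num_heads)

# Dummy batch: 2 source sequences of length 10 and 2 target sequences of length 8.
src = torch.randint(0, vocab_size, (2, 10))
trg = torch.randint(0, vocab_size, (2, 8))

logits = model(src, trg)
print(logits.shape)  # torch.Size([2, 8, 1000]): one score per vocabulary entry at each target position
```
During training, these output logits would typically be compared against the shifted target sequence with a cross-entropy loss.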