Write a Transformer model
Sure, here is a simple Transformer model implemented in PyTorch:
```python
import math

import torch
import torch.nn as nn


class TransformerModel(nn.Module):
    """A minimal Transformer encoder that maps token indices to vocabulary logits."""

    def __init__(self, input_size, hidden_size, num_layers, num_heads):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.positional_encoding = PositionalEncoding(hidden_size)
        self.layers = nn.ModuleList(
            [TransformerLayer(hidden_size, num_heads) for _ in range(num_layers)]
        )
        self.fc = nn.Linear(hidden_size, input_size)

    def forward(self, x):
        # x: (batch, seq_len) token indices
        x = self.embedding(x)            # (batch, seq_len, hidden_size)
        x = self.positional_encoding(x)  # add position information
        for layer in self.layers:
            x = layer(x)
        x = self.fc(x)                   # project back to the vocabulary
        return x


class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding (maximum sequence length 1000)."""

    def __init__(self, hidden_size):
        super(PositionalEncoding, self).__init__()
        self.hidden_size = hidden_size
        pe = torch.zeros(1000, hidden_size)
        position = torch.arange(0, 1000, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, hidden_size, 2).float() * (-math.log(10000.0) / hidden_size)
        )
        pe[:, 0::2] = torch.sin(position * div_term)  # even dimensions
        pe[:, 1::2] = torch.cos(position * div_term)  # odd dimensions
        pe = pe.unsqueeze(0)                          # (1, 1000, hidden_size)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Add the encodings for the first seq_len positions.
        x = x + self.pe[:, :x.size(1)]
        return x


class TransformerLayer(nn.Module):
    """One pre-norm encoder block: self-attention and feed-forward, each with a residual connection."""

    def __init__(self, hidden_size, num_heads):
        super(TransformerLayer, self).__init__()
        self.self_attention = MultiHeadAttention(hidden_size, num_heads)
        self.feed_forward = nn.Sequential(
            nn.Linear(hidden_size, hidden_size * 4),
            nn.ReLU(),
            nn.Linear(hidden_size * 4, hidden_size),
        )
        self.layer_norm1 = nn.LayerNorm(hidden_size)
        self.layer_norm2 = nn.LayerNorm(hidden_size)

    def forward(self, x):
        # Self-attention sub-layer (pre-norm + residual)
        residual = x
        x = self.layer_norm1(x)
        x = self.self_attention(x)
        x = x + residual
        # Feed-forward sub-layer (pre-norm + residual)
        residual = x
        x = self.layer_norm2(x)
        x = self.feed_forward(x)
        x = x + residual
        return x


class MultiHeadAttention(nn.Module):
    """Scaled dot-product self-attention with num_heads heads."""

    def __init__(self, hidden_size, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.q_linear = nn.Linear(hidden_size, hidden_size)
        self.v_linear = nn.Linear(hidden_size, hidden_size)
        self.k_linear = nn.Linear(hidden_size, hidden_size)
        self.out_linear = nn.Linear(hidden_size, hidden_size)

    def forward(self, x):
        batch_size = x.size(0)
        head_dim = self.hidden_size // self.num_heads
        # Project inputs and split into heads: (batch, num_heads, seq_len, head_dim)
        q = self.q_linear(x).view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
        k = self.k_linear(x).view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
        v = self.v_linear(x).view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
        # Scaled dot-product attention
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(head_dim)
        attention = torch.softmax(scores, dim=-1)
        # Recombine heads and project back to hidden_size
        x = torch.matmul(attention, v).transpose(1, 2).contiguous().view(batch_size, -1, self.hidden_size)
        x = self.out_linear(x)
        return x
```
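As a quick sanity check, here is a usage sketch; the vocabulary size, sequence length, and hyperparameters below are arbitrary example values (not from the original post), and the classes above are assumed to be defined:

```python
# Illustrative hyperparameters; adjust to your task.
model = TransformerModel(input_size=10000, hidden_size=512, num_layers=6, num_heads=8)

tokens = torch.randint(0, 10000, (2, 20))  # batch of 2 sequences, 20 tokens each
logits = model(tokens)                     # shape: (2, 20, 10000)
print(logits.shape)
```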
This implements a simple Transformer model, consisting of the following components:
- An embedding layer
- A positional-encoding layer
- A stack of Transformer layers
- A multi-head attention layer (see the formula after this list)
- A feed-forward network layer
- LayerNorm layers
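For reference, each head in `MultiHeadAttention` computes the standard scaled dot-product attention, which is exactly what the `scores` and `attention` lines in the code implement:

$$
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V,
\qquad d_k = \frac{\text{hidden\_size}}{\text{num\_heads}}
$$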
Each Transformer layer in the stack consists of a multi-head attention sub-layer and a feed-forward sub-layer, with a LayerNorm applied before each sub-layer and a residual connection around it (pre-norm). Note that PyTorch does ship ready-made Transformer building blocks (`nn.TransformerEncoderLayer`, `nn.TransformerEncoder`, `nn.Transformer`); the components are assembled by hand here to make each piece explicit. A sketch using the built-in modules is shown below.
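This is a minimal sketch, not part of the original post: it reuses the `PositionalEncoding` class defined above, and the `batch_first`/`norm_first` flags (available in recent PyTorch releases) are chosen to roughly match the hand-written batch-first, pre-norm layers.

```python
import torch.nn as nn

class BuiltinTransformerModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_heads):
        super().__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.positional_encoding = PositionalEncoding(hidden_size)  # reuses the class above
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=num_heads,
            dim_feedforward=hidden_size * 4,
            batch_first=True,   # inputs are (batch, seq_len, hidden_size)
            norm_first=True,    # pre-norm, matching the hand-written TransformerLayer
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(hidden_size, input_size)

    def forward(self, x):
        x = self.positional_encoding(self.embedding(x))
        x = self.encoder(x)
        return self.fc(x)
```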