self.register_buffer("attn_mask", attn_mask)
时间: 2023-06-19 18:07:04 浏览: 61
这段代码是 PyTorch 中用来注册一个缓存(buffer)的方法,其中 "attn_mask" 是缓存的名称,attn_mask 是被缓存的数据。缓存是一种 PyTorch 中的特殊张量,其不会参与模型参数的更新,但会被保存在模型的状态字典(state_dict)中。在模型的前向传播过程中,我们可以通过访问缓存来获取一些需要保存的中间结果,如此处的注意力掩码(attention mask)。缓存的注册通常在模型的初始化方法中进行,例如在 nn.Module 的子类中的 __init__() 方法中进行。
相关问题
基于pytorch的Transformer代码实例
Transformer 是一种非常流行的深度学习模型，被广泛应用于自然语言处理和语音识别等领域。下面是一个使用 PyTorch 实现 Transformer 模型的简单代码实例：
```python
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention over batch-first inputs.

    Expects query/key/value of shape (batch, seq, d_model) and returns a
    tensor of the same shape. No masking is applied.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        # Per-head feature size; assumes d_model is divisible by num_heads.
        self.depth = d_model // num_heads
        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)
        self.fc = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, query, key, value):
        """Return softmax(Q K^T / sqrt(depth)) V."""
        matmul_qk = torch.matmul(query, key.transpose(-2, -1))
        dk = torch.tensor(self.depth, dtype=torch.float32)
        scaled_attention_logits = matmul_qk / torch.sqrt(dk)
        attention_weights = F.softmax(scaled_attention_logits, dim=-1)
        return torch.matmul(attention_weights, value)

    def split_heads(self, x, batch_size):
        """Reshape (batch, seq, d_model) -> (batch, heads, seq, depth)."""
        x = x.reshape(batch_size, -1, self.num_heads, self.depth)
        return x.transpose(1, 2)

    def forward(self, query, key, value):
        # BUG FIX: the original did `batch_size = query.shape`, binding the
        # whole shape tuple; the subsequent reshape calls then fail. Only the
        # batch dimension is needed here.
        batch_size = query.shape[0]
        query = self.wq(query)
        key = self.wk(key)
        value = self.wv(value)
        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)
        scaled_attention = self.scaled_dot_product_attention(query, key, value)
        # (batch, heads, seq, depth) -> (batch, seq, heads, depth) -> merge heads.
        scaled_attention = scaled_attention.transpose(1, 2)
        concat_attention = scaled_attention.reshape(batch_size, -1, self.d_model)
        return self.fc(concat_attention)
class TransformerBlock(nn.Module):
    """Post-norm Transformer encoder layer: self-attention then a position-wise FFN.

    Each sub-layer is wrapped in dropout + residual connection + LayerNorm.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, dff),
            nn.ReLU(),
            nn.Linear(dff, d_model),
        )
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(rate)
        self.dropout2 = nn.Dropout(rate)

    def forward(self, x):
        # Self-attention sub-layer: residual add, then normalize.
        out1 = self.layernorm1(x + self.dropout1(self.mha(x, x, x)))
        # Feed-forward sub-layer: same residual + post-norm pattern.
        return self.layernorm2(out1 + self.dropout2(self.ffn(out1)))
class Transformer(nn.Module):
    """Encoder-decoder Transformer over token-id sequences.

    NOTE(review): input_mask/target_mask are accepted but unused because
    TransformerBlock.forward takes no mask argument; they are kept for
    interface compatibility. Also note the decoder stack never attends to
    the encoder output (blocks are self-attention only), so this is a
    simplified, not a faithful, seq2seq Transformer.
    """

    def __init__(self,
                 input_vocab_size,
                 target_vocab_size,
                 max_len_input,
                 max_len_target,
                 num_layers=4,
                 d_model=128,
                 num_heads=8,
                 dff=512,
                 rate=0.1):
        super(Transformer, self).__init__()
        # BUG FIX: forward() reads self.d_model and self.num_layers, but the
        # original never stored them, so the first call raised AttributeError.
        self.d_model = d_model
        self.num_layers = num_layers
        self.encoder_embedding = nn.Embedding(input_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(target_vocab_size, d_model)
        self.pos_encoding_input = PositionalEncoding(max_len_input, d_model)
        self.pos_encoding_target = PositionalEncoding(max_len_target, d_model)
        self.encoder_layers = nn.ModuleList(
            [TransformerBlock(d_model, num_heads, dff, rate)
             for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList(
            [TransformerBlock(d_model, num_heads, dff, rate)
             for _ in range(num_layers)])
        self.final_layer = nn.Linear(d_model, target_vocab_size)

    def forward(self,
                input_seq,
                target_seq,
                input_mask=None,
                target_mask=None):
        # Scale embeddings by sqrt(d_model) as in "Attention Is All You Need".
        # BUG FIX: the original called torch.sqrt on an integer tensor (which
        # raises for Long dtype) and then did `embd += pos_enc(embd)`, adding
        # the embedding twice since PositionalEncoding already returns x + pe.
        scale = self.d_model ** 0.5
        input_seq_embd = self.pos_encoding_input(
            self.encoder_embedding(input_seq) * scale)
        target_seq_embd = self.pos_encoding_target(
            self.decoder_embedding(target_seq) * scale)
        enc_output = input_seq_embd
        for layer in self.encoder_layers:
            enc_output = layer(enc_output)
        dec_output = target_seq_embd
        for layer in self.decoder_layers:
            dec_output = layer(dec_output)
        # Project decoder states to vocabulary logits.
        return self.final_layer(dec_output)
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding for batch-first (batch, seq, d_model) input.

    Assumes d_model is even (sin/cos pairs fill alternating channels).
    """

    def __init__(self, max_len, d_model):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        # Geometric frequency progression from the Transformer paper.
        div_term = torch.exp(torch.arange(0, d_model, 2).float()
                             * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # BUG FIX: the original stored pe as (max_len, 1, d_model) and sliced
        # it by x.size(0) -- the BATCH dimension -- which only makes sense for
        # seq-first input, while the Transformer above feeds batch-first
        # embeddings. Store as (1, max_len, d_model) and slice by sequence
        # length instead; the leading 1 broadcasts over the batch.
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Out-of-place add: the original `x += ...` mutated the caller's tensor.
        return x + self.pe[:, :x.size(1), :]
```
这个代码实例中包括了 Multi-Head Attention、Transformer Block 和 Transformer 等模块,用于实现一个 Transformer 模型。你可以根据需要修改参数和模型结构来适应你的应用场景。
informer完整代码
Informer是一种用于时间序列预测的神经网络模型,其主要特点是使用了Transformer架构。以下是Informer的完整代码实现。
首先,我们需要导入所需的库:
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
```
接下来,我们定义Informer的主体模型类:
```python
class Informer(nn.Module):
    """Informer-style encoder-decoder for time-series forecasting.

    Thin orchestration module: projects raw features into d_model, adds
    positional encodings, runs the encoder/decoder stacks, and maps the
    decoder states to a forecast via the prediction head.
    """

    def __init__(self, enc_in, dec_in, c_out=1, seq_len=96, label_len=48,
                 attn='prob', embed='fixed', freq='h', d_model=512, n_heads=8,
                 e_layers=2, d_layers=1, d_ff=2048, factor=5,
                 activation='gelu', dropout=0.05, attn_dropout=0.0,
                 embed_dropout=0.0):
        super(Informer, self).__init__()
        # Feature-to-model-dimension projections for both streams.
        self.embed_in = nn.Linear(enc_in, d_model)
        self.embed_out = nn.Linear(dec_in, d_model)
        # Shared sinusoidal position table.
        self.pos_enc = PositionalEncoding(d_model, seq_len)
        # Attention stacks.
        self.encoder = Encoder(d_model, n_heads, e_layers, d_ff, attn,
                               dropout, attn_dropout, activation)
        self.decoder = Decoder(d_model, n_heads, d_layers, d_ff, attn,
                               dropout, attn_dropout, activation, factor)
        # Final forecast projection.
        self.prediction_head = PredictionHead(label_len, c_out, d_model,
                                              freq, embed, dropout,
                                              embed_dropout)

    def forward(self, x_enc, x_dec, x_mask=None, x_dec_mask=None,
                x_pos=None, x_dec_pos=None):
        # Embed + position-encode both streams.
        enc_states = self.pos_enc(self.embed_in(x_enc), x_pos)
        dec_states = self.pos_enc(self.embed_out(x_dec), x_dec_pos)
        # Encode, then decode with cross-attention over the encoder output.
        enc_states = self.encoder(enc_states, x_mask)
        dec_states = self.decoder(dec_states, enc_states, x_mask, x_dec_mask)
        return self.prediction_head(dec_states)
```
其中,`enc_in`是Encoder输入的维度,`dec_in`是Decoder输入的维度,`c_out`是输出的维度,`seq_len`是序列长度,`label_len`是预测的长度,`attn`是Attention机制的类型,`embed`是Embedding的类型,`freq`是时间序列的采样频率,`d_model`是Transformer中的Hidden Size,`n_heads`是Multi-Head Attention中的Head数,`e_layers`是Encoder中的Encoder Layer数,`d_layers`是Decoder中的Decoder Layer数,`d_ff`是Feed Forward网络的维度,`factor`是Decoder中的Attention Mask的因子,`activation`是激活函数,`dropout`是Dropout概率,`attn_dropout`是Attention Dropout概率,`embed_dropout`是Embedding Dropout概率。
我们还需要定义Positional Encoding的类:
```python
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding with optional explicit position indices.

    Built for batch-first input (batch, seq, d_model); assumes d_model is even.
    """

    def __init__(self, d_model, seq_len):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(seq_len, d_model)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # Geometric frequency progression from the Transformer paper.
        div_term = torch.exp(torch.arange(0, d_model, 2).float()
                             * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, seq_len, d_model); broadcasts over batch
        self.register_buffer('pe', pe)

    def forward(self, x, pos=None):
        """Add positional encodings to x.

        `pos` selects which positions to use (any index accepted by
        `self.pe[:, pos, :]`). BUG FIX: Informer.forward passes pos=None by
        default, and `self.pe[:, None, :]` would insert a new axis instead of
        selecting positions; with pos omitted/None we now fall back to the
        first x.size(1) positions.
        """
        if pos is None:
            return x + self.pe[:, :x.size(1), :]
        return x + self.pe[:, pos, :]
```
其中,`d_model`是Transformer中的Hidden Size,`seq_len`是序列长度。
接下来,我们定义Encoder和Decoder的类:
```python
class Encoder(nn.Module):
    """Stack of EncoderLayer modules followed by a final LayerNorm."""

    def __init__(self, d_model, n_heads, e_layers, d_ff, attn='prob',
                 dropout=0.05, attn_dropout=0.0, activation='gelu'):
        super(Encoder, self).__init__()
        blocks = [EncoderLayer(d_model, n_heads, d_ff, attn, dropout,
                               attn_dropout, activation)
                  for _ in range(e_layers)]
        self.layers = nn.ModuleList(blocks)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)
class Decoder(nn.Module):
    """Stack of DecoderLayer modules followed by a final LayerNorm."""

    def __init__(self, d_model, n_heads, d_layers, d_ff, attn='prob',
                 dropout=0.05, attn_dropout=0.0, activation='gelu', factor=5):
        super(Decoder, self).__init__()
        blocks = [DecoderLayer(d_model, n_heads, d_ff, attn, dropout,
                               attn_dropout, activation, factor)
                  for _ in range(d_layers)]
        self.layers = nn.ModuleList(blocks)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, enc_out, mask=None, dec_mask=None):
        hidden = x
        for block in self.layers:
            hidden = block(hidden, enc_out, mask, dec_mask)
        return self.norm(hidden)
```
其中,`d_model`是Transformer中的Hidden Size,`n_heads`是Multi-Head Attention中的Head数,`e_layers`是Encoder中的Encoder Layer数,`d_layers`是Decoder中的Decoder Layer数,`d_ff`是Feed Forward网络的维度,`attn`是Attention机制的类型,`dropout`是Dropout概率,`attn_dropout`是Attention Dropout概率,`activation`是激活函数,`factor`是Decoder中的Attention Mask的因子。
接下来,我们定义Encoder Layer和Decoder Layer的类:
```python
class EncoderLayer(nn.Module):
    """Self-attention + feed-forward block with post-layer normalization."""

    def __init__(self, d_model, n_heads, d_ff, attn='prob', dropout=0.05,
                 attn_dropout=0.0, activation='gelu'):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads, attn,
                                            dropout, attn_dropout)
        self.feed_forward = FeedForward(d_model, d_ff, activation, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Residual self-attention sub-layer, then post-norm.
        x = self.norm1(x + self.dropout(self.self_attn(x, x, x, mask)))
        # Residual feed-forward sub-layer, then post-norm.
        return self.norm2(x + self.dropout(self.feed_forward(x)))
class DecoderLayer(nn.Module):
    """Decoder block: self-attention, encoder-decoder attention, feed-forward.

    Each sub-layer uses dropout + residual connection + post-LayerNorm.
    """

    def __init__(self, d_model, n_heads, d_ff, attn='prob', dropout=0.05,
                 attn_dropout=0.0, activation='gelu', factor=5):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads, attn,
                                            dropout, attn_dropout)
        self.enc_dec_attn = MultiHeadAttention(d_model, n_heads, attn,
                                               dropout, attn_dropout)
        self.feed_forward = FeedForward(d_model, d_ff, activation, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        # Stored but never read inside this layer; kept for interface parity.
        self.factor = factor

    def forward(self, x, enc_out, mask=None, dec_mask=None):
        x = self.norm1(x + self.dropout(self.self_attn(x, x, x, dec_mask)))
        x = self.norm2(x + self.dropout(
            self.enc_dec_attn(x, enc_out, enc_out, mask)))
        return self.norm3(x + self.dropout(self.feed_forward(x)))
```
其中,`d_model`是Transformer中的Hidden Size,`n_heads`是Multi-Head Attention中的Head数,`d_ff`是Feed Forward网络的维度,`attn`是Attention机制的类型,`dropout`是Dropout概率,`attn_dropout`是Attention Dropout概率,`activation`是激活函数,`factor`是Decoder中的Attention Mask的因子。
接下来,我们定义Multi-Head Attention、Feed Forward和Prediction Head的类:
```python
class MultiHeadAttention(nn.Module):
    """Multi-head attention with a fused QKV projection.

    BUG FIX -- the original had two defects:
      1. k and v were projected from `query`, so the `key`/`value` arguments
         (e.g. the encoder output in cross-attention) were silently ignored.
      2. the fused projection output was viewed as (..., n_heads, 3*d_head)
         and chunked on the LAST dim, which interleaves Q/K/V values across
         heads instead of separating them.
    The fused nn.Linear is kept (same parameters / state_dict keys); its
    weight is split into the Q, K and V thirds at call time.
    """

    def __init__(self, d_model, n_heads, attn='prob', dropout=0.05,
                 attn_dropout=0.0):
        super(MultiHeadAttention, self).__init__()
        self.n_heads = n_heads
        self.d_head = d_model // n_heads
        self.qkv = nn.Linear(d_model, 3*d_model)
        self.attn = Attention(attn, dropout, attn_dropout)
        self.proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def _project(self, x, weight, bias, batch_size):
        # One third of the fused QKV projection, split into heads:
        # (batch, seq, d_model) -> (batch*heads, seq, d_head).
        out = F.linear(x, weight, bias)
        out = out.view(batch_size, -1, self.n_heads, self.d_head).transpose(1, 2)
        return out.reshape(batch_size * self.n_heads, -1, self.d_head)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        d_model = self.n_heads * self.d_head
        w_q, w_k, w_v = self.qkv.weight.split(d_model, dim=0)
        b_q, b_k, b_v = self.qkv.bias.split(d_model, dim=0)
        q = self._project(query, w_q, b_q, batch_size)
        k = self._project(key, w_k, b_k, batch_size)
        v = self._project(value, w_v, b_v, batch_size)
        if mask is not None:
            # Assumes mask has shape (batch, q_len, k_len) -- TODO confirm the
            # convention against the data pipeline; the original reshaping
            # produced a shape incompatible with the attention scores.
            mask = mask.unsqueeze(1).expand(batch_size, self.n_heads,
                                            *mask.shape[-2:])
            mask = mask.reshape(batch_size * self.n_heads, *mask.shape[-2:])
        out = self.attn(q, k, v, mask)
        # (batch*heads, seq, d_head) -> (batch, seq, d_model).
        out = out.view(batch_size, self.n_heads, -1, self.d_head).transpose(1, 2)
        out = out.contiguous().view(batch_size, -1, d_model)
        out = self.proj(out)
        return self.dropout(out)
class Attention(nn.Module):
    """Dot-product attention core: softmax(QK^T) V with optional 0-mask.

    Note: no 1/sqrt(d) scaling is applied here. Positions where mask == 0
    receive a -1e9 logit before the softmax.
    """

    def __init__(self, attn='prob', dropout=0.05, attn_dropout=0.0):
        super(Attention, self).__init__()
        self.attn = attn
        self.dropout = nn.Dropout(attn_dropout)
        # Both supported modes use a plain softmax over the key dimension;
        # any other value leaves `softmax` unset (forward would then fail).
        if self.attn in ('prob', 'full'):
            self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None):
        scores = torch.matmul(q, k.transpose(-2, -1))
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        weights = self.dropout(self.softmax(scores))
        return torch.matmul(weights, v)
class FeedForward(nn.Module):
    """Position-wise feed-forward net: Linear -> activation -> dropout -> Linear."""

    # Lowercase activation names (as used by the callers in this file) mapped
    # to the actual torch.nn module class names.
    _ACTIVATIONS = {'gelu': 'GELU', 'relu': 'ReLU', 'elu': 'ELU', 'tanh': 'Tanh'}

    def __init__(self, d_model, d_ff, activation='gelu', dropout=0.05):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        # BUG FIX: the original did getattr(nn, activation) with the default
        # 'gelu', but torch.nn exposes capitalized class names (nn.GELU), so
        # construction always raised AttributeError. Translate the lowercase
        # names; fall through to getattr for callers passing exact class names.
        cls_name = self._ACTIVATIONS.get(activation, activation)
        self.activation = getattr(nn, cls_name)()
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        x = self.linear1(x)
        x = self.activation(x)
        x = self.dropout(x)
        return self.linear2(x)
class PredictionHead(nn.Module):
    # Maps the last `label_len` decoder states to `c_out` forecast values.
    #
    # NOTE(review): several behaviors here look broken and should be confirmed
    # against the intended data pipeline before relying on this module:
    #   * with embed='fixed', self.embed is nn.Linear(1, d_model), which
    #     requires the input's last dim to be 1 -- but forward() receives
    #     decoder output whose last dim is d_model.
    #   * with embed='learned', self.embed is an nn.Parameter, yet forward()
    #     calls self.embed(x); Parameters are not callable (TypeError).
    #   * the stride int(24 / self.label_len) is 0 whenever label_len > 24,
    #     which makes the slice raise ValueError.
    def __init__(self, label_len, c_out, d_model, freq='h', embed='fixed', dropout=0.05, embed_dropout=0.0):
        super(PredictionHead, self).__init__()
        self.label_len = label_len  # number of trailing time steps fed to the head
        self.c_out = c_out          # output feature dimension
        self.freq = freq            # sampling-frequency tag ('h' triggers subsampling below)
        if embed == 'fixed':
            self.embed = nn.Linear(1, d_model)
        elif embed == 'learned':
            self.embed = nn.Parameter(torch.randn(label_len, d_model))
        self.dropout = nn.Dropout(embed_dropout)
        self.proj = nn.Linear(d_model, c_out)

    def forward(self, x):
        # Keep only the last label_len positions of the decoder output.
        x = x[:, -self.label_len:, :]
        if self.freq == 'h':
            # Subsamples positions -- presumably meant to align hourly steps
            # with the forecast horizon; TODO confirm, and note the
            # zero-stride hazard described in the class comment.
            x = x[:, ::int(24/self.label_len), :]
        if hasattr(self, 'embed'):
            x = self.embed(x)
        x = self.dropout(x)
        x = self.proj(x)
        return x
```
其中,`d_model`是Transformer中的Hidden Size,`n_heads`是Multi-Head Attention中的Head数,`attn`是Attention机制的类型,`dropout`是Dropout概率,`attn_dropout`是Attention Dropout概率,`activation`是激活函数,`label_len`是预测的长度,`c_out`是输出的维度,`freq`是时间序列的采样频率,`embed`是Embedding的类型,`embed_dropout`是Embedding Dropout概率。
现在,我们已经定义了Informer的完整代码实现。
相关推荐
![rar](https://img-home.csdnimg.cn/images/20210720083606.png)
![pdf](https://img-home.csdnimg.cn/images/20210720083512.png)
![whl](https://img-home.csdnimg.cn/images/20210720083646.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![zip](https://img-home.csdnimg.cn/images/20210720083736.png)