informer完整代码
时间: 2023-09-12 19:02:49 浏览: 169
Informer是一种用于长序列时间序列预测的神经网络模型,基于Transformer架构。以下是一个Informer风格的简化代码实现(注意:代码中`prob`与`full`两种注意力类型使用的都是普通的softmax注意力)。
首先,我们需要导入所需的库:
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
```
接下来,我们定义Informer的主体模型类:
```python
class Informer(nn.Module):
    """Encoder-decoder Transformer for time-series forecasting.

    Pipeline: linear value embeddings -> additive positional encoding ->
    encoder stack -> decoder stack -> prediction head.

    Args:
        enc_in: Feature width of the encoder input.
        dec_in: Feature width of the decoder input.
        c_out: Feature width of the prediction.
        seq_len: Number of rows in the positional-encoding table.
        label_len: Number of trailing decoder steps kept by the prediction head.
        attn: Attention type forwarded to the encoder and decoder stacks.
        embed: Embedding type forwarded to the prediction head.
        freq: Sampling-frequency code forwarded to the prediction head.
        d_model: Hidden width of the model.
        n_heads: Number of attention heads.
        e_layers: Number of encoder layers.
        d_layers: Number of decoder layers.
        d_ff: Hidden width of the feed-forward sub-layers.
        factor: Forwarded to the decoder stack.
        activation: Activation name for the feed-forward sub-layers.
        dropout: Dropout probability for the stacks and the prediction head.
        attn_dropout: Dropout probability on the attention weights.
        embed_dropout: Dropout probability inside the prediction head.
    """

    def __init__(self, enc_in, dec_in, c_out=1, seq_len=96, label_len=48,
                 attn='prob', embed='fixed', freq='h', d_model=512, n_heads=8, e_layers=2,
                 d_layers=1, d_ff=2048, factor=5, activation='gelu',
                 dropout=0.05, attn_dropout=0.0, embed_dropout=0.0):
        super().__init__()
        # Encoder and decoder input embeddings
        self.embed_in = nn.Linear(enc_in, d_model)
        self.embed_out = nn.Linear(dec_in, d_model)
        # Positional encoding (one table shared by encoder and decoder)
        self.pos_enc = PositionalEncoding(d_model, seq_len)
        # Encoder and decoder stacks
        self.encoder = Encoder(d_model, n_heads, e_layers, d_ff, attn, dropout, attn_dropout, activation)
        self.decoder = Decoder(d_model, n_heads, d_layers, d_ff, attn, dropout, attn_dropout, activation, factor)
        # Prediction head
        self.prediction_head = PredictionHead(label_len, c_out, d_model, freq, embed, dropout, embed_dropout)

    def forward(self, x_enc, x_dec, x_mask=None, x_dec_mask=None, x_pos=None, x_dec_pos=None):
        """Run the full model and return the prediction head's output.

        Args:
            x_enc: Encoder input; dimension 1 is treated as the time axis and
                the last dimension must be ``enc_in`` wide.
            x_dec: Decoder input; same layout with ``dec_in`` features.
            x_mask: Optional mask handed to the encoder and also used as the
                decoder's cross-attention mask.
            x_dec_mask: Optional mask for the decoder self-attention.
            x_pos: Optional position indices for the encoder input. Defaults
                to ``0 .. length - 1``.
            x_dec_pos: Optional position indices for the decoder input.
                Defaults to ``0 .. length - 1``.

        Returns:
            The output of the prediction head.
        """
        # Input embedding
        enc_inp = self.embed_in(x_enc)
        dec_inp = self.embed_out(x_dec)

        # Indexing the position table with None would insert an axis instead
        # of selecting rows, so fall back to sequential positions explicitly.
        # NOTE: the table only holds `seq_len` rows; longer inputs need
        # explicit positions that stay inside that range.
        if x_pos is None:
            x_pos = torch.arange(enc_inp.size(1), device=enc_inp.device)
        if x_dec_pos is None:
            x_dec_pos = torch.arange(dec_inp.size(1), device=dec_inp.device)

        # Positional encoding
        enc_inp = self.pos_enc(enc_inp, x_pos)
        dec_inp = self.pos_enc(dec_inp, x_dec_pos)

        # Encoder
        enc_out = self.encoder(enc_inp, x_mask)

        # Decoder
        # NOTE(review): x_mask is reused as the cross-attention mask, which
        # presumably only fits when encoder and decoder lengths agree -- confirm.
        dec_out = self.decoder(dec_inp, enc_out, x_mask, x_dec_mask)

        # Prediction head
        return self.prediction_head(dec_out)
```
其中,`enc_in`是Encoder输入的特征维度,`dec_in`是Decoder输入的特征维度,`c_out`是输出的特征维度,`seq_len`是位置编码表的长度(即可编码的最大序列长度),`label_len`是预测头从Decoder输出末尾保留的时间步数,`attn`是Attention机制的类型,`embed`是Embedding的类型,`freq`是时间序列的采样频率,`d_model`是Transformer中的Hidden Size,`n_heads`是Multi-Head Attention中的Head数,`e_layers`是Encoder中的Encoder Layer数,`d_layers`是Decoder中的Decoder Layer数,`d_ff`是Feed Forward网络的维度,`factor`是传递给Decoder的因子(在本文代码中只被保存到DecoderLayer中,并未参与计算),`activation`是激活函数,`dropout`是Dropout概率,`attn_dropout`是Attention Dropout概率,`embed_dropout`是Embedding Dropout概率。
我们还需要定义Positional Encoding的类:
```python
class PositionalEncoding(nn.Module):
    """Additive sinusoidal positional encoding.

    A ``(1, seq_len, d_model)`` table is precomputed and registered as the
    buffer ``pe``; even feature columns hold sines, odd columns cosines.

    Args:
        d_model: Feature width of the encoded tensor (odd widths are allowed).
        seq_len: Number of positions in the table.
    """

    def __init__(self, d_model, seq_len):
        super().__init__()
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(seq_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns,
        # so trim the angles to the number of odd-indexed columns.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x, pos=None):
        """Return ``x`` plus the encoding rows selected by ``pos``.

        Args:
            x: Tensor whose dimension 1 is the time axis and whose last
                dimension is ``d_model`` wide.
            pos: Index into the position table: an int, slice, or integer
                tensor. A 1-D tensor is shared by the whole batch; a 2-D
                ``(batch, length)`` tensor selects rows per sample. ``None``
                means positions ``0 .. x.size(1) - 1``.

        Raises:
            ValueError: If ``pos`` is None and ``x`` is longer than the table.
        """
        if pos is None:
            length = x.size(1)
            if length > self.pe.size(1):
                raise ValueError(
                    f"sequence length {length} exceeds the positional table size {self.pe.size(1)}"
                )
            pos = slice(0, length)
        # Index the 2-D table so that the result broadcasts over the batch
        # for shared positions and lines up element-wise for per-sample ones.
        return x + self.pe[0][pos]
```
其中,`d_model`是Transformer中的Hidden Size,`seq_len`是序列长度。
接下来,我们定义Encoder和Decoder的类:
```python
class Encoder(nn.Module):
    """Stack of ``e_layers`` encoder layers followed by one LayerNorm.

    Args:
        d_model: Hidden width of the model.
        n_heads: Number of attention heads per layer.
        e_layers: Number of encoder layers in the stack.
        d_ff: Hidden width of each layer's feed-forward block.
        attn: Attention type forwarded to every layer.
        dropout: Dropout probability forwarded to every layer.
        attn_dropout: Attention-weight dropout forwarded to every layer.
        activation: Activation name forwarded to every layer.
    """

    def __init__(self, d_model, n_heads, e_layers, d_ff, attn='prob', dropout=0.05, attn_dropout=0.0, activation='gelu'):
        super().__init__()
        blocks = []
        for _ in range(e_layers):
            blocks.append(EncoderLayer(d_model, n_heads, d_ff, attn, dropout, attn_dropout, activation))
        self.layers = nn.ModuleList(blocks)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        """Pass ``x`` through every layer in order, then normalise the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)
class Decoder(nn.Module):
    """Stack of ``d_layers`` decoder layers followed by one LayerNorm.

    Args:
        d_model: Hidden width of the model.
        n_heads: Number of attention heads per layer.
        d_layers: Number of decoder layers in the stack.
        d_ff: Hidden width of each layer's feed-forward block.
        attn: Attention type forwarded to every layer.
        dropout: Dropout probability forwarded to every layer.
        attn_dropout: Attention-weight dropout forwarded to every layer.
        activation: Activation name forwarded to every layer.
        factor: Forwarded unchanged to every layer.
    """

    def __init__(self, d_model, n_heads, d_layers, d_ff, attn='prob', dropout=0.05, attn_dropout=0.0, activation='gelu', factor=5):
        super().__init__()
        blocks = []
        for _ in range(d_layers):
            blocks.append(DecoderLayer(d_model, n_heads, d_ff, attn, dropout, attn_dropout, activation, factor))
        self.layers = nn.ModuleList(blocks)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, enc_out, mask=None, dec_mask=None):
        """Pass ``x`` through every layer, each attending to ``enc_out``, then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, enc_out, mask, dec_mask)
        return self.norm(hidden)
```
其中,`d_model`是Transformer中的Hidden Size,`n_heads`是Multi-Head Attention中的Head数,`e_layers`是Encoder中的Encoder Layer数,`d_layers`是Decoder中的Decoder Layer数,`d_ff`是Feed Forward网络的维度,`attn`是Attention机制的类型,`dropout`是Dropout概率,`attn_dropout`是Attention Dropout概率,`activation`是激活函数,`factor`是传递给每个Decoder Layer的因子(在本文代码中只被保存,并未参与计算)。
接下来,我们定义Encoder Layer和Decoder Layer的类:
```python
class EncoderLayer(nn.Module):
    """One encoder block: self-attention, then a feed-forward network.

    Each sub-layer's output goes through dropout, is added back to its input
    (residual connection) and is then layer-normalised.

    Args:
        d_model: Hidden width of the model.
        n_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward block.
        attn: Attention type forwarded to the attention module.
        dropout: Dropout probability on each sub-layer output.
        attn_dropout: Attention-weight dropout forwarded to the attention module.
        activation: Activation name forwarded to the feed-forward block.
    """

    def __init__(self, d_model, n_heads, d_ff, attn='prob', dropout=0.05, attn_dropout=0.0, activation='gelu'):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads, attn, dropout, attn_dropout)
        self.feed_forward = FeedForward(d_model, d_ff, activation, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Apply the attention and feed-forward sub-layers to ``x``."""
        # Self-attention sub-layer
        attended = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))

        # Feed-forward sub-layer
        transformed = self.feed_forward(x)
        return self.norm2(x + self.dropout(transformed))
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, encoder-decoder attention, feed-forward.

    Each sub-layer's output goes through dropout, is added back to its input
    (residual connection) and is then layer-normalised.

    Args:
        d_model: Hidden width of the model.
        n_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward block.
        attn: Attention type forwarded to both attention modules.
        dropout: Dropout probability on each sub-layer output.
        attn_dropout: Attention-weight dropout forwarded to both attention modules.
        activation: Activation name forwarded to the feed-forward block.
        factor: Stored on the layer as ``self.factor``; not read by ``forward``.
    """

    def __init__(self, d_model, n_heads, d_ff, attn='prob', dropout=0.05, attn_dropout=0.0, activation='gelu', factor=5):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads, attn, dropout, attn_dropout)
        self.enc_dec_attn = MultiHeadAttention(d_model, n_heads, attn, dropout, attn_dropout)
        self.feed_forward = FeedForward(d_model, d_ff, activation, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.factor = factor

    def forward(self, x, enc_out, mask=None, dec_mask=None):
        """Apply the three sub-layers to ``x``, attending to ``enc_out`` in the second."""
        # Decoder self-attention, masked by dec_mask
        own_context = self.self_attn(x, x, x, dec_mask)
        x = self.norm1(x + self.dropout(own_context))

        # Attention over the encoder output, masked by mask
        enc_context = self.enc_dec_attn(x, enc_out, enc_out, mask)
        x = self.norm2(x + self.dropout(enc_context))

        # Feed-forward sub-layer
        transformed = self.feed_forward(x)
        return self.norm3(x + self.dropout(transformed))
```
其中,`d_model`是Transformer中的Hidden Size,`n_heads`是Multi-Head Attention中的Head数,`d_ff`是Feed Forward网络的维度,`attn`是Attention机制的类型,`dropout`是Dropout概率,`attn_dropout`是Attention Dropout概率,`activation`是激活函数,`factor`是保存在Decoder Layer中的因子(在本文代码的`forward`中并未使用)。
接下来,我们定义Multi-Head Attention、Feed Forward和Prediction Head的类:
```python
class MultiHeadAttention(nn.Module):
    """Multi-head attention with a fused query/key/value projection.

    The fused ``qkv`` layer keeps its per-head ``[q | k | v]`` row layout, but
    each third is applied to its own input. The original projected all three
    from ``query``, so cross-attention never looked at ``key``/``value``.

    Args:
        d_model: Hidden width of the model; must be divisible by ``n_heads``.
        n_heads: Number of attention heads.
        attn: Attention type forwarded to the attention module.
        dropout: Dropout probability on the projected output.
        attn_dropout: Attention-weight dropout forwarded to the attention module.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``n_heads``.
    """

    def __init__(self, d_model, n_heads, attn='prob', dropout=0.05, attn_dropout=0.0):
        super().__init__()
        if d_model % n_heads != 0:
            raise ValueError(f"d_model ({d_model}) must be divisible by n_heads ({n_heads})")
        self.n_heads = n_heads
        self.d_head = d_model // n_heads
        self.qkv = nn.Linear(d_model, 3 * d_model)
        self.attn = Attention(attn, dropout, attn_dropout)
        self.proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def _project(self, x, which):
        """Project ``x`` with the q (0), k (1) or v (2) third of ``qkv`` and split heads."""
        d_model = self.qkv.in_features
        # Rows of the fused weight are ordered head-major: for each head the
        # q rows come first, then the k rows, then the v rows.
        weight = self.qkv.weight.view(self.n_heads, 3, self.d_head, d_model)[:, which]
        bias = self.qkv.bias.view(self.n_heads, 3, self.d_head)[:, which]
        out = F.linear(x, weight.reshape(d_model, d_model), bias.reshape(d_model))
        out = out.view(x.size(0), -1, self.n_heads, self.d_head).transpose(1, 2)
        # reshape (not view): the transposed tensor is not contiguous.
        return out.reshape(x.size(0) * self.n_heads, -1, self.d_head)

    def _expand_mask(self, mask, batch_size, q_len, k_len):
        """Broadcast ``mask`` to ``(batch * heads, q_len, k_len)``."""
        if mask.dim() == 2:
            mask = mask.unsqueeze(0)  # one mask shared by the whole batch
        if mask.dim() == 3:
            mask = mask.unsqueeze(1)  # add the head axis
        mask = mask.expand(batch_size, self.n_heads, q_len, k_len)
        return mask.reshape(batch_size * self.n_heads, q_len, k_len)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` to ``key``/``value``.

        Args:
            query: Tensor of shape ``(batch, q_len, d_model)``.
            key: Tensor of shape ``(batch, k_len, d_model)``.
            value: Tensor of shape ``(batch, k_len, d_model)``.
            mask: Optional mask broadcastable to ``(batch, heads, q_len, k_len)``
                (2-D, 3-D or 4-D); entries equal to 0 are masked out.

        Returns:
            Tensor of shape ``(batch, q_len, d_model)``.
        """
        batch_size = query.size(0)
        q = self._project(query, 0)
        k = self._project(key, 1)
        v = self._project(value, 2)
        if mask is not None:
            mask = self._expand_mask(mask, batch_size, q.size(1), k.size(1))
        out = self.attn(q, k, v, mask)
        # Merge the heads back into the feature axis.
        out = out.view(batch_size, self.n_heads, -1, self.d_head).transpose(1, 2)
        out = out.reshape(batch_size, -1, self.n_heads * self.d_head)
        out = self.proj(out)
        return self.dropout(out)
class Attention(nn.Module):
    """Scaled dot-product attention.

    Both supported types, ``'prob'`` and ``'full'``, run the same dense
    softmax attention; no sparse query selection is performed here.

    Args:
        attn: Attention type, ``'prob'`` or ``'full'``.
        dropout: Accepted for signature compatibility; not used.
        attn_dropout: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If ``attn`` is not a supported type. The original left
            ``softmax`` undefined in that case and failed later in ``forward``.
    """

    def __init__(self, attn='prob', dropout=0.05, attn_dropout=0.0):
        super().__init__()
        if attn not in ('prob', 'full'):
            raise ValueError(f"unsupported attention type: {attn!r} (expected 'prob' or 'full')")
        self.attn = attn
        self.dropout = nn.Dropout(attn_dropout)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None):
        """Return the attention-weighted combination of ``v``.

        Args:
            q: Queries of shape ``(..., q_len, d_k)``.
            k: Keys of shape ``(..., k_len, d_k)``.
            v: Values of shape ``(..., k_len, d_v)``.
            mask: Optional tensor broadcastable to ``(..., q_len, k_len)``;
                entries equal to 0 are masked out.
        """
        # Scale by 1/sqrt(d_k) so the logits do not grow with the head width.
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(q.size(-1))
        if mask is not None:
            # Use the dtype's own minimum: a literal -1e9 is not representable
            # in half precision.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
        weights = self.dropout(self.softmax(scores))
        return torch.matmul(weights, v)
class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> activation -> dropout -> Linear.

    Args:
        d_model: Input and output feature width.
        d_ff: Hidden feature width.
        activation: Activation name. Common lowercase names such as ``'gelu'``
            and ``'relu'`` are mapped to the matching ``torch.nn`` class; any
            other value is looked up on ``torch.nn`` as an exact class name.
        dropout: Dropout probability applied after the activation.

    Raises:
        AttributeError: If ``activation`` names no ``torch.nn`` class.
    """

    # torch.nn activation classes are CamelCase ('GELU'), so the lowercase
    # default 'gelu' cannot be resolved with a plain getattr.
    _ACTIVATION_ALIASES = {
        'gelu': 'GELU',
        'relu': 'ReLU',
        'elu': 'ELU',
        'selu': 'SELU',
        'silu': 'SiLU',
        'tanh': 'Tanh',
        'sigmoid': 'Sigmoid',
    }

    def __init__(self, d_model, d_ff, activation='gelu', dropout=0.05):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        class_name = self._ACTIVATION_ALIASES.get(activation.lower(), activation)
        self.activation = getattr(nn, class_name)()
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Apply the block to the last dimension of ``x``."""
        x = self.linear1(x)
        x = self.activation(x)
        x = self.dropout(x)
        return self.linear2(x)
class PredictionHead(nn.Module):
    """Map the trailing decoder steps to ``c_out`` output features.

    The original applied ``embed`` directly to the ``d_model``-wide decoder
    output. That cannot work: ``nn.Linear(1, d_model)`` expects a last
    dimension of 1, and an ``nn.Parameter`` is not callable. This version
    uses the embedding as an additive per-step positional term instead.
    NOTE(review): this is an interpretation of the intent -- confirm.

    Args:
        label_len: Number of trailing time steps to keep; must be positive.
        c_out: Output feature width.
        d_model: Input feature width.
        freq: Sampling-frequency code; ``'h'`` enables the sub-sampling step.
        embed: ``'fixed'`` (Linear over a normalised position), ``'learned'``
            (one learned row per step); any other value disables the term.
        dropout: Accepted for signature compatibility; not used.
        embed_dropout: Dropout probability applied before the projection.

    Raises:
        ValueError: If ``label_len`` is not positive.
    """

    def __init__(self, label_len, c_out, d_model, freq='h', embed='fixed', dropout=0.05, embed_dropout=0.0):
        super().__init__()
        if label_len <= 0:
            raise ValueError(f"label_len must be positive, got {label_len}")
        self.label_len = label_len
        self.c_out = c_out
        self.freq = freq
        if embed == 'fixed':
            self.embed = nn.Linear(1, d_model)
        elif embed == 'learned':
            self.embed = nn.Parameter(torch.randn(label_len, d_model))
        self.dropout = nn.Dropout(embed_dropout)
        self.proj = nn.Linear(d_model, c_out)

    def _positional_term(self, steps, ref):
        """Return a ``(steps, d_model)`` term for the last ``steps`` slots, or None."""
        embed = getattr(self, 'embed', None)
        if embed is None:
            return None
        if isinstance(embed, nn.Linear):
            # Positions are aligned to the end of the label window and
            # normalised to [0, 1) before the linear map.
            slots = torch.arange(self.label_len - steps, self.label_len,
                                 dtype=ref.dtype, device=ref.device)
            return embed((slots / self.label_len).unsqueeze(-1))
        return embed[-steps:]

    def forward(self, x):
        """Project the last ``label_len`` steps of ``x`` to ``c_out`` features.

        Args:
            x: Tensor of shape ``(batch, length, d_model)``.

        Returns:
            Tensor of shape ``(batch, kept_steps, c_out)``. For ``freq == 'h'``
            the window is sub-sampled with stride ``max(1, 24 // label_len)``.
        """
        x = x[:, -self.label_len:, :]
        term = self._positional_term(x.size(1), x)
        if term is not None:
            x = x + term
        if self.freq == 'h':
            # int(24 / label_len) is 0 once label_len exceeds 24, which is an
            # invalid slice step; clamp the stride to at least 1.
            step = max(1, 24 // self.label_len)
            x = x[:, ::step, :]
        x = self.dropout(x)
        return self.proj(x)
```
其中,`d_model`是Transformer中的Hidden Size,`n_heads`是Multi-Head Attention中的Head数,`attn`是Attention机制的类型,`dropout`是Dropout概率,`attn_dropout`是Attention Dropout概率,`activation`是激活函数(注意`torch.nn`中对应的类名是`GELU`、`ReLU`等驼峰写法),`label_len`是预测头从输入末尾保留的时间步数,`c_out`是输出的特征维度,`freq`是时间序列的采样频率,`embed`是Embedding的类型,`embed_dropout`是Embedding Dropout概率。
现在,我们已经定义了Informer的完整代码实现。
阅读全文