Transformer
The Transformer is a neural network model for sequence-to-sequence (seq2seq) learning, introduced in the paper "Attention Is All You Need". It uses self-attention to handle variable-length inputs and can be applied to a wide range of natural language processing (NLP) tasks such as machine translation and text summarization.
The basic structure of the Transformer is as follows[^1]:
- Encoder: a stack of encoder blocks, each containing a multi-head self-attention layer and a position-wise feed-forward network.
- Decoder: a stack of decoder blocks, each containing a masked multi-head self-attention layer, a multi-head encoder-decoder attention layer, and a feed-forward network.
- Loss function: cross-entropy between the predicted token distribution and the ground-truth tokens is typically used to measure the model's error (a minimal, padding-aware sketch follows this list).
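As an illustration of that loss, here is a minimal sketch of a padding-masked sparse cross-entropy. The helper name `masked_cross_entropy` and the convention that token id 0 marks padding are assumptions made for this sketch, not part of the original example.
```python
import tensorflow as tf

def masked_cross_entropy(y_true, y_pred, pad_id=0):
    """Cross-entropy averaged over non-padding target positions (sketch).

    y_true: (batch, seq_len) integer token ids
    y_pred: (batch, seq_len, vocab_size) per-token probabilities
    pad_id: id assumed to mark padding positions
    """
    per_token = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=False, reduction='none')(y_true, y_pred)      # (batch, seq_len)
    mask = tf.cast(tf.not_equal(y_true, pad_id), per_token.dtype)  # 1 for real tokens
    return tf.reduce_sum(per_token * mask) / tf.reduce_sum(mask)
```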
The training procedure is as follows[^2]:
1. The raw text is first converted into word embeddings and fed into the Transformer encoder.
2. The encoder turns the input embedding sequence into a set of context-aware encoding vectors and passes them to the decoder.
3. The decoder takes the encoder outputs and uses them to generate the next token of the output sequence.
4. During training the decoder is conditioned on the expected output tokens (teacher forcing) and predicts the next correct token at each position.
5. The error between the predictions and the ground truth is computed, and the model parameters are updated by gradient descent (a sketch of one such training step follows this list).
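To make steps 3-5 concrete, the following is a minimal sketch of a single training step with teacher forcing. It assumes `model` is a two-input Keras model like the one built below, `masked_cross_entropy` is the padding-aware loss sketched above, and `optimizer` is any Keras optimizer; the batch shapes in the docstring are likewise assumptions.
```python
import tensorflow as tf

@tf.function
def train_step(model, optimizer, src_ids, tar_ids):
    """One gradient-descent update with teacher forcing (sketch).

    src_ids: (batch, src_len) source token ids
    tar_ids: (batch, tar_len) target token ids including start/end markers
    """
    dec_input = tar_ids[:, :-1]   # decoder is fed the expected previous tokens
    dec_target = tar_ids[:, 1:]   # and is trained to predict the next token
    with tf.GradientTape() as tape:
        predictions = model([src_ids, dec_input], training=True)
        loss = masked_cross_entropy(dec_target, predictions)  # loss sketched earlier
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss
```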
Below is an example of a Transformer for English-to-German machine translation[^3]:
```python
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LayerNormalization, Dropout, Embedding, Add, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

# Build the Transformer model
def transformer_model(src_input, tar_input, src_vocab_size, tar_vocab_size, d_model,
                      n_heads, n_encoders, n_decoders, max_len=512, dropout=0.1):
    # Encoder input: token embedding + learned position embedding
    # (position indices 0..seq_len-1, tiled over the batch to match the token ids' shape)
    src_positions = Lambda(
        lambda x: tf.tile(tf.expand_dims(tf.range(tf.shape(x)[1]), 0), [tf.shape(x)[0], 1]))(src_input)
    src_word_embedding = Embedding(input_dim=src_vocab_size, output_dim=d_model)(src_input)
    src_position_embedding = Embedding(input_dim=max_len, output_dim=d_model)(src_positions)
    src_encoder = Add()([src_word_embedding, src_position_embedding])
    src_encoder = Dropout(dropout)(src_encoder)
    # Encoder blocks
    for i in range(n_encoders):
        src_encoder = encoder_block(src_encoder, d_model, n_heads, dropout)
    # Decoder input: token embedding + learned position embedding
    tar_positions = Lambda(
        lambda x: tf.tile(tf.expand_dims(tf.range(tf.shape(x)[1]), 0), [tf.shape(x)[0], 1]))(tar_input)
    tar_word_embedding = Embedding(input_dim=tar_vocab_size, output_dim=d_model)(tar_input)
    tar_position_embedding = Embedding(input_dim=max_len, output_dim=d_model)(tar_positions)
    tar_decoder = Add()([tar_word_embedding, tar_position_embedding])
    tar_decoder = Dropout(dropout)(tar_decoder)
    # Decoder blocks
    for i in range(n_decoders):
        tar_decoder = decoder_block(tar_decoder, src_encoder, d_model, n_heads, dropout)
    # Project each decoder position onto the target vocabulary
    output = Dense(tar_vocab_size, activation='softmax')(tar_decoder)
    model = Model(inputs=[src_input, tar_input], outputs=output)
    return model

# Encoder block
def encoder_block(inputs, d_model, n_heads, dropout=0.1):
    # Multi-head self-attention with residual connection and layer normalization
    attention_output, _ = MultiHeadAttention(n_heads, d_model)(inputs, inputs, inputs)
    attention_output = Dropout(dropout)(attention_output)
    attention_output = LayerNormalization(epsilon=1e-6)(Add()([inputs, attention_output]))
    # Position-wise feed-forward network
    feed_forward_output = Dense(2048, activation='relu')(attention_output)
    feed_forward_output = Dropout(dropout)(feed_forward_output)
    feed_forward_output = Dense(d_model, activation=None)(feed_forward_output)
    feed_forward_output = Dropout(dropout)(feed_forward_output)
    feed_forward_output = LayerNormalization(epsilon=1e-6)(Add()([attention_output, feed_forward_output]))
    return feed_forward_output

# Decoder block
def decoder_block(inputs, enc_outputs, d_model, n_heads, dropout=0.1):
    # Masked (causal) multi-head self-attention: the look-ahead mask stops each
    # position from attending to later positions
    look_ahead_mask = Lambda(
        lambda x: 1.0 - tf.linalg.band_part(tf.ones((tf.shape(x)[1], tf.shape(x)[1])), -1, 0))(inputs)
    attention_output_1, _ = MultiHeadAttention(n_heads, d_model)(inputs, inputs, inputs, mask=look_ahead_mask)
    attention_output_1 = Dropout(dropout)(attention_output_1)
    attention_output_1 = LayerNormalization(epsilon=1e-6)(Add()([inputs, attention_output_1]))
    # Encoder-decoder (cross) attention
    attention_output_2, _ = MultiHeadAttention(n_heads, d_model)(attention_output_1, enc_outputs, enc_outputs)
    attention_output_2 = Dropout(dropout)(attention_output_2)
    attention_output_2 = LayerNormalization(epsilon=1e-6)(Add()([attention_output_1, attention_output_2]))
    # Position-wise feed-forward network
    feed_forward_output = Dense(2048, activation='relu')(attention_output_2)
    feed_forward_output = Dropout(dropout)(feed_forward_output)
    feed_forward_output = Dense(d_model, activation=None)(feed_forward_output)
    feed_forward_output = Dropout(dropout)(feed_forward_output)
    feed_forward_output = LayerNormalization(epsilon=1e-6)(Add()([attention_output_2, feed_forward_output]))
    return feed_forward_output

# Multi-Head Attention
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, n_heads, d_model):
        super(MultiHeadAttention, self).__init__()
        self.n_heads = n_heads
        self.d_model = d_model
        assert d_model % self.n_heads == 0
        self.depth = d_model // self.n_heads
        self.query = Dense(d_model, activation=None)
        self.key = Dense(d_model, activation=None)
        self.value = Dense(d_model, activation=None)
        self.fc = Dense(d_model, activation=None)

    def split_heads(self, inputs):
        # (batch, seq_len, d_model) -> (batch, n_heads, seq_len, depth)
        batch_size = tf.shape(inputs)[0]
        inputs = tf.reshape(inputs, shape=(batch_size, -1, self.n_heads, self.depth))
        return tf.transpose(inputs, perm=[0, 2, 1, 3])

    def call(self, q, k, v, mask=None):
        q = self.query(q)
        k = self.key(k)
        v = self.value(v)
        q = self.split_heads(q)
        k = self.split_heads(k)
        v = self.split_heads(v)
        scaled_attention, attention_weights = self.scaled_dot_product_attention(q, k, v, mask)
        # (batch, n_heads, seq_len, depth) -> (batch, seq_len, d_model)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (tf.shape(scaled_attention)[0], -1, self.d_model))
        output = self.fc(concat_attention)
        return output, attention_weights

    def scaled_dot_product_attention(self, q, k, v, mask=None):
        matmul_qk = tf.matmul(q, k, transpose_b=True)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
        if mask is not None:
            # Masked positions (mask == 1) receive a large negative logit
            scaled_attention_logits += (mask * -1e9)
        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        output = tf.matmul(attention_weights, v)
        return output, attention_weights

# Build the model
src_input = Input(shape=(None,))
tar_input = Input(shape=(None,))
model = transformer_model(src_input, tar_input, 10000, 10000, 512, 8, 6, 6)

# Train the model; train_dataset / val_dataset are assumed to be tf.data pipelines
# yielding ((source ids, decoder input ids), next-token label ids) batches
loss_fn = SparseCategoricalCrossentropy()  # output layer already applies softmax, so from_logits stays False
model.compile(optimizer=Adam(0.0001), loss=loss_fn)
model.fit(train_dataset, epochs=20, validation_data=val_dataset)
```
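`train_dataset` and `val_dataset` are not defined in the snippet above; they are assumed to be pre-tokenized `tf.data` pipelines. The sketch below shows one way to shape such a dataset so that it matches the model's two inputs and one output. The random ids stand in for tokenized English/German sentence pairs and are purely illustrative.
```python
import tensorflow as tf

# Random placeholder ids standing in for tokenized sentence pairs (illustration only)
num_examples, src_len, tar_len, vocab_size = 1000, 40, 41, 10000
src = tf.random.uniform((num_examples, src_len), maxval=vocab_size, dtype=tf.int32)
tar = tf.random.uniform((num_examples, tar_len), maxval=vocab_size, dtype=tf.int32)

# The model expects ((source ids, decoder input ids), next-token labels),
# so the target sequence is shifted by one position for input vs. label
train_dataset = (
    tf.data.Dataset.from_tensor_slices(((src, tar[:, :-1]), tar[:, 1:]))
    .shuffle(num_examples)
    .batch(64)
    .prefetch(tf.data.AUTOTUNE)
)
```
A `val_dataset` would be built the same way from held-out sentence pairs.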