I am trying to reproduce the original Transformer for machine translation in PyTorch.
class Transformer(nn.Module):
    def __init__(self, vocab_size_in, vocab_size_out, embedding_dim, n_heads, key_dim, value_dim, ffn_dim, n=10000,
                 eps=1e-5, padding_token_index=0, p_drop=0.1, n_encoder_layers=1, n_decoder_layers=1):
        super(Transformer, self).__init__()
        # parameters
        self.key_dim = key_dim
        self.n_heads = n_heads
        self.embedding_dim = embedding_dim
        self.eps = eps
        self.ffn_dim = ffn_dim
        self.padding_token_index = padding_token_index
        self.vocab_size_in = vocab_size_in
        # Embedding layers encoder
        self.embedding_layer_enc = EmbeddingLayer(vocab_size_in, embedding_dim, n)
        self.dropout_enc = nn.Dropout(p_drop)
        # Encoder layers
        self.encoder_layers = [EncoderLayer(embedding_dim, key_dim, value_dim, ffn_dim, n_heads, p_drop, eps)] * n_encoder_layers
        # Embedding layers decoder
        self.embedding_layer_dec = EmbeddingLayer(vocab_size_out, embedding_dim)
        self.dropout_dec = nn.Dropout(p_drop)
        # Decoder layers
        self.decoder_layers = [DecoderLayer(embedding_dim, key_dim, value_dim, ffn_dim, n_heads, p_drop, eps)] * n_decoder_layers
        # Linear output layer
        self.output_linear = nn.Linear(embedding_dim, vocab_size_out)

    def forward(self, input, target):
        # ...
As you can see, there are two dropout layers defined here. Moreover, I have further dropout layers in EncoderLayer and DecoderLayer.
class EncoderLayer(nn.Module):
    def __init__(self, embedding_dim=512, key_dim=512, value_dim=512, ffn_dim=512, n_heads=8, p_drop=0.1, eps=1e-5):
        super().__init__()
        self.multi_head = MultiHeadAttentionLayer(n_heads, embedding_dim, key_dim, value_dim)
        self.dropout_multi_head = nn.Dropout(p_drop)
        self.norm_multi_head = LayerNormalization(embedding_dim, eps)
        self.FFN_in = nn.Linear(embedding_dim, ffn_dim)
        self.FFN_out = nn.Linear(ffn_dim, embedding_dim)
        self.dropout_FFN = nn.Dropout(p_drop)
        self.norm_FFN = LayerNormalization(embedding_dim, eps)

    def forward(self, source, mask=None):
        multi_head_out = self.multi_head(source, mask)  # shape = (n_sentences, len_sentence, embedding_dim)
        multi_head_out = self.dropout_multi_head(multi_head_out)
        multi_head_norm = self.norm_multi_head(source + multi_head_out)
        ffn_in = self.FFN_in(multi_head_norm)
        ffn_in = F.relu(ffn_in)
        ffn_out = self.FFN_out(ffn_in)
        ffn_out = self.dropout_FFN(ffn_out)
        enc_out = self.norm_FFN(multi_head_norm + ffn_out)
        return enc_out
I am testing my code in evaluation mode. The forward step takes source and target sequences of indices and outputs a tensor of probabilities for each word in the output sequence. So far the testing function is simple; I just wanted to make sure that when I feed in the same input, the output stays the same:
def translate_sentence(model, source, target, max_num_words=200):
    for i in range(max_num_words):
        model.eval()
        with torch.no_grad():
            output = model(source, target)
            print(output)
However, that does not happen. As suggested, I added:
print(model.dropout_enc.training)
print(model.encoder_layers[0].dropout_multi_head.training)
in order to check whether the dropout layers are active or not. The output is:
False
True
Therefore, model.eval() disables the dropout layers defined in the __init__ of Transformer, but not those in its sublayers. Any idea how to solve this?
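For reference, a more systematic check is to print the mode of every registered Dropout submodule. This is just a sketch; it relies on named_modules(), which only yields modules that PyTorch has registered as children:

import torch.nn as nn

# Print the mode of every Dropout that PyTorch has registered.
# Modules stored in plain Python lists are not registered, so they
# will be missing from this printout entirely.
for name, module in model.named_modules():
    if isinstance(module, nn.Dropout):
        print(name, module.training)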
Solved: I had to use nn.ModuleList.
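Concretely, the fix in Transformer.__init__ is a sketch like the following, assuming the same constructor arguments as above (the list comprehension also avoids the [layer] * n pattern, which would reuse one layer instance n times):

# nn.ModuleList registers each layer as a child module, so
# train()/eval() and .to(device) propagate and the parameters are
# picked up by model.parameters(). The comprehension builds
# independent layers, unlike [EncoderLayer(...)] * n_encoder_layers,
# which repeats references to a single instance.
self.encoder_layers = nn.ModuleList(
    [EncoderLayer(embedding_dim, key_dim, value_dim, ffn_dim, n_heads, p_drop, eps)
     for _ in range(n_encoder_layers)]
)
self.decoder_layers = nn.ModuleList(
    [DecoderLayer(embedding_dim, key_dim, value_dim, ffn_dim, n_heads, p_drop, eps)
     for _ in range(n_decoder_layers)]
)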
The __repr__ method of nn.Dropout doesn't output that kind of information, so it prints the same thing whatever mode the layer is in. That doesn't mean dropout isn't applied, though; you can check for yourself!
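For example, a quick standalone check (a sketch with made-up sizes) shows that the repr stays the same while the behavior changes between modes:

import torch
import torch.nn as nn

drop = nn.Dropout(p=0.5)
x = torch.ones(5)

drop.train()
print(drop)     # Dropout(p=0.5, inplace=False) -- same repr in both modes
print(drop(x))  # random entries zeroed, survivors scaled by 1/(1-p) = 2.0

drop.eval()
print(drop)     # Dropout(p=0.5, inplace=False)
print(drop(x))  # identity: tensor([1., 1., 1., 1., 1.])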
You can check the mode of your sublayer or parent layer with the training attribute. The reason the mode is not propagated to your child layers is that you are using plain Python lists, which means the layers are not registered as child modules. Instead, you should wrap your lists with nn.ModuleList for encoder_layers and decoder_layers.
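Here is a minimal sketch of the difference, using a toy module (the class names WithList and WithModuleList are made up for illustration):

import torch.nn as nn

class WithList(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = [nn.Dropout(0.1)]  # plain list: NOT registered as a child

class WithModuleList(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList([nn.Dropout(0.1)])  # registered as a child

a, b = WithList(), WithModuleList()
a.eval()
b.eval()
print(a.layers[0].training)  # True  -- eval() never reached the layer
print(b.layers[0].training)  # False -- eval() propagated through the ModuleList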