Why is TensorFlow's checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir)) not restoring the encoder and decoder from the original notebook?
I am trying to restore a checkpoint from the following TensorFlow tutorial https://www.tensorflow.org/addons/tutorials/networks_seq2seq_nmt in a different notebook. You do not need to go through the whole article or its code; the main points are that this is an NLP model with an encoder and a decoder. When I try to restore the same model in the other Colab notebook, the checkpoint variables (i.e. the encoder and decoder weights) do not seem to be restored, and the function
tf.train.list_variables(tf.train.latest_checkpoint(checkpoint_dir))
shows that there are no variables at all.
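For reference, this is roughly how I inspect the checkpoint file itself; the directory value below is just a placeholder for whatever checkpoint_dir is set to in my notebook:

import tensorflow as tf

checkpoint_dir = './training_checkpoints'            # placeholder path, for illustration only
latest = tf.train.latest_checkpoint(checkpoint_dir)  # newest checkpoint prefix, or None if nothing is found
print(latest)
print(tf.train.list_variables(latest))               # (name, shape) pairs actually stored in the checkpoint file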
What I am essentially doing is copying the Encoder and Decoder classes from the original notebook:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        ##-------- LSTM layer in Encoder ------- ##
        self.lstm_layer = tf.keras.layers.LSTM(self.enc_units,
                                               return_sequences=True,
                                               return_state=True,
                                               recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, h, c = self.lstm_layer(x, initial_state=hidden)
        return output, h, c

    def initialize_hidden_state(self):
        return [tf.zeros((self.batch_sz, self.enc_units)),
                tf.zeros((self.batch_sz, self.enc_units))]

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, attention_type='luong'):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.attention_type = attention_type

        # Embedding Layer
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        # Final Dense layer on which softmax will be applied
        self.fc = tf.keras.layers.Dense(vocab_size)

        # Define the fundamental cell for decoder recurrent structure
        self.decoder_rnn_cell = tf.keras.layers.LSTMCell(self.dec_units)

        # Sampler
        self.sampler = tfa.seq2seq.sampler.TrainingSampler()

        # Create attention mechanism with memory = None
        self.attention_mechanism = self.build_attention_mechanism(self.dec_units,
                                                                  None, self.batch_sz*[max_length_input], self.attention_type)

        # Wrap attention mechanism with the fundamental rnn cell of decoder
        self.rnn_cell = self.build_rnn_cell(batch_sz)

        # Define the decoder with respect to fundamental rnn cell
        self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler=self.sampler, output_layer=self.fc)

    def build_rnn_cell(self, batch_sz):
        rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnn_cell,
                                                self.attention_mechanism, attention_layer_size=self.dec_units)
        return rnn_cell

    def build_attention_mechanism(self, dec_units, memory, memory_sequence_length, attention_type='luong'):
        # attention_type: which sort of attention (Bahdanau, Luong)
        # dec_units: final dimension of attention outputs
        # memory: encoder hidden states of shape (batch_size, max_length_input, enc_units)
        # memory_sequence_length: 1d array of shape (batch_size) with every element set to max_length_input (for masking purposes)
        if attention_type == 'bahdanau':
            return tfa.seq2seq.BahdanauAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)
        else:
            return tfa.seq2seq.LuongAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)

    def build_initial_state(self, batch_sz, encoder_state, Dtype):
        decoder_initial_state = self.rnn_cell.get_initial_state(batch_size=batch_sz, dtype=Dtype)
        decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
        return decoder_initial_state

    def call(self, inputs, initial_state):
        x = self.embedding(inputs)
        outputs, _, _ = self.decoder(x, initial_state=initial_state, sequence_length=self.batch_sz*[max_length_output-1])
        return outputs

decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE, 'luong')
As you can see, the embedding and the other layers are part of these classes. I am re-initializing both the encoder and the decoder, since they are needed later in checkpoint.restore(). Then I run the following:
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
However, this gives me a checkpoint without any weights, biases or other variables for either the encoder or the decoder. Is the problem that I am re-initializing both the encoder and the decoder, and if so, how can I avoid that? Or is it a completely different issue, and if so, what is it? Should I be using a completely different way of saving the model, as this might be causing the issue?
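For what it is worth, this is roughly how I check that nothing was actually restored; status is just a local name I introduce here, while assert_existing_objects_matched() comes from the status object that tf.train.Checkpoint.restore() returns:

status = checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

# In my case both of these come back empty, matching the problem described above
print(len(encoder.variables), len(decoder.variables))

# Should complain if the existing encoder/decoder objects were not matched
# against values stored in the checkpoint
status.assert_existing_objects_matched()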
Thank you for looking through the question, and particularly for any suggestions. If the code I have posted is not enough, I can share the .ipynb file of the Google Colab notebook that I am trying to run.