TensorFlow Callback loss monitor not retrieving the loss for training data

I am building an encoder-decoder model for time series which I have to compare across different data. To make the comparison fair, I am using a ModelCheckpoint callback in TensorFlow with monitor='loss' (MSE in this particular case), save_best_only=True and save_weights_only=False. I then run another experiment, this time monitoring 'val_loss' (also MSE), keeping the remaining arguments the same.

When I fit the model, the callback saves the best model with a particular training loss value. But when I load that best model and compute predictions on the training data, the MSE I get is not the loss value that was reported during training. This happens regardless of the data, the batch_size, the number of epochs, the shape of the input and output data, etc. On the other hand, when I monitor 'val_loss', load the best model and predict on the validation data, I do get the loss that fit reported for the validation set. Does anyone know why this happens?
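Concretely, the check I run after loading the checkpointed model is just a prediction on the training inputs followed by a manual MSE (the same two lines appear at the end of the full code below):

fitted_values = best_model_train.predict([train_X, decoder_input_data_train])
train_mse = tf.math.reduce_mean(tf.keras.losses.mean_squared_error(train_y, fitted_values))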

My code follows:

import copy
import numpy as np
import tensorflow as tf
from numpy import array, hstack
from keras.models import Model
from keras.layers import LSTM, Dense, Input

np.random.seed(42)
tf.random.set_seed(42)
# split a multivariate sequence into samples
def split_sequences(sequences, n_steps_in, n_steps_out):
    X, y = list(), list()
    for i in range(len(sequences)):
        # find the end of this pattern
        end_ix = i + n_steps_in
        out_end_ix = end_ix + n_steps_out-1
        # check if we are beyond the dataset
        if out_end_ix > len(sequences):
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = sequences[i:end_ix, :-1],  sequences[end_ix-1:out_end_ix, -1]
        X.append(seq_x)
        y.append(seq_y)
    return array(X), array(y)
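# Quick illustration (not in the original script): with a toy (10, 4) array,
# 5 input steps and 3 output steps, split_sequences keeps all but the last
# column as inputs and the last column as the target:
#   _demo = np.arange(40).reshape(10, 4)
#   _dx, _dy = split_sequences(_demo, 5, 3)   # _dx.shape == (4, 5, 3), _dy.shape == (4, 3)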
# define input sequence
sequence_length=600
train_split=550
generate_sequence = lambda start: np.linspace(start, start + (sequence_length - 1) * 10, sequence_length)
in_seq1 = generate_sequence(10)
in_seq2 = generate_sequence(15)
generate_sequence_3 = lambda start: np.linspace(start, start + (sequence_length - 1) * 7, sequence_length)
in_seq3 = generate_sequence_3(17)
out_seq = array([in_seq1[i]+in_seq2[i]+in_seq3[i] for i in range(len(in_seq1))])
# convert to [rows, columns] structure
in_seq1 = in_seq1.reshape((len(in_seq1), 1))
in_seq2 = in_seq2.reshape((len(in_seq2), 1))
in_seq3 = in_seq3.reshape((len(in_seq3), 1))
out_seq = out_seq.reshape((len(out_seq), 1))
# horizontally stack columns
dataset_treino = hstack((in_seq1[:train_split], in_seq2[:train_split], in_seq3[:train_split], out_seq[:train_split]))
dataset_test = hstack((in_seq1[train_split:], in_seq2[train_split:], in_seq3[train_split:], out_seq[train_split:]))
# choose a number of time steps
n_steps_in, n_steps_out = 5,3
# n_units for encoder and decoder
n_units=500
# covert into input/output
X_train, y_train = split_sequences(dataset_treino, n_steps_in, n_steps_out)
X_test, y_test = split_sequences(dataset_test, n_steps_in, n_steps_out)
# the dataset determines the number of features (3 input series here)
n_features = X_train.shape[2]
n_target = 1
y_train = y_train.reshape(y_train.shape[0],y_train.shape[1],n_target)
y_test = y_test.reshape(y_test.shape[0],y_test.shape[1],n_target)
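# With the settings above the prepared arrays have these shapes:
#   X_train: (544, 5, 3)   y_train: (544, 3, 1)
#   X_test:  (44, 5, 3)    y_test:  (44, 3, 1)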

def train_generic_encoder_data_set(X, y, n_steps_out):
    X = copy.deepcopy(X)
    y = copy.deepcopy(y)
    X_encoder_in = X
    y_decoder_out=X
    X_decoder_in = np.zeros((X.shape[0], n_steps_out, X.shape[2]))
    y_decoder_out = np.zeros((X.shape[0], n_steps_out, X.shape[2]))
    
    X_decoder_in[:, 0] = X[:, -1, :]

    
    if n_steps_out==2:
        for i in range(0,X.shape[0]):
            if i+n_steps_out>=X.shape[0]:
                break
            X_decoder_in[i,1] = X[i+1, -1, :]
            y_decoder_out[i-1,0,:] = X[i,-1, :]
            y_decoder_out[i-1,1,:] = X[i+1,-1, :]

    elif n_steps_out==3:
        for i in range(0,X.shape[0]):
            if i+n_steps_out>=X.shape[0]:
                break
            X_decoder_in[i,1] = X[i+1, -1, :]
            X_decoder_in[i,2] = X[i+2, -1, :]
            y_decoder_out[i,0,:] = X[i+1,-1, :]
            y_decoder_out[i,1,:] = X[i+2,-1, :]
            y_decoder_out[i,2,:] = X[i+3,-1, :]
    
    if n_steps_out==1:
        for i in range(0,X.shape[0]):#
            if i+n_steps_out>=X.shape[0]:
                break
            X_decoder_in[:, 0] = X[:, -1, :] 
            y_decoder_out[i,0,:] = X[i+1,-1, :]
    
    #y_decoder_out=y

    return X_encoder_in, X_decoder_in, y_decoder_out
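# Note on the helper above (for the n_steps_out == 3 case used here):
#   X_encoder_in  is X unchanged;
#   X_decoder_in  holds the last encoder time step plus the next two "last" steps;
#   y_decoder_out holds the next three feature vectors, i.e. the decoder targets
#                 are built from the X features, not from the y argument
#                 (the `y_decoder_out = y` line is commented out).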

train_X, decoder_input_data_train, train_y = train_generic_encoder_data_set (X_train,y_train,n_steps_out)
test_X, decoder_input_data_test, test_y = train_generic_encoder_data_set (X_test,y_test,n_steps_out)

train_X, decoder_input_data_train, train_y = train_X[:-5,:,:], decoder_input_data_train[:-5,:,:], train_y[:-5,:,:]
test_X, decoder_input_data_test, test_y = test_X[:-5,:,:], decoder_input_data_test[:-5,:,:], test_y[:-5,:,:]

#Training Encoder (to learn x'1,x'2,x'3,x'4)
encoder_input = Input(shape=(train_X.shape[1], train_X.shape[2]))
encoding_layer = LSTM(n_units,return_sequences=True,return_state=True, activation='relu')
encoder_outputs, hidden_state, cell_state = encoding_layer(encoder_input)
context_vector= [hidden_state, cell_state]

#Training Decoder (to learn x'1,x'2,x'3,x'4)
features_predictions_list=[]
decoder_input_data = Input(shape=(decoder_input_data_train.shape[1],decoder_input_data_train.shape[2]))
decoding_layer = LSTM(n_units,return_sequences=True,return_state=True, activation='relu')
decoder_outputs, hidden_state, cell_state = decoding_layer(decoder_input_data, initial_state=context_vector)
features_predictions = Dense(n_features, kernel_initializer='ones', activation='relu')(decoder_outputs)
print('decoder_input_data: ', decoder_input_data)
features_predictions_list.append(features_predictions)
features_predictions_list = tf.keras.backend.concatenate(features_predictions_list, axis=1)

model = Model(inputs=[encoder_input,decoder_input_data], outputs=features_predictions_list, name='Modelo22')
optimizer=tf.keras.optimizers.Adam()
model.compile(optimizer=optimizer, loss=['MSE'], metrics=['MSE'])
model.summary()
tf.keras.utils.plot_model(model, show_shapes=True)
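# In short: the encoder LSTM returns [hidden_state, cell_state], which initialise
# the decoder LSTM, and the Dense(n_features) head predicts the 3 feature values
# at each of the n_steps_out decoder steps, so train_y below is the feature
# sequence produced by train_generic_encoder_data_set, not the summed out_seq.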

# Create a TensorFlow ModelCheckpoint callback which monitors the training loss
checkpoint_best_path = f'model_checkpoints_best/name_{model.name}__n_steps_in_{n_steps_in}__n_steps_out_{n_steps_out}__n_targets_{model.layers[4].units}.h5'
checkpoint_best = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_best_path, save_weights_only=False, save_freq = 'epoch', mode = 'auto', monitor = "loss", save_best_only = True, verbose=1)

#fit the model
history = model.fit([train_X, decoder_input_data_train], train_y,
                    validation_data=([test_X, decoder_input_data_test], test_y),
                    epochs=50, callbacks=[checkpoint_best],
                    verbose=2, shuffle=False)
del model
del history
best_model_train = tf.keras.models.load_model('model_checkpoints_best/name_Modelo22__n_steps_in_5__n_steps_out_3__n_targets_3.h5')

fitted_values = best_model_train.predict([train_X, decoder_input_data_train])
train_mse = tf.math.reduce_mean(tf.keras.losses.mean_squared_error(train_y, fitted_values))

My train_mse is different from the best 'loss' reported during training. But if I change the code to monitor 'val_loss', load that saved model, and calculate predicted_values = best_model_test.predict([test_X, decoder_input_data_test]) followed by test_mse = tf.math.reduce_mean(tf.keras.losses.mean_squared_error(test_y, predicted_values)), then test_mse does match the best val_loss reported during training.
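For reference, the 'val_loss' variant of the run is just the same code with the monitor switched; a sketch (the checkpoint filepath here is illustrative):

checkpoint_best_val = tf.keras.callbacks.ModelCheckpoint(
    filepath='model_checkpoints_best/best_val_model.h5',  # illustrative path
    save_weights_only=False, save_freq='epoch', mode='auto',
    monitor='val_loss', save_best_only=True, verbose=1)

model.fit([train_X, decoder_input_data_train], train_y,
          validation_data=([test_X, decoder_input_data_test], test_y),
          epochs=50, callbacks=[checkpoint_best_val], verbose=2, shuffle=False)

best_model_test = tf.keras.models.load_model('model_checkpoints_best/best_val_model.h5')
predicted_values = best_model_test.predict([test_X, decoder_input_data_test])
test_mse = tf.math.reduce_mean(tf.keras.losses.mean_squared_error(test_y, predicted_values))
# here test_mse does match the best val_loss printed during training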

Any help is welcome.
