I am trying to build a deep learning network (using TensorFlow Keras) that performs a graph convolution and, at each node, an LSTM computation. I want to add a MultiHeadAttention layer on top of the LSTM layer, but I am not sure I am wiring it correctly: the loss explodes during the first epochs and never beats the LSTM-only model (in fact, it ends up worse).
For context, I have also added Bayesian optimization for hyperparameter tuning (which is not the problem).
The code snippet I am having problems with is this:
class LSTMGC(layers.Layer):
"""Layer comprising a convolution layer followed by LSTM and dense layers."""
def __init__(
self,
in_feat,
out_feat,
lstm_units: int,
input_seq_len: int,
output_seq_len: int,
graph_info: GraphInfo,
graph_conv_params: typing.Optional[dict] = None,
**kwargs,
):
super().__init__(**kwargs)
# graph conv layer
if graph_conv_params is None:
graph_conv_params = {
"aggregation_type": "mean",
"combination_type": "concat",
"activation": None,
}
self.graph_conv = GraphConv(in_feat, out_feat, graph_info, **graph_conv_params)
        self.attn = layers.MultiHeadAttention(
            num_heads=8,
            key_dim=100,
            dropout=0.1,
            kernel_regularizer=tf.keras.regularizers.l1(0.1),
        )
self.lstm = layers.LSTM(lstm_units, activation="tanh", return_sequences=True)
self.dense = layers.Dense(output_seq_len)
        self.lstm_units = lstm_units
        self.input_seq_len, self.output_seq_len = input_seq_len, output_seq_len
    def call(self, inputs):
"""Forward pass.
Args:
inputs: tf.Tensor of shape `(batch_size, input_seq_len, num_nodes, in_feat)`
Returns:
A tensor of shape `(batch_size, output_seq_len, num_nodes)`.
"""
# convert shape to (num_nodes, batch_size, input_seq_len, in_feat)
inputs = tf.transpose(inputs, [2, 0, 1, 3])
gcn_out = self.graph_conv(
inputs
) # gcn_out has shape: (num_nodes, batch_size, input_seq_len, out_feat)
shape = tf.shape(gcn_out)
num_nodes, batch_size, input_seq_len, out_feat = (
shape[0],
shape[1],
shape[2],
shape[3],
)
# LSTM takes only 3D tensors as input
gcn_out = tf.reshape(gcn_out, (batch_size * num_nodes, input_seq_len, out_feat))
lstm_out = self.lstm(gcn_out)
# lstm_out has shape: (batch_size * num_nodes, input_seq_len, lstm_units)
        # self-attention over the LSTM outputs (query = value = lstm_out)
        attn_out = self.attn(lstm_out, lstm_out)
        # attn_out has shape: (batch_size * num_nodes, input_seq_len, lstm_units)
        # flatten the time and feature dimensions so the dense layer gets a 2D input
        attn_out = tf.reshape(
            attn_out, (batch_size * num_nodes, self.input_seq_len * self.lstm_units)
        )
        dense_output = self.dense(attn_out)
        # dense_output has shape: (batch_size * num_nodes, output_seq_len)
output = tf.reshape(dense_output, (num_nodes, batch_size, self.output_seq_len))
return tf.transpose(
output, [1, 2, 0]
) # returns Tensor of shape (batch_size, output_seq_len, num_nodes)
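For comparison, here is a minimal, self-contained sketch of how I understand MultiHeadAttention is usually stacked on an LSTM (Transformer-style self-attention with a residual connection and LayerNormalization, key_dim sized per head, and no strong L1 penalty). All sizes here are made up for illustration and are not the ones in my model:

import tensorflow as tf
from tensorflow.keras import layers

# Hypothetical sizes for illustration only
batch, seq_len, features = 32, 12, 16
lstm_units, num_heads = 64, 8

lstm = layers.LSTM(lstm_units, return_sequences=True)
attn = layers.MultiHeadAttention(
    num_heads=num_heads,
    key_dim=lstm_units // num_heads,  # per-head size; output is projected back to lstm_units
    dropout=0.1,
)
norm = layers.LayerNormalization()

x = tf.random.normal((batch, seq_len, features))
lstm_out = lstm(x)                   # (32, 12, 64)
attn_out = attn(lstm_out, lstm_out)  # self-attention: query = value = lstm_out
out = norm(lstm_out + attn_out)      # residual connection + layer norm, same shape
print(out.shape)                     # (32, 12, 64)

Is something like this residual wiring what I should be doing instead of my version above?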
For reference, here is my complete code:
#Import libraries
import pandas as pd
import numpy as np
import typing
import matplotlib.pyplot as plt
import time
import tensorflow as tf
import keras_tuner as kt
from keras_tuner import HyperModel
from keras_tuner import BayesianOptimization
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import timeseries_dataset_from_array
#Splitting and normalizing data
def preprocess(data_array: np.ndarray, train_size: float, val_size: float):
"""Splits data into train/val/test sets and normalizes the data.
Args:
data_array: ndarray of shape `(num_time_steps, num_routes)`
        train_size: A float value between 0.0 and 1.0 that represents the proportion of the dataset
            to include in the train split.
        val_size: A float value between 0.0 and 1.0 that represents the proportion of the dataset
            to include in the validation split.
Returns:
`train_array`, `val_array`, `test_array`
"""
num_time_steps = data_array.shape[0]
num_train, num_val = (
int(num_time_steps * train_size),
int(num_time_steps * val_size),
)
train_array = data_array[:num_train]
mean, std = train_array.mean(axis=0), train_array.std(axis=0)
train_array = (train_array - mean) / std
val_array = (data_array[num_train : (num_train + num_val)] - mean) / std
test_array = (data_array[(num_train + num_val) :] - mean) / std
return train_array, val_array, test_array
#Create TensorFlow dataset
def create_tf_dataset(
data_array: np.ndarray,
input_sequence_length: int,
forecast_horizon: int,
batch_size: int = 128,
shuffle=True,
multi_horizon=True,
):
"""Creates tensorflow dataset from numpy array.
This function creates a dataset where each element is a tuple `(inputs, targets)`.
`inputs` is a Tensor
of shape `(batch_size, input_sequence_length, num_routes, 1)` containing
the `input_sequence_length` past values of the timeseries for each node.
`targets` is a Tensor of shape `(batch_size, forecast_horizon, num_routes)`
containing the `forecast_horizon`
future values of the timeseries for each node.
Args:
data_array: np.ndarray with shape `(num_time_steps, num_routes)`
input_sequence_length: Length of the input sequence (in number of timesteps).
forecast_horizon: If `multi_horizon=True`, the target will be the values of the timeseries for 1 to
`forecast_horizon` timesteps ahead. If `multi_horizon=False`, the target will be the value of the
timeseries `forecast_horizon` steps ahead (only one value).
batch_size: Number of timeseries samples in each batch.
shuffle: Whether to shuffle output samples, or instead draw them in chronological order.
multi_horizon: See `forecast_horizon`.
Returns:
A tf.data.Dataset instance.
"""
inputs = timeseries_dataset_from_array(
data_array[:-forecast_horizon],
None,
sequence_length=input_sequence_length,
shuffle=False,
batch_size=batch_size,
)
target_offset = (
input_sequence_length
if multi_horizon
else input_sequence_length + forecast_horizon - 1
)
target_seq_length = forecast_horizon if multi_horizon else 1
targets = timeseries_dataset_from_array(
        data_array[target_offset:, :, 0],  # target is the first feature (speed)
None,
sequence_length=target_seq_length,
shuffle=False,
batch_size=batch_size,
)
dataset = tf.data.Dataset.zip((inputs, targets))
if shuffle:
dataset = dataset.shuffle(100)
return dataset.prefetch(16).cache()
#Build GraphInfo for input
class GraphInfo:
def __init__(self, edges: typing.Tuple[list, list], num_nodes: int):
self.edges = edges
self.num_nodes = num_nodes
#Network architecture GCN
class GraphConv(layers.Layer):
def __init__(
self,
in_feat,
out_feat,
graph_info: GraphInfo,
aggregation_type="mean",
combination_type="concat",
activation: typing.Optional[str] = None,
**kwargs,
):
super().__init__(**kwargs)
self.in_feat = in_feat
self.out_feat = out_feat
self.graph_info = graph_info
self.aggregation_type = aggregation_type
self.combination_type = combination_type
self.weight = tf.Variable(
initial_value=keras.initializers.glorot_uniform()(
shape=(in_feat, out_feat), dtype="float32"
),
trainable=True,
)
self.activation = layers.Activation(activation)
def aggregate(self, neighbour_representations: tf.Tensor):
aggregation_func = {
"sum": tf.math.unsorted_segment_sum,
"mean": tf.math.unsorted_segment_mean,
"max": tf.math.unsorted_segment_max,
}.get(self.aggregation_type)
if aggregation_func:
return aggregation_func(
neighbour_representations,
self.graph_info.edges[0],
num_segments=self.graph_info.num_nodes,
)
raise ValueError(f"Invalid aggregation type: {self.aggregation_type}")
def compute_nodes_representation(self, features: tf.Tensor):
"""Computes each node's representation.
The nodes' representations are obtained by multiplying the features tensor with
`self.weight`. Note that
`self.weight` has shape `(in_feat, out_feat)`.
Args:
features: Tensor of shape `(num_nodes, batch_size, input_seq_len, in_feat)`
Returns:
A tensor of shape `(num_nodes, batch_size, input_seq_len, out_feat)`
"""
return tf.matmul(features, self.weight)
def compute_aggregated_messages(self, features: tf.Tensor):
neighbour_representations = tf.gather(features, self.graph_info.edges[1])
aggregated_messages = self.aggregate(neighbour_representations)
return tf.matmul(aggregated_messages, self.weight)
def update(self, nodes_representation: tf.Tensor, aggregated_messages: tf.Tensor):
if self.combination_type == "concat":
h = tf.concat([nodes_representation, aggregated_messages], axis=-1)
elif self.combination_type == "add":
h = nodes_representation + aggregated_messages
else:
raise ValueError(f"Invalid combination type: {self.combination_type}.")
return self.activation(h)
def call(self, features: tf.Tensor):
"""Forward pass.
Args:
features: tensor of shape `(num_nodes, batch_size, input_seq_len, in_feat)`
Returns:
A tensor of shape `(num_nodes, batch_size, input_seq_len, out_feat)`
"""
nodes_representation = self.compute_nodes_representation(features)
aggregated_messages = self.compute_aggregated_messages(features)
return self.update(nodes_representation, aggregated_messages)
#Network architecture: LSTM + attention (LSTMGC)
class LSTMGC(layers.Layer):
"""Layer comprising a convolution layer followed by LSTM and dense layers."""
def __init__(
self,
in_feat,
out_feat,
lstm_units: int,
input_seq_len: int,
output_seq_len: int,
graph_info: GraphInfo,
graph_conv_params: typing.Optional[dict] = None,
**kwargs,
):
super().__init__(**kwargs)
# graph conv layer
if graph_conv_params is None:
graph_conv_params = {
"aggregation_type": "mean",
"combination_type": "concat",
"activation": None,
}
self.graph_conv = GraphConv(in_feat, out_feat, graph_info, **graph_conv_params)
        self.attn = layers.MultiHeadAttention(
            num_heads=8,
            key_dim=100,
            dropout=0.1,
            kernel_regularizer=tf.keras.regularizers.l1(0.1),
        )
self.lstm = layers.LSTM(lstm_units, activation="tanh", return_sequences=True)
self.dense = layers.Dense(output_seq_len)
        self.lstm_units = lstm_units
        self.input_seq_len, self.output_seq_len = input_seq_len, output_seq_len
    def call(self, inputs):
"""Forward pass.
Args:
inputs: tf.Tensor of shape `(batch_size, input_seq_len, num_nodes, in_feat)`
Returns:
A tensor of shape `(batch_size, output_seq_len, num_nodes)`.
"""
# convert shape to (num_nodes, batch_size, input_seq_len, in_feat)
inputs = tf.transpose(inputs, [2, 0, 1, 3])
gcn_out = self.graph_conv(
inputs
) # gcn_out has shape: (num_nodes, batch_size, input_seq_len, out_feat)
shape = tf.shape(gcn_out)
num_nodes, batch_size, input_seq_len, out_feat = (
shape[0],
shape[1],
shape[2],
shape[3],
)
# LSTM takes only 3D tensors as input
gcn_out = tf.reshape(gcn_out, (batch_size * num_nodes, input_seq_len, out_feat))
lstm_out = self.lstm(gcn_out)
# lstm_out has shape: (batch_size * num_nodes, input_seq_len, lstm_units)
        # self-attention over the LSTM outputs (query = value = lstm_out)
        attn_out = self.attn(lstm_out, lstm_out)
        # attn_out has shape: (batch_size * num_nodes, input_seq_len, lstm_units)
        # flatten the time and feature dimensions so the dense layer gets a 2D input
        attn_out = tf.reshape(
            attn_out, (batch_size * num_nodes, self.input_seq_len * self.lstm_units)
        )
        dense_output = self.dense(attn_out)
        # dense_output has shape: (batch_size * num_nodes, output_seq_len)
output = tf.reshape(dense_output, (num_nodes, batch_size, self.output_seq_len))
return tf.transpose(
output, [1, 2, 0]
) # returns Tensor of shape (batch_size, output_seq_len, num_nodes)
#####################################################################
#Import data and reshape
adjacency_matrix = pd.read_excel(r'C:\Users\valer\Desktop\dummy_set.xlsx',
header=None, index_col=None, sheet_name="dist").to_numpy()
speeds_array = pd.read_excel(r'C:\Users\valer\Desktop\dummy_set.xlsx',
header=None, index_col=None, sheet_name="speed").to_numpy()
social = pd.read_excel(r'C:\Users\valer\Desktop\dummy_set.xlsx',
header=None, index_col=None, sheet_name="social").to_numpy()
#reshape time-dependent features
speeds_array = speeds_array.reshape(np.shape(speeds_array)[0], np.shape(speeds_array)[1],1)
social = social.reshape(np.shape(social)[0], np.shape(social)[1],1)
data_array = np.concatenate((speeds_array, social), axis=2)
print("shape of data_array",np.shape(data_array))
#Splitting and normalizing data
train_size, val_size = 0.5, 0.2
train_array, val_array, test_array = preprocess(data_array, train_size, val_size)
print(f"train set size: {train_array.shape}")
print(f"validation set size: {val_array.shape}")
print(f"test set size: {test_array.shape}")
#Create tensorflow datasets
batch_size = 64
input_sequence_length = 12
forecast_horizon = 3
multi_horizon = False
train_dataset, val_dataset = (
create_tf_dataset(data_array, input_sequence_length, forecast_horizon, batch_size)
for data_array in [train_array, val_array]
)
test_dataset = create_tf_dataset(
test_array,
input_sequence_length,
forecast_horizon,
batch_size=test_array.shape[0],
shuffle=False,
multi_horizon=multi_horizon,
)
#Create graph info
node_indices, neighbor_indices = np.where(adjacency_matrix == 1)
graph = GraphInfo(
edges=(node_indices.tolist(), neighbor_indices.tolist()),
num_nodes=adjacency_matrix.shape[0],
)
print(f"number of nodes: {graph.num_nodes}, number of edges: {len(graph.edges[0])}")
#Hyperparameter tuning with Bayesian Optimization
#Change parameters before running
#HYPERPARAMETERS: learning_rate, lstm_units
def model_builder(hp):
in_feat = np.shape(data_array)[2] ###CHANGE NUMBER OF INPUTS (VARIABLES+TARGET)
batch_size = 64
epochs = 5
input_sequence_length = 12
forecast_horizon = 3
multi_horizon = False
out_feat = 10
graph_conv_params = {
"aggregation_type": "mean",
"combination_type": "concat",
"activation": None,
}
    lstm_units = hp.Int(name='lstm_units', min_value=32, max_value=160, step=16)
    learning_rate = hp.Choice(name='learning_rate', values=[0.1, 0.01, 0.001, 0.0001])
#Defining layers for model
st_gcn = LSTMGC(
in_feat,
out_feat,
lstm_units,
input_sequence_length,
forecast_horizon,
graph,
graph_conv_params,
)
inputs = layers.Input((input_sequence_length, graph.num_nodes, in_feat))
    outputs = st_gcn(inputs)
model = keras.models.Model(inputs, outputs)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
loss=keras.losses.MeanSquaredError())
return model
#Initialize tuner
tuner = kt.BayesianOptimization(
hypermodel=model_builder,
objective="val_loss",
max_trials=5,
executions_per_trial=2,
overwrite=True,
directory="my_dir",
project_name="original",
)
#search for best parameters
tuner.search(train_dataset, epochs=5, validation_data=val_dataset)
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hps.values)
# Build the model with the optimal hyperparameters and train it on the data for 50 epochs
model = tuner.hypermodel.build(best_hps)
history = model.fit(train_dataset, epochs=50, validation_data=val_dataset)
#Train model with best hps and best # of epochs
val_loss_per_epoch = history.history['val_loss']
best_epoch = val_loss_per_epoch.index(min(val_loss_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))
hypermodel = tuner.hypermodel.build(best_hps)
hypermodel.fit(train_dataset, epochs=best_epoch, validation_data=val_dataset)
#Evaluate model on the test set
eval_result = hypermodel.evaluate(test_dataset)
print("test loss (MSE):", eval_result)
#Predictions for test set
x_test, y = next(test_dataset.as_numpy_iterator())
y_pred = hypermodel.predict(x_test)
plt.figure(figsize=(18, 6))
plt.plot(y[:, 0, 0])
plt.plot(y_pred[:, 0, 0])
plt.legend(["actual", "forecast"])
naive_mse, model_mse = (
np.square(x_test[:, -1, :, 0] - y[:, 0, :]).mean(),
np.square(y_pred[:, 0, :] - y[:, 0, :]).mean(),
)
print(f"naive MAE: {naive_mse}, model MAE: {model_mse}")
I have tried reshaping the tensor right after the LSTM layer, but that did not make much sense, since the attention layer needs a 3D input; a sketch of the variant I have been considering follows below.
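That variant keeps everything 3D through the attention layer and only slices out the last timestep before the dense layer, so no flattening is needed at all. A self-contained sketch with made-up sizes (not my actual model):

import tensorflow as tf
from tensorflow.keras import layers

# Made-up sizes mirroring the shapes inside LSTMGC.call
lstm_units, output_seq_len = 64, 3
lstm_out = tf.random.normal((640, 12, lstm_units))  # (batch_size * num_nodes, input_seq_len, lstm_units)

attn = layers.MultiHeadAttention(num_heads=8, key_dim=lstm_units // 8)
dense = layers.Dense(output_seq_len)

attn_out = attn(lstm_out, lstm_out)  # attention keeps the 3D shape: (640, 12, 64)
last_step = attn_out[:, -1, :]       # take only the last timestep: (640, 64)
dense_out = dense(last_step)         # (640, output_seq_len), no flatten needed
print(dense_out.shape)               # (640, 3)

Would slicing the last timestep like this be the cleaner way to feed the dense layer, or is the flatten in my code above fine?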