How to properly add a MultiHeadAttention Keras layer to an LSTM?


I am trying to build a deep learning network (using TensorFlow Keras) that performs a graph convolution and, at each node, an LSTM computation. I want to add a MultiHeadAttention layer after the LSTM layer, but I am not sure I am wiring it up correctly: the loss explodes in the first epochs, and the model never achieves a better loss than the LSTM-only model (it is actually worse).

For context, I use Bayesian optimization for hyperparameter tuning (which is not the problem).

The code snippet I am having problems with is this:

class LSTMGC(layers.Layer):
    """Layer comprising a convolution layer followed by LSTM and dense layers."""

    def __init__(
        self,
        in_feat,
        out_feat,
        lstm_units: int,
        input_seq_len: int,
        output_seq_len: int,
        graph_info: GraphInfo,
        graph_conv_params: typing.Optional[dict] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # graph conv layer
        if graph_conv_params is None:
            graph_conv_params = {
                "aggregation_type": "mean",
                "combination_type": "concat",
                "activation": None,
            }
            
        self.graph_conv = GraphConv(in_feat, out_feat, graph_info, **graph_conv_params)
        self.attn = layers.MultiHeadAttention(
            num_heads=8,
            key_dim=100,
            dropout=0.1,
            kernel_regularizer=tf.keras.regularizers.l1(0.1),
        )
        self.lstm = layers.LSTM(lstm_units, activation="tanh", return_sequences=True)
        self.dense = layers.Dense(output_seq_len)
        self.lstm_units = lstm_units
        self.input_seq_len, self.output_seq_len = input_seq_len, output_seq_len

    def call(self, inputs):
        """Forward pass.

        Args:
            inputs: tf.Tensor of shape `(batch_size, input_seq_len, num_nodes, in_feat)`

        Returns:
            A tensor of shape `(batch_size, output_seq_len, num_nodes)`.
        """
        # convert shape to  (num_nodes, batch_size, input_seq_len, in_feat)
        inputs = tf.transpose(inputs, [2, 0, 1, 3])

        gcn_out = self.graph_conv(
            inputs
        )  # gcn_out has shape: (num_nodes, batch_size, input_seq_len, out_feat)
        shape = tf.shape(gcn_out)
        num_nodes, batch_size, input_seq_len, out_feat = (
            shape[0],
            shape[1],
            shape[2],
            shape[3],
        )

        # LSTM takes only 3D tensors as input
        gcn_out = tf.reshape(gcn_out, (batch_size * num_nodes, input_seq_len, out_feat))
        lstm_out = self.lstm(gcn_out)
        # lstm_out has shape: (batch_size * num_nodes, input_seq_len, lstm_units)
        # self-attention over the LSTM outputs: query = value = lstm_out
        attn_out = self.attn(lstm_out, lstm_out)
        # attn_out has shape: (batch_size * num_nodes, input_seq_len, lstm_units)
        # flatten the time axis so the dense layer sees a 2D tensor
        attn_out = tf.reshape(
            attn_out, (batch_size * num_nodes, self.input_seq_len * self.lstm_units)
        )
        dense_output = self.dense(attn_out)
        # dense_output has shape: (batch_size * num_nodes, output_seq_len)
        output = tf.reshape(dense_output, (num_nodes, batch_size, self.output_seq_len))
        return tf.transpose(
            output, [1, 2, 0]
        )  # returns Tensor of shape (batch_size, output_seq_len, num_nodes)
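To sanity-check the attention block in isolation, here is a minimal standalone sketch on dummy tensors (the shapes and the key_dim = units // num_heads choice are illustrative assumptions; the residual + LayerNormalization wrapper at the end is the standard Transformer-style pattern, not something already in my model above):

import tensorflow as tf
from tensorflow.keras import layers

# dummy stand-in for lstm_out: (batch_size * num_nodes, input_seq_len, lstm_units)
batch_nodes, seq_len, units = 32, 12, 64
lstm_out = tf.random.normal((batch_nodes, seq_len, units))

attn = layers.MultiHeadAttention(num_heads=8, key_dim=units // 8, dropout=0.1)
norm = layers.LayerNormalization()

# self-attention: query = value = lstm_out; the output keeps the query's shape
attn_out = attn(lstm_out, lstm_out)
print(attn_out.shape)  # (32, 12, 64)

# residual connection + layer norm, commonly used to stabilize attention blocks
stabilized = norm(lstm_out + attn_out)
print(stabilized.shape)  # (32, 12, 64)

This at least confirms that self-attention preserves the (sequence, features) layout, so the flatten-then-dense step after it is shape-consistent.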

For reference, here is my complete code:

#Import libraries
import pandas as pd
import numpy as np
import typing
import matplotlib.pyplot as plt
import time

import tensorflow as tf
import keras_tuner as kt
from keras_tuner import HyperModel
from keras_tuner import BayesianOptimization
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import timeseries_dataset_from_array

#Splitting and normalizing data
def preprocess(data_array: np.ndarray, train_size: float, val_size: float):
    """Splits data into train/val/test sets and normalizes the data.
    Args:
        data_array: ndarray of shape `(num_time_steps, num_routes)`
        train_size: A float between 0.0 and 1.0 representing the proportion of the dataset
            to include in the train split.
        val_size: A float between 0.0 and 1.0 representing the proportion of the dataset
            to include in the validation split.
    Returns:
        `train_array`, `val_array`, `test_array`
    """
    num_time_steps = data_array.shape[0]
    num_train, num_val = (
        int(num_time_steps * train_size),
        int(num_time_steps * val_size),
    )
    train_array = data_array[:num_train]
    mean, std = train_array.mean(axis=0), train_array.std(axis=0)
    train_array = (train_array - mean) / std
    val_array = (data_array[num_train : (num_train + num_val)] - mean) / std
    test_array = (data_array[(num_train + num_val) :] - mean) / std
    return train_array, val_array, test_array
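
# Optional sanity check for preprocess on dummy data (illustration only;
# the shapes below are made up, not my real dataset):
_dummy = np.random.rand(100, 5)
_tr, _va, _te = preprocess(_dummy, train_size=0.5, val_size=0.2)
print(_tr.shape, _va.shape, _te.shape)  # (50, 5) (20, 5) (30, 5)
print(_tr.mean(axis=0).round(2))        # ~0 per route after normalization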

#Create TensorFlow dataset
def create_tf_dataset(
    data_array: np.ndarray,
    input_sequence_length: int,
    forecast_horizon: int,
    batch_size: int = 128,
    shuffle=True,
    multi_horizon=True,
):
    """Creates tensorflow dataset from numpy array.

    This function creates a dataset where each element is a tuple `(inputs, targets)`.
    `inputs` is a Tensor
    of shape `(batch_size, input_sequence_length, num_routes, 1)` containing
    the `input_sequence_length` past values of the timeseries for each node.
    `targets` is a Tensor of shape `(batch_size, forecast_horizon, num_routes)`
    containing the `forecast_horizon`
    future values of the timeseries for each node.

    Args:
        data_array: np.ndarray with shape `(num_time_steps, num_routes)`
        input_sequence_length: Length of the input sequence (in number of timesteps).
        forecast_horizon: If `multi_horizon=True`, the target will be the values of the timeseries for 1 to
            `forecast_horizon` timesteps ahead. If `multi_horizon=False`, the target will be the value of the
            timeseries `forecast_horizon` steps ahead (only one value).
        batch_size: Number of timeseries samples in each batch.
        shuffle: Whether to shuffle output samples, or instead draw them in chronological order.
        multi_horizon: See `forecast_horizon`.

    Returns:
        A tf.data.Dataset instance.
    """

    inputs = timeseries_dataset_from_array(
        data_array[:-forecast_horizon],
        None,
        sequence_length=input_sequence_length,
        shuffle=False,
        batch_size=batch_size,
    )

    target_offset = (
        input_sequence_length
        if multi_horizon
        else input_sequence_length + forecast_horizon - 1
    )
    target_seq_length = forecast_horizon if multi_horizon else 1
    targets = timeseries_dataset_from_array(
        data_array[target_offset:,:,0],
        None,
        sequence_length=target_seq_length,
        shuffle=False,
        batch_size=batch_size,
    )

    dataset = tf.data.Dataset.zip((inputs, targets))
    if shuffle:
        dataset = dataset.shuffle(100)

    return dataset.prefetch(16).cache()
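
# Optional sanity check for create_tf_dataset on dummy 3D data
# (200 timesteps, 5 routes, 2 features; assumed values for illustration):
_dummy3d = np.random.rand(200, 5, 2)
_ds = create_tf_dataset(_dummy3d, input_sequence_length=12, forecast_horizon=3,
                        batch_size=8, shuffle=False)
_x, _y = next(iter(_ds))
print("inputs:", _x.shape, "targets:", _y.shape)  # (8, 12, 5, 2) (8, 3, 5)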

#Build GraphInfo for input
class GraphInfo:
    def __init__(self, edges: typing.Tuple[list, list], num_nodes: int):
        self.edges = edges
        self.num_nodes = num_nodes

#Network architecture GCN
class GraphConv(layers.Layer):
    def __init__(
        self,
        in_feat,
        out_feat,
        graph_info: GraphInfo,
        aggregation_type="mean",
        combination_type="concat",
        activation: typing.Optional[str] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.in_feat = in_feat
        self.out_feat = out_feat
        self.graph_info = graph_info
        self.aggregation_type = aggregation_type
        self.combination_type = combination_type
        self.weight = tf.Variable(
            initial_value=keras.initializers.glorot_uniform()(
                shape=(in_feat, out_feat), dtype="float32"
            ),
            trainable=True,
        )
        self.activation = layers.Activation(activation)

    def aggregate(self, neighbour_representations: tf.Tensor):
        aggregation_func = {
            "sum": tf.math.unsorted_segment_sum,
            "mean": tf.math.unsorted_segment_mean,
            "max": tf.math.unsorted_segment_max,
        }.get(self.aggregation_type)

        if aggregation_func:
            return aggregation_func(
                neighbour_representations,
                self.graph_info.edges[0],
                num_segments=self.graph_info.num_nodes,
            )

        raise ValueError(f"Invalid aggregation type: {self.aggregation_type}")

    def compute_nodes_representation(self, features: tf.Tensor):
        """Computes each node's representation.

        The nodes' representations are obtained by multiplying the features tensor with
        `self.weight`. Note that
        `self.weight` has shape `(in_feat, out_feat)`.

        Args:
            features: Tensor of shape `(num_nodes, batch_size, input_seq_len, in_feat)`

        Returns:
            A tensor of shape `(num_nodes, batch_size, input_seq_len, out_feat)`
        """
        return tf.matmul(features, self.weight)

    def compute_aggregated_messages(self, features: tf.Tensor):
        neighbour_representations = tf.gather(features, self.graph_info.edges[1])
        aggregated_messages = self.aggregate(neighbour_representations)
        return tf.matmul(aggregated_messages, self.weight)

    def update(self, nodes_representation: tf.Tensor, aggregated_messages: tf.Tensor):
        if self.combination_type == "concat":
            h = tf.concat([nodes_representation, aggregated_messages], axis=-1)
        elif self.combination_type == "add":
            h = nodes_representation + aggregated_messages
        else:
            raise ValueError(f"Invalid combination type: {self.combination_type}.")

        return self.activation(h)

    def call(self, features: tf.Tensor):
        """Forward pass.

        Args:
            features: tensor of shape `(num_nodes, batch_size, input_seq_len, in_feat)`

        Returns:
            A tensor of shape `(num_nodes, batch_size, input_seq_len, out_feat)`
        """
        nodes_representation = self.compute_nodes_representation(features)
        aggregated_messages = self.compute_aggregated_messages(features)
        return self.update(nodes_representation, aggregated_messages)
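
# Optional sanity check for GraphConv on a toy two-node graph
# (edges and feature sizes are assumed values, illustration only):
_toy_graph = GraphInfo(edges=([0, 1], [1, 0]), num_nodes=2)
_gc = GraphConv(in_feat=2, out_feat=4, graph_info=_toy_graph)
_feats = tf.random.normal((2, 3, 12, 2))  # (num_nodes, batch, seq_len, in_feat)
print(_gc(_feats).shape)  # (2, 3, 12, 8) -- "concat" doubles out_feat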

#Network architecture: LSTM
class LSTMGC(layers.Layer):
    """Layer comprising a convolution layer followed by LSTM and dense layers."""

    def __init__(
        self,
        in_feat,
        out_feat,
        lstm_units: int,
        input_seq_len: int,
        output_seq_len: int,
        graph_info: GraphInfo,
        graph_conv_params: typing.Optional[dict] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # graph conv layer
        if graph_conv_params is None:
            graph_conv_params = {
                "aggregation_type": "mean",
                "combination_type": "concat",
                "activation": None,
            }
            
        self.graph_conv = GraphConv(in_feat, out_feat, graph_info, **graph_conv_params)
        self.attn = layers.MultiHeadAttention(
            num_heads=8,
            key_dim=100,
            dropout=0.1,
            kernel_regularizer=tf.keras.regularizers.l1(0.1),
        )
        self.lstm = layers.LSTM(lstm_units, activation="tanh", return_sequences=True)
        self.dense = layers.Dense(output_seq_len)
        self.lstm_units = lstm_units
        self.input_seq_len, self.output_seq_len = input_seq_len, output_seq_len

    def call(self, inputs):
        """Forward pass.

        Args:
            inputs: tf.Tensor of shape `(batch_size, input_seq_len, num_nodes, in_feat)`

        Returns:
            A tensor of shape `(batch_size, output_seq_len, num_nodes)`.
        """
        # convert shape to  (num_nodes, batch_size, input_seq_len, in_feat)
        inputs = tf.transpose(inputs, [2, 0, 1, 3])

        gcn_out = self.graph_conv(
            inputs
        )  # gcn_out has shape: (num_nodes, batch_size, input_seq_len, out_feat)
        shape = tf.shape(gcn_out)
        num_nodes, batch_size, input_seq_len, out_feat = (
            shape[0],
            shape[1],
            shape[2],
            shape[3],
        )

        # LSTM takes only 3D tensors as input
        gcn_out = tf.reshape(gcn_out, (batch_size * num_nodes, input_seq_len, out_feat))
        lstm_out = self.lstm(gcn_out)
        # lstm_out has shape: (batch_size * num_nodes, input_seq_len, lstm_units)
        # self-attention over the LSTM outputs: query = value = lstm_out
        attn_out = self.attn(lstm_out, lstm_out)
        # attn_out has shape: (batch_size * num_nodes, input_seq_len, lstm_units)
        # flatten the time axis so the dense layer sees a 2D tensor
        attn_out = tf.reshape(
            attn_out, (batch_size * num_nodes, self.input_seq_len * self.lstm_units)
        )
        dense_output = self.dense(attn_out)
        # dense_output has shape: (batch_size * num_nodes, output_seq_len)
        output = tf.reshape(dense_output, (num_nodes, batch_size, self.output_seq_len))
        return tf.transpose(
            output, [1, 2, 0]
        )  # returns Tensor of shape (batch_size, output_seq_len, num_nodes)
    
##################################################################### 

#Import data and reshape
adjacency_matrix = pd.read_excel(r'C:\Users\valer\Desktop\dummy_set.xlsx', 
                                 header=None, index_col=None, sheet_name="dist").to_numpy()
speeds_array = pd.read_excel(r'C:\Users\valer\Desktop\dummy_set.xlsx', 
                             header=None, index_col=None, sheet_name="speed").to_numpy()
social = pd.read_excel(r'C:\Users\valer\Desktop\dummy_set.xlsx', 
                       header=None, index_col=None, sheet_name="social").to_numpy()

#reshape time-dependent features
speeds_array = speeds_array.reshape(np.shape(speeds_array)[0], np.shape(speeds_array)[1],1)
social = social.reshape(np.shape(social)[0], np.shape(social)[1],1)
data_array = np.concatenate((speeds_array, social), axis=2)
print("shape of data_array",np.shape(data_array))

#Splitting and normalizing data
train_size, val_size = 0.5, 0.2
train_array, val_array, test_array = preprocess(data_array, train_size, val_size)
print(f"train set size: {train_array.shape}")
print(f"validation set size: {val_array.shape}")
print(f"test set size: {test_array.shape}")

#Create tensorflow datasets
batch_size = 64
input_sequence_length = 12
forecast_horizon = 3
multi_horizon = False

train_dataset, val_dataset = (
    create_tf_dataset(data_array, input_sequence_length, forecast_horizon, batch_size)
    for data_array in [train_array, val_array]
)

test_dataset = create_tf_dataset(
    test_array,
    input_sequence_length,
    forecast_horizon,
    batch_size=test_array.shape[0],
    shuffle=False,
    multi_horizon=multi_horizon,
)

#Create graph info
node_indices, neighbor_indices = np.where(adjacency_matrix == 1)
graph = GraphInfo(
    edges=(node_indices.tolist(), neighbor_indices.tolist()),
    num_nodes=adjacency_matrix.shape[0],
)

print(f"number of nodes: {graph.num_nodes}, number of edges: {len(graph.edges[0])}")

#Hyperparameter tuning with Bayesian Optimization
#Change parameters before running
#HYPERPARAMETERS: learning_rate, lstm_units
def model_builder(hp):
    in_feat = np.shape(data_array)[2]                       ###CHANGE NUMBER OF INPUTS (VARIABLES+TARGET)
    batch_size = 64                                   
    epochs = 5
    input_sequence_length = 12
    forecast_horizon = 3
    multi_horizon = False
    out_feat = 10
    graph_conv_params = {
        "aggregation_type": "mean",
        "combination_type": "concat",
        "activation": None,
    }
    lstm_units = hp.Int("lstm_units", min_value=32, max_value=160, step=16)
    learning_rate = hp.Choice("learning_rate", values=[0.1, 0.01, 0.001, 0.0001])
    
    #Defining layers for model
    st_gcn = LSTMGC(
        in_feat,
        out_feat,
        lstm_units,
        input_sequence_length,
        forecast_horizon,
        graph,
        graph_conv_params,
    )
    inputs = layers.Input((input_sequence_length, graph.num_nodes, in_feat))
    outputs = st_gcn(inputs)
    model = keras.models.Model(inputs, outputs)
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
         loss=keras.losses.MeanSquaredError())
    return model
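
# A variant I may try against the exploding loss (clipnorm=1.0 is an assumed,
# untuned value): clip gradient norms in the optimizer, e.g.
#   keras.optimizers.Adam(learning_rate=learning_rate, clipnorm=1.0)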

#Initialize tuner
tuner = kt.BayesianOptimization(
    hypermodel=model_builder,
    objective="val_loss",
    max_trials=5,
    executions_per_trial=2,
    overwrite=True,
    directory="my_dir",
    project_name="original",
)

#search for best parameters
tuner.search(train_dataset, epochs=5, validation_data=val_dataset)
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hps.values)

# Build the model with the optimal hyperparameters and train it on the data for 50 epochs
model = tuner.hypermodel.build(best_hps)
history = model.fit(train_dataset, epochs=50, validation_data=val_dataset)

#Train model with best hps and best # of epochs
val_loss_per_epoch = history.history['val_loss']
best_epoch = val_loss_per_epoch.index(min(val_loss_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))
hypermodel = tuner.hypermodel.build(best_hps)
hypermodel.fit(train_dataset, epochs=best_epoch, validation_data=val_dataset)

#Evaluate model
eval_result = hypermodel.evaluate(test_dataset)
print("test loss (MSE):", eval_result)

#Predictions for test set
x_test, y = next(test_dataset.as_numpy_iterator())
y_pred = hypermodel.predict(x_test)
plt.figure(figsize=(18, 6))
plt.plot(y[:, 0, 0])
plt.plot(y_pred[:, 0, 0])
plt.legend(["actual", "forecast"])

naive_mse, model_mse = (
    np.square(x_test[:, -1, :, 0] - y[:, 0, :]).mean(),
    np.square(y_pred[:, 0, :] - y[:, 0, :]).mean(),
)
print(f"naive MAE: {naive_mse}, model MAE: {model_mse}")

I have tried reshaping the tensor after the LSTM layer, but that did not make much sense, since the attention layer needs a 3D input.
