How to add a multihead attention layer to a CNN-LSTM model?


I'm trying to build a hybrid binary text classification model that combines a CNN-LSTM with a multi-head attention mechanism. However, I'm running into an error when I pass the output of the CNN-LSTM layers to the attention layer.

This is what I tried.

Here's the code for defining the multi-head attention layer (a transformer block):

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Conv1D, Dropout, MaxPooling1D, LSTM

class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # feed-forward sub-network; Sequential (not keras.Model) takes a list of layers
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
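
As a sanity check (with made-up sizes, not my real hyperparameters), the block works fine on its own when it gets a 3-D (batch, seq_len, embed_dim) tensor:

block = TransformerBlock(embed_dim=32, num_heads=2, ff_dim=64)
dummy = tf.random.uniform((4, 10, 32))       # (batch, seq_len, embed_dim)
print(block(dummy, training=False).shape)    # -> (4, 10, 32)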

This one defines the token and position embeddings:

class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions
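
For reference, with placeholder sizes (maxlen=200, vocab_size=20000, embed_dim=32 are just example values, not my real settings), this layer returns a 3-D tensor:

emb = TokenAndPositionEmbedding(maxlen=200, vocab_size=20000, embed_dim=32)
token_ids = tf.random.uniform((4, 200), maxval=20000, dtype=tf.int32)
print(emb(token_ids).shape)                  # -> (4, 200, 32)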

Defining the whole architecture:

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
x = Conv1D(256, 3, activation='relu')(x)
x = Dropout(0.5)(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = Dropout(0.5)(x)
x = LSTM(64, return_sequences=True)(x)
x = Dropout(0.5)(x)
x = LSTM(32)(x)
x = TransformerBlock(256, num_heads, ff_dim)(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
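
If I trace the shapes with the same placeholder values (maxlen=200, vocab_size=20000, embed_dim=32; dropout layers omitted since they don't change shapes), the second LSTM drops the time dimension, so the transformer block receives a 2-D tensor:

trace_in = layers.Input(shape=(200,))
t = TokenAndPositionEmbedding(200, 20000, 32)(trace_in)  # (None, 200, 32)
t = Conv1D(256, 3, activation='relu')(t)                 # (None, 198, 256)
t = MaxPooling1D(3)(t)                                   # (None, 66, 256)
t = Conv1D(128, 3, activation='relu')(t)                 # (None, 64, 128)
t = LSTM(64, return_sequences=True)(t)                   # (None, 64, 64)
t = LSTM(32)(t)                                          # (None, 32)  <- no time axis left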

The error

StagingError                              Traceback (most recent call last)

<ipython-input-19-ea38c3c52d0e> in <module>
     10 x = Dropout(0.5)(x)
     11 x = LSTM(32)(x)
---> 12 x = TransformerBlock(256, num_heads, ff_dim)(x)
     13 x = layers.GlobalAveragePooling1D()(x)
     14 x = layers.Dropout(0.1)(x)

1 frames

/usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/impl/api.py in wrapper(*args, **kwargs)
    690       except Exception as e:  # pylint:disable=broad-except
    691         if hasattr(e, 'ag_error_metadata'):
--> 692           raise e.ag_error_metadata.to_exception(e)
    693         else:
    694           raise

StagingError: Exception encountered when calling layer "transformer_block" (type TransformerBlock).

in user code:

    File "<ipython-input-17-4c5de9a08c11>", line 14, in call  *
        attn_output = self.att(inputs, inputs)
    File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 67, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "/usr/local/lib/python3.8/dist-packages/keras/layers/activation/softmax.py", line 98, in call
        return backend.softmax(inputs, axis=self.axis[0])

    IndexError: Exception encountered when calling layer "softmax" (type Softmax).
    
    tuple index out of range
    
    Call arguments received by layer "softmax" (type Softmax):
      • inputs=tf.Tensor(shape=(None, 8), dtype=float32)
      • mask=None


Call arguments received by layer "transformer_block" (type TransformerBlock):
  • inputs=tf.Tensor(shape=(None, 32), dtype=float32)
  • training=False
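
So the transformer block is receiving the (None, 32) output of the last LSTM instead of a (batch, timesteps, features) sequence. One variant I tried (I'm not sure it's the right way to combine these layers) keeps return_sequences=True on the second LSTM and sets embed_dim to match its units so the residual additions inside the block line up:

x = LSTM(32, return_sequences=True)(x)             # (None, 64, 32) - keeps the time axis
x = TransformerBlock(32, num_heads, ff_dim)(x)     # embed_dim must equal the feature size (32)
x = layers.GlobalAveragePooling1D()(x)

Is this the right way to feed the CNN-LSTM output into the multi-head attention layer, or should the attention block go somewhere else in the stack?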
