I'm trying to build a hybrid binary text classification model that combines a CNN-LSTM with a multi-head attention mechanism. However, I'm running into an error when passing the output of the CNN-LSTM to the attention layer.
This is what I tried.
Here's the code defining the multi-head attention (transformer) block:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        # Self-attention: query, key and value are all the same tensor.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)  # residual connection
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)  # second residual connection
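On its own the block runs fine when I feed it a 3D tensor. A quick standalone check (the num_heads, ff_dim and shape values here are throwaway, just for the sanity check):

block = TransformerBlock(embed_dim=256, num_heads=2, ff_dim=32)  # arbitrary test values
dummy = tf.random.uniform((4, 10, 256))  # (batch, seq_len, embed_dim)
print(block(dummy, training=False).shape)  # (4, 10, 256)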
This one defines the token and position embeddings:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions
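This layer also behaves as expected in isolation (again, arbitrary values just for the check):

emb = TokenAndPositionEmbedding(maxlen=50, vocab_size=1000, embed_dim=256)  # test values
ids = tf.random.uniform((4, 50), maxval=1000, dtype=tf.int32)  # fake token ids
print(emb(ids).shape)  # (4, 50, 256): one embedding vector per position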
Defining the whole architecture:
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
x = layers.Conv1D(256, 3, activation="relu")(x)
x = layers.Dropout(0.5)(x)
x = layers.MaxPooling1D(3)(x)
x = layers.Conv1D(128, 3, activation="relu")(x)
x = layers.Dropout(0.5)(x)
x = layers.LSTM(64, return_sequences=True)(x)
x = layers.Dropout(0.5)(x)
x = layers.LSTM(32)(x)
x = TransformerBlock(256, num_heads, ff_dim)(x)  # <-- this line raises the error below
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs=inputs, outputs=outputs)
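For what it's worth, printing the symbolic shapes while building (a quick check I added while debugging) shows that everything stays 3D until the second LSTM, which collapses the time axis:

x = layers.LSTM(64, return_sequences=True)(x)
print(x.shape)  # (None, timesteps, 64) -- still 3D
x = layers.LSTM(32)(x)
print(x.shape)  # (None, 32) -- 2D: the sequence axis is gone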
The error:
StagingError Traceback (most recent call last)
<ipython-input-19-ea38c3c52d0e> in <module>
10 x = Dropout(0.5)(x)
11 x = LSTM(32)(x)
---> 12 x = TransformerBlock(256, num_heads, ff_dim)(x)
13 x = layers.GlobalAveragePooling1D()(x)
14 x = layers.Dropout(0.1)(x)
1 frames
/usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/impl/api.py in wrapper(*args, **kwargs)
690 except Exception as e: # pylint:disable=broad-except
691 if hasattr(e, 'ag_error_metadata'):
--> 692 raise e.ag_error_metadata.to_exception(e)
693 else:
694 raise
StagingError: Exception encountered when calling layer "transformer_block" (type TransformerBlock).
in user code:
File "<ipython-input-17-4c5de9a08c11>", line 14, in call *
attn_output = self.att(inputs, inputs)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 67, in error_handler **
raise e.with_traceback(filtered_tb) from None
File "/usr/local/lib/python3.8/dist-packages/keras/layers/activation/softmax.py", line 98, in call
return backend.softmax(inputs, axis=self.axis[0])
IndexError: Exception encountered when calling layer "softmax" (type Softmax).
tuple index out of range
Call arguments received by layer "softmax" (type Softmax):
• inputs=tf.Tensor(shape=(None, 8), dtype=float32)
• mask=None
Call arguments received by layer "transformer_block" (type TransformerBlock):
• inputs=tf.Tensor(shape=(None, 32), dtype=float32)
• training=False
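From the call arguments at the bottom of the traceback, the TransformerBlock receives a 2D tensor of shape (None, 32), while MultiHeadAttention expects a 3D (batch, seq_len, features) input. I suspect the second LSTM needs return_sequences=True so the time axis is preserved, with the block's embed_dim matched to the LSTM width. Is this the right way to wire it? A sketch of what I think the fix looks like:

x = layers.LSTM(32, return_sequences=True)(x)   # keep the time axis: (None, timesteps, 32)
x = TransformerBlock(32, num_heads, ff_dim)(x)  # embed_dim must match the feature axis (32)
x = layers.GlobalAveragePooling1D()(x)          # now there is a sequence to pool over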