In this code, the visual embeddings are projected and then concatenated with the text embeddings:
# Project each visual feature vector into the text model's hidden space.
# Flatten to (batch * num_frames, monet_hidden_size) for the linear layer,
# then restore the sequence dimension.
# NOTE(review): the 200 here hard-codes the number of visual tokens per
# example — the assert below implies visual_embd.shape[1] == 200; presumably
# this should be visual_embd.shape[1] (or a config value) instead of a
# literal. TODO confirm.
vis_embeddings = self.visualLayer(
    visual_embd.view(-1, self.config.monet_hidden_size)
).view(input_shape[0], 200, self.config.hidden_size)
# Sanity check: projected sequence length must match the input's frame count.
assert vis_embeddings.shape[1] == visual_embd.shape[1]
# Optional per-frame positional signal, currently disabled:
# vis_embeddings = vis_embeddings + self.frame_embeddings(torch.tensor(self.frames).unsqueeze(0).to(inputs_embeds.device))
# Concatenate text token embeddings and visual embeddings along the
# sequence dimension: (batch, text_len + num_frames, hidden_size).
embeddings = torch.cat((inputs_embeds, vis_embeddings), dim=1)
# token_type_ids is assumed to already cover the concatenated (text + visual)
# sequence length — TODO confirm against the caller.
embeddings = embeddings + self.token_type_embeddings(token_type_ids)
# Prepend the [CLS]-style embedding; final length is 1 + text_len + num_frames.
embeddings = torch.cat((cls_embedding, embeddings), dim=1)
if self.position_embedding_type == "absolute":
    # print('error')
    # position_ids is assumed to span the full concatenated sequence
    # (including the prepended CLS slot) — TODO confirm.
    position_embeddings = self.position_embeddings(position_ids)
    # print(position_embeddings.shape, embeddings.shape)
    # assert position_embeddings.shape==embeddings.shape
    embeddings += position_embeddings
# Standard BERT-style epilogue: LayerNorm then dropout.
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
Can this approach be replicated in T5? Note that T5 uses relative position biases inside attention rather than absolute position embeddings added at the input, so the position-embedding step would need to be adapted.