The CodeBert model always generates the same output


I am trying to use a CodeBert model followed by a CNN for multi-class classification of vulnerabilities in source code. To start, I tested the outputs of the CodeBert model on a small example.

import torch
from transformers import AutoModel, AutoTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "microsoft/codebert-base"
codebert_model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
codebert_model.to(device)

batch_texts = ["Example ", "OtherExample", "This is another example "]
batch_encoded = tokenizer.batch_encode_plus(batch_texts, padding=True, truncation=True, return_tensors="pt")
prova1 = batch_encoded["input_ids"]
prova2 = batch_encoded["attention_mask"]
output_prova = codebert_model(input_ids=prova1.to(device), attention_mask=prova2.to(device)).last_hidden_state
print(output_prova)

This is the output:

tensor([[[-0.1492,  0.3261,  0.0464,  ..., -0.2080, -0.3364,  0.3195],
         [-0.3576,  0.2826,  0.3289,  ..., -0.0565, -0.7721,  0.1333],
         [ 0.2045, -0.3244,  0.2059,  ...,  0.0232, -0.5977,  0.0806],
         ...,
         [ 0.1698,  0.1308,  0.3582,  ..., -0.2559, -0.0660,  0.3568],
         [ 0.1698,  0.1308,  0.3582,  ..., -0.2559, -0.0660,  0.3568],
         [ 0.1698,  0.1308,  0.3582,  ..., -0.2559, -0.0660,  0.3568]],

        [[-0.1296,  0.3715,  0.0631,  ..., -0.1635, -0.2994,  0.3208],
         [-0.1772,  0.9282,  0.4182,  ...,  0.0147, -0.2448,  0.2903],
         [-0.1238,  0.4330,  0.2896,  ..., -0.0967, -0.6715,  0.5703],
         ...,
         [-0.4116,  0.2910,  0.2929,  ..., -0.6710, -0.2761,  0.3889],
         [-0.4116,  0.2910,  0.2929,  ..., -0.6710, -0.2761,  0.3889],
         [-0.4116,  0.2910,  0.2929,  ..., -0.6710, -0.2761,  0.3889]],

        [[-0.1394,  0.3415,  0.0495,  ..., -0.1913, -0.3411,  0.2758],
         [-0.2208,  0.6902,  0.5890,  ..., -0.1302, -0.5507,  0.3460],
         [-0.1908,  0.7800,  0.4150,  ..., -0.5183, -0.5432,  0.1034],
         ...,
         [-0.1698,  0.2102,  0.5444,  ..., -0.0313, -0.7477,  0.4945],
         [ 0.0994,  0.0209,  0.2717,  ..., -0.0929, -0.6063,  0.3174],
         [-0.1391,  0.3422,  0.0502,  ..., -0.1915, -0.3420,  0.2762]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)
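
Note that the repeated trailing rows within each example are expected here: they are the padding positions, and all padding tokens in a sequence produce the same hidden state. This can be checked against the attention mask (a quick check, using the variables defined above):

# Padding positions (attention_mask == 0) all yield the same hidden state,
# which explains the repeated trailing rows within each example.
for i, text in enumerate(batch_texts):
    n_real = prova2[i].sum().item()
    print(repr(text), "->", n_real, "real tokens out of", prova2.shape[1])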

I then built the full model by placing the CodeBert encoder in front of my CNN:

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

class MyCNN(nn.Module):
    def __init__(self, codebert_model, input_size, output_size):
        super(MyCNN, self).__init__()

        # CodeBert encoder
        self.codebert = codebert_model

        # 1D convolution layers over the token dimension
        self.conv1 = nn.Conv1d(in_channels=input_size, out_channels=200, kernel_size=3)
        self.conv2 = nn.Conv1d(in_channels=input_size, out_channels=200, kernel_size=4)
        self.conv3 = nn.Conv1d(in_channels=input_size, out_channels=200, kernel_size=5)

        # Dropout for regularization
        self.dropout = nn.Dropout(0.5)

        # Fully connected classification head
        self.fc1 = nn.Linear(200 * 3, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, output_size)  # 12 CWE classes

    def forward(self, x, attention_mask):
        # Extract the CodeBert embeddings: [batch, seq_len, hidden]
        codebert_output = self.codebert(x, attention_mask=attention_mask).last_hidden_state
        # Conv1d expects [batch, channels, seq_len]
        x = codebert_output.permute(0, 2, 1)

        # CNN part: three parallel convolutions with different kernel sizes
        x1 = F.relu(self.conv1(x))
        x2 = F.relu(self.conv2(x))
        x3 = F.relu(self.conv3(x))

        # Max pooling over the full sequence length
        x1 = F.max_pool1d(x1, x1.shape[2])
        x2 = F.max_pool1d(x2, x2.shape[2])
        x3 = F.max_pool1d(x3, x3.shape[2])

        # Concatenate the pooled feature maps and flatten
        x = torch.cat([x1, x2, x3], dim=1)
        x = x.flatten(1)

        x = self.dropout(x)

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
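
As a quick sanity check on the example batch from the top of the question (assuming input_size is the CodeBert hidden size, 768, and 12 CWE classes):

# Minimal shape sanity check using prova1/prova2 from the example above.
model = MyCNN(codebert_model, input_size=768, output_size=12).to(device)
with torch.no_grad():
    logits = model(prova1.to(device), attention_mask=prova2.to(device))
print(logits.shape)  # torch.Size([3, 12])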

I built a Dataset/DataLoader pipeline that returns the source code and the label associated with the CWE, and then I wrote the training loop.
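
CustomDataset is essentially a thin wrapper that hands back one (code, label) pair per index; a simplified sketch of it:

from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, samples):
        self.samples = samples  # list of (source_code, cwe_label) pairs

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        code, label = self.samples[idx]
        return code, label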

import time
from torch.utils.data import DataLoader

print('Training started.....')
model.to(device)
EPOCHS = 20
dim_batch = 4
BEST_VAL_ACC = 0
timer = time.time()
for e in range(EPOCHS):
    model.train()  # re-enable training mode (model.eval() is set during validation below)
    running_acc = 0
    running_loss = 0
    train, val = split_training(train_encodings_119, train_encodings_20, train_encodings_787, train_encodings_125, train_encodings_416, train_encodings_399, train_encodings_200, train_encodings_476, train_encodings_190, train_encodings_264, train_encodings_189, train_encodings_Other)
    train_data = CustomDataset(train)
    val_data = CustomDataset(val)
    train_iterator = DataLoader(train_data, batch_size=dim_batch, shuffle=False)
    valid_iterator = DataLoader(val_data, batch_size=dim_batch, shuffle=False)
    for batch in train_iterator:
        code, target = batch
        batch_encoded = tokenizer.batch_encode_plus(code, padding="max_length", truncation=True, return_tensors="pt", max_length=500)
        input_ids = batch_encoded["input_ids"].to(device)
        attention_mask = batch_encoded["attention_mask"].to(device)
        target = target.long().to(device)
        optimizer.zero_grad()
        output = model(input_ids, attention_mask=attention_mask)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        acc = multiclass_accuracy(output, target)
        running_acc += acc
        running_loss += loss.item()
    with torch.no_grad():
        model.eval()
        running_acc_val = 0
        running_loss_val = 0
        for batch in valid_iterator:
            code, target = batch
            batch_encoded = tokenizer.batch_encode_plus(code, padding="max_length", truncation=True, return_tensors="pt", max_length=500)
            input_ids = batch_encoded["input_ids"].to(device)
            attention_mask = batch_encoded["attention_mask"].to(device)
            target = target.long().to(device)
            output_val = model(input_ids, attention_mask=attention_mask)
            loss_val = criterion(output_val, target)
            acc_val = multiclass_accuracy(output_val, target)
            running_acc_val += acc_val
            running_loss_val += loss_val.item()
    print_out = "Epoch %d - Training acc: %.4f - Training loss: %.4f - Val acc: %.4f - Val loss: %.4f - Time: %.4fs \n" % (
        e + 1,
        running_acc / len(train_iterator),
        running_loss / len(train_iterator),
        running_acc_val / len(valid_iterator),
        running_loss_val / len(valid_iterator),
        time.time() - timer)
    print(print_out)
    if running_acc_val / len(valid_iterator) > BEST_VAL_ACC:
        BEST_VAL_ACC = running_acc_val / len(valid_iterator)
        model_save_path = "Encodings/model-multiclass.pth"
        torch.save(model.state_dict(), model_save_path)

print('Training completed!')
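
The pieces referenced above but not shown (criterion, optimizer, multiclass_accuracy) are defined roughly like this (a sketch; the exact hyperparameter values are illustrative, not necessarily what I used):

import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)  # illustrative value, not necessarily the one used

def multiclass_accuracy(output, target):
    # Fraction of correct top-1 predictions in the batch
    preds = output.argmax(dim=1)
    return (preds == target).float().mean().item()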

The results are quite strange, and the strangest part is that the CodeBert model now seems to always output the same values. If I re-run the example from the beginning of the question after training, it generates three tensors that are all identical to each other (and every row within them is identical as well). This is the output of that same example code after running the training loop:

tensor([[[ 0.0545,  0.1427, -0.0886,  ..., -0.1833,  0.0855,  0.0111],
         [ 0.0545,  0.1427, -0.0886,  ..., -0.1833,  0.0855,  0.0111],
         [ 0.0545,  0.1427, -0.0886,  ..., -0.1833,  0.0855,  0.0111],
         ...,
         [ 0.0545,  0.1427, -0.0886,  ..., -0.1833,  0.0855,  0.0111],
         [ 0.0545,  0.1427, -0.0886,  ..., -0.1833,  0.0855,  0.0111],
         [ 0.0545,  0.1427, -0.0886,  ..., -0.1833,  0.0855,  0.0111]],

        [[ 0.0545,  0.1427, -0.0886,  ..., -0.1833,  0.0855,  0.0111],
         [ 0.0545,  0.1427, -0.0886,  ..., -0.1833,  0.0855,  0.0111],
         [ 0.0545,  0.1427, -0.0886,  ..., -0.1833,  0.0855,  0.0111],
         ...,
         [ 0.0545,  0.1427, -0.0886,  ..., -0.1833,  0.0855,  0.0111],
         [ 0.0545,  0.1427, -0.0886,  ..., -0.1833,  0.0855,  0.0111],
         [ 0.0545,  0.1427, -0.0886,  ..., -0.1833,  0.0855,  0.0111]],

        [[ 0.0545,  0.1427, -0.0886,  ..., -0.1833,  0.0855,  0.0111],
         [ 0.0545,  0.1427, -0.0886,  ..., -0.1833,  0.0855,  0.0111],
         [ 0.0545,  0.1427, -0.0886,  ..., -0.1833,  0.0855,  0.0111],
         ...,
         [ 0.0545,  0.1427, -0.0886,  ..., -0.1833,  0.0855,  0.0111],
         [ 0.0545,  0.1427, -0.0886,  ..., -0.1833,  0.0855,  0.0111],
         [ 0.0545,  0.1427, -0.0886,  ..., -0.1833,  0.0855,  0.0111]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)
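
A further check along these lines (a diagnostic sketch) suggests the collapse sits in the fine-tuned weights rather than in the inputs, since a freshly loaded copy of the encoder still produces varied outputs on the same batch:

# Diagnostic: compare the trained encoder against a freshly loaded copy.
fresh = AutoModel.from_pretrained("microsoft/codebert-base").to(device)
with torch.no_grad():
    out_trained = model.codebert(prova1.to(device), attention_mask=prova2.to(device)).last_hidden_state
    out_fresh = fresh(prova1.to(device), attention_mask=prova2.to(device)).last_hidden_state
# Variance across token positions: ~0 for the collapsed encoder, clearly > 0 for the fresh one
print(out_trained.var(dim=1).mean().item(), out_fresh.var(dim=1).mean().item())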

I don't understand where the problem is: the input_ids and attention_mask have the correct sizes and actually change from batch to batch, yet the encoder output always stays the same. Can someone help me? I've tried changing everything, but I just can't figure it out.
