SequenceClassifierOutput has a generator as loss instead of a tensor


I'm doing distillation from a RoBERTa model with an adapter, following this tutorial, and in the function distill_roberta_weights() I just changed teacher_model.config.to_dict() to student.load_state_dict(teacher.state_dict(), strict=False), so that the student model has the adapter too.
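For context, the teacher is set up roughly like this. This is only a sketch: I'm assuming the adapter-transformers fork of Hugging Face Transformers, the checkpoint name is a placeholder, and the only real details are that the adapter is named 'parallel' (the name I activate on the student below) and that the head has two labels.

from transformers import RobertaForSequenceClassification

# Rough sketch of the teacher setup (assumption, not the exact code):
# a RoBERTa classifier with an adapter named 'parallel' added via the
# adapter-transformers fork.
teacher_model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=2
)
teacher_model.add_adapter("parallel")    # add the adapter modules
teacher_model.train_adapter("parallel")  # freeze the backbone, train the adapter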

But when I train the distillation using the DistillationTrainer from here, I get the following error (shown in a screenshot that is not reproduced here).

Do you have any idea what the problem is? The student output has a generator as its loss instead of a tensor; the cross-entropy part has no problem, since it uses the logits from the outputs.
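To make the symptom concrete, here is a minimal check I can drop into compute_loss() (shown in the edit below) right after the forward pass; it only inspects the type of the loss:

import torch

# Minimal check right after outputs_student = model(**inputs):
# with the expected output the loss is a tensor, but in my run it is a generator.
print(torch.is_tensor(outputs_student.loss))  # expected True, but I get False
print(type(outputs_student.loss))             # <class 'generator'> in my run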

EDIT:

I am adding more information

# imports used by the code below
import torch
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from transformers import RobertaConfig, Trainer
from transformers.models.roberta.modeling_roberta import RobertaEncoder, RobertaModel


def distill_weights(teacher, student):
    """
    Recursively copies the weights of the (teacher) to the (student).
    This function is meant to be first called on a RobertaFor... model, but is then called on every child of that model recursively.
    The only part that's not fully copied is the encoder, of which only half is copied.
    """
    # If the part is an entire RoBERTa model or a RobertaFor..., unpack and iterate
    if isinstance(teacher, RobertaModel) or type(teacher).__name__.startswith('RobertaFor'):
        for teacher_part, student_part in zip(teacher.children(), student.children()):
            distill_weights(teacher_part, student_part)
    # Else if the part is an encoder, copy one out of every two layers
    elif isinstance(teacher, RobertaEncoder):
        teacher_encoding_layers = [layer for layer in next(teacher.children())]
        student_encoding_layers = [layer for layer in next(student.children())]
        for i in range(len(student_encoding_layers)):
            student_encoding_layers[i].load_state_dict(teacher_encoding_layers[2 * i].state_dict())
    # Else the part is a head or something else, copy the state_dict
    # (strict=False so that adapter modules don't have to match exactly)
    else:
        student.load_state_dict(teacher.state_dict(), strict=False)


def distill_roberta_based(teacher_model):
    """
    Distills a RoBERTa model (teacher_model) the way DistilBERT is distilled from a BERT model.
    The student model has the same configuration, except for the number of hidden layers, which is halved.
    The student layers are initialized by copying one out of two layers of the teacher, starting with layer 0.
    The head of the teacher is also copied.
    """
    # Set the student configuration
    configuration = teacher_model.config.to_dict()
    configuration['num_hidden_layers'] //= 2
    configuration = RobertaConfig.from_dict(configuration)

    # Create the student model and copy the teacher's weights into it
    student_model = type(teacher_model)(configuration)
    distill_weights(teacher=teacher_model, student=student_model)

    return student_model
# Trainer for training the distilled model
class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, **kwargs):
        super().__init__(*args, **kwargs)

        self.teacher = teacher_model
        # place the teacher on the same device as the student
        self._move_model_to_device(self.teacher, self.model.device)
        self.teacher.eval()

    def compute_loss(self, model, inputs, return_outputs=False):
        """
        The distillation loss for distilling a BERT-like model.
        The loss takes the (teacher_logits), (student_logits) and (labels) for the various losses.
        The (temperature) can be given, otherwise it's set to 1 by default.
        """
        outputs_student = model(**inputs)
        print(outputs_student)
        student_loss = outputs_student.loss

        # compute teacher output
        with torch.no_grad():
            outputs_teacher = self.teacher(**inputs)

        # assert size
        assert outputs_student.logits.size() == outputs_teacher.logits.size()

        # Classification loss (problem-specific loss)
        loss_function = CrossEntropyLoss()

        # Temperature and softmax
        student_logits = F.softmax(outputs_student.logits / self.args.temperature, dim=-1)
        teacher_logits = F.softmax(outputs_teacher.logits / self.args.temperature, dim=-1)
        loss_logits = loss_function(student_logits, teacher_logits)

        # Return weighted student loss
        loss = self.args.alpha * student_loss + (1. - self.args.alpha) * loss_logits
        return (loss, outputs_student) if return_outputs else loss

# create the student
student_model_adapter = distill_roberta_based(teacher_model)
# activate the adapter on the student
student_model_adapter.set_active_adapters('parallel')
student_model_adapter.train_adapter('parallel')

trainer = DistillationTrainer(
    student_model_adapter,
    training_args,
    teacher_model=teacher_model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.args._n_gpu = 4  # train on 4 GPUs

So the desired output of outputs_student should look something like

SequenceClassifierOutput(loss=tensor([0.6899, 0.6902, 0.6926, 0.6913, 0.6906, 0.6904, 0.6922, 0.6917],
       device='cuda:0', grad_fn=<GatherBackward>), logits=tensor([[-1.2512e-03, -9.7885e-03],
        [ 6.2714e-03, -5.7755e-03],.....])

But instead the output is

SequenceClassifierOutput(loss=<generator object gather.<locals>.gather_map.<locals>.<genexpr> at 0x7f5bb4fbe9d0>, logits=tensor([[-0.0150,  0.0075],
        [-0.0122,  0.0181],...
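The only workaround I can think of, inside compute_loss(), is to materialize the generator by hand, assuming it yields the per-replica loss tensors from the multi-GPU gather. This is only a guess on my part, and it doesn't explain why the gather returns a generator in the first place:

# Possible workaround inside compute_loss() (assumption: the generator yields
# the per-replica loss tensors gathered across the 4 GPUs). Only a guess; I'd
# still like to understand why the loss comes back as a generator at all.
student_loss = outputs_student.loss
if not torch.is_tensor(student_loss):
    student_loss = torch.stack(list(student_loss))
student_loss = student_loss.mean()  # reduce to a scalar before the weighted sum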