I'm distilling a RoBERTa model that has an adapter. I'm following this tutorial,
and in the function distill_roberta_weights()
I only changed teacher_model.config.to_dict()
to student.load_state_dict(teacher.state_dict(), strict=False)
so that the student model has the adapter too.
But when I train the distilled model using the
DistillationTrainer
from here,
I get the following error.
Do you have any idea what the problem is? In outputs_student, the loss is a generator instead of a tensor. The cross-entropy part does not have any problem, because it only uses the logits from the outputs.
EDIT:
I am adding more information
def distill_weights(teacher, student):
    """
    Copy the weights of (teacher) into (student), walking both module trees in parallel.

    Meant to be called first on a RobertaFor... model; it then calls itself on the
    paired children of that model. Every part is copied through its state_dict,
    except the encoder, where student layer i receives the weights of teacher layer 2*i.
    """
    # A whole RoBERTa model (bare, or a RobertaFor... wrapper): descend into paired children
    if isinstance(teacher, RobertaModel) or type(teacher).__name__.startswith('RobertaFor'):
        for teacher_child, student_child in zip(teacher.children(), student.children()):
            distill_weights(teacher_child, student_child)
        return

    # An encoder: its first child is iterated as the layer stack; copy every other teacher layer
    if isinstance(teacher, RobertaEncoder):
        teacher_layers = list(next(teacher.children()))
        student_layers = list(next(student.children()))
        for position, student_layer in enumerate(student_layers):
            student_layer.load_state_dict(teacher_layers[2 * position].state_dict())
        return

    # Anything else (a head, embeddings, ...): copy the state_dict, tolerating key mismatches
    student.load_state_dict(teacher.state_dict(), strict=False)
def distill_roberta_based(teacher_model):
    """
    Build a student from a RoBERTa (teacher_model), the way DistilBERT is built from BERT.

    The student is an instance of the teacher's own class, created from the teacher's
    configuration with the number of hidden layers floor-divided by 2. Its weights are
    then initialized from the teacher by distill_weights, and the student is returned.
    """
    # Derive the student configuration from the teacher's, with half the hidden layers
    student_settings = teacher_model.config.to_dict()
    student_settings['num_hidden_layers'] = student_settings['num_hidden_layers'] // 2
    student_config = RobertaConfig.from_dict(student_settings)

    # Instantiate the student with the teacher's class, then copy the teacher weights into it
    student_model = type(teacher_model)(student_config)
    distill_weights(teacher=teacher_model, student=student_model)
    return student_model
# Trainer subclass used to train the distilled (student) model
class DistillationTrainer(Trainer):
    """
    Trainer that fits a student model against a fixed teacher model.

    The training loss mixes the student's own loss with a soft-target
    cross-entropy between the teacher's and the student's temperature-scaled
    outputs. `temperature` and `alpha` are read from `self.args`.
    """

    def __init__(self, *args, teacher_model=None, **kwargs):
        """
        Set up the regular Trainer, then attach the (teacher_model).

        All positional and keyword arguments other than (teacher_model) are
        forwarded unchanged to the parent constructor.
        """
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        # place teacher on same device as student
        self._move_model_to_device(self.teacher, self.model.device)
        # the teacher is only used for inference
        self.teacher.eval()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        The distillation loss for distilling a BERT-like model.

        Returns `alpha * student_loss + (1 - alpha) * soft_target_loss`, where
        `soft_target_loss` is the cross-entropy between the student logits and
        the teacher's softmax distribution, both divided by `self.args.temperature`.

        (num_items_in_batch) is accepted only so that Trainer versions which pass
        it can call this override; it is not used here.

        Returns the loss, or the tuple (loss, outputs_student) if (return_outputs) is set.
        Raises TypeError if the student's output does not carry a tensor loss.
        """
        outputs_student = model(**inputs)
        student_loss = outputs_student.loss
        # Fail early with a readable message instead of an opaque error on the
        # weighted sum below when the loss is None or some other non-tensor object.
        if not torch.is_tensor(student_loss):
            raise TypeError(
                "The student model did not return a tensor loss "
                f"(got {type(student_loss).__name__}); check that the batch contains "
                "labels and that the student's head computes a loss."
            )

        # compute teacher output without tracking gradients
        with torch.no_grad():
            outputs_teacher = self.teacher(**inputs)

        # student and teacher must produce logits of the same shape
        assert outputs_student.logits.size() == outputs_teacher.logits.size(), \
            "student and teacher logits differ in size"

        # Soft-target loss. CrossEntropyLoss applies log-softmax to its input itself,
        # so the student side is passed as temperature-scaled logits (not probabilities);
        # the teacher side is a probability distribution used as the target.
        temperature = self.args.temperature
        loss_function = CrossEntropyLoss()
        teacher_probabilities = F.softmax(outputs_teacher.logits / temperature, dim=-1)
        loss_logits = loss_function(outputs_student.logits / temperature, teacher_probabilities)

        # Return weighted student loss
        loss = self.args.alpha * student_loss + (1. - self.args.alpha) * loss_logits
        return (loss, outputs_student) if return_outputs else loss
# Build the student model from the teacher (see distill_roberta_based)
student_model_adapter = distill_roberta_based(teacher_model)
# Activate the adapter named 'parallel' on the student and put it in adapter-training mode.
# NOTE(review): presumably train_adapter leaves only the adapter weights trainable — confirm against the adapter library.
student_model_adapter.set_active_adapters('parallel')
student_model_adapter.train_adapter('parallel')
# Create the distillation trainer: the student is the model being trained, the teacher is passed by keyword.
trainer = DistillationTrainer(
student_model_adapter,
training_args,
teacher_model=teacher_model,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation"],
data_collator=data_collator,
tokenizer=tokenizer,
compute_metrics=compute_metrics,
)
# Overwrite the private `_n_gpu` field of the training arguments with 4.
# NOTE(review): presumably this makes the Trainer behave as a 4-GPU run — confirm it is needed and matches the hardware.
trainer.args._n_gpu = 4
So the desired value of outputs_student
should look like
SequenceClassifierOutput(loss=tensor([0.6899, 0.6902, 0.6926, 0.6913, 0.6906, 0.6904, 0.6922, 0.6917],
device='cuda:0', grad_fn=<GatherBackward>), logits=tensor([[-1.2512e-03, -9.7885e-03],
[ 6.2714e-03, -5.7755e-03],.....])
But instead the output is
SequenceClassifierOutput(loss=<generator object gather.<locals>.gather_map.<locals>.<genexpr> at 0x7f5bb4fbe9d0>, logits=tensor([[-0.0150, 0.0075],
[-0.0122, 0.0181],...