I'm trying to train a deep learning model in an Azure Notebook that uses the GPU of a DSVM (Ubuntu 18.04) running on a Standard NC6 instance (6 vCPUs, 56 GiB memory), and I am getting the following error:
RuntimeError: CUDA out of memory. Tried to allocate 64.00 MiB (GPU 0; 11.17 GiB total capacity; 10.76 GiB already allocated; 50.31 MiB free; 10.84 GiB reserved in total by PyTorch)
I've searched for this error and couldn't find a solution in any of the related questions on the web. The '10.84 GiB reserved in total by PyTorch' part of the error message caught my attention: can this be configured to a lower value? I would appreciate any suggestions. Thank you.
This is my code for fine-tuning/training:
# Fine-tuning loop with gradient accumulation: each article goes through its
# own forward/backward pass, and the optimizer/scheduler step once every
# BATCH_SIZE articles.
#
# Relies on names set up earlier in the notebook: EPOCHS, BATCH_SIZE,
# article_loader, tokenizer, model, device, optimizer, scheduler, and the
# running counters train_sum_loss, article_count and batch_count.
for epoch in range(EPOCHS):
    # Wrap the loader itself rather than the enumerate object so the progress
    # bar can show a total when the loader defines __len__.
    for idx, article in enumerate(tqdm_notebook(article_loader)):
        # NOTE(review): max_length=1024 presumably truncates to the model's
        # context size -- confirm for the installed transformers version.
        # Lowering it is the most direct way to shrink activation memory.
        article_tens = (
            torch.tensor(tokenizer.encode(article[0], max_length=1024))
            .unsqueeze(0)
            .to(device)
        )

        # Keep only the loss (first output). Binding the whole output tuple or
        # the logits to a name would keep those tensors referenced until the
        # name is rebound on the next iteration, i.e. while the next forward
        # pass is already allocating memory.
        train_loss = model(article_tens, labels=article_tens)[0]
        train_loss.backward()

        # Accumulate a detached copy so the running sum never holds on to the
        # autograd graph.
        train_sum_loss = train_sum_loss + train_loss.detach()

        iteration_count = idx
        article_count = article_count + 1

        # NOTE(review): gradients are summed (not averaged) over the
        # BATCH_SIZE accumulated articles; divide the loss by BATCH_SIZE
        # before backward() if an averaged gradient is intended.
        if article_count == BATCH_SIZE:
            article_count = 0
            batch_count += 1
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            model.zero_grad()
Full stack trace of the error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-11-2c74a22e42f7> in <module>
20 article_tens = torch.tensor(tokenizer.encode(article[0], max_length=1024)).unsqueeze(0).to(device)
21
---> 22 outputs = model(article_tens, labels=article_tens)
23
24 train_loss, prediction_scores = outputs[:2]
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/transformers/modeling_gpt2.py in forward(self, input_ids, past, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, use_cache)
602 head_mask=head_mask,
603 inputs_embeds=inputs_embeds,
--> 604 use_cache=use_cache,
605 )
606 hidden_states = transformer_outputs[0]
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/transformers/modeling_gpt2.py in forward(self, input_ids, past, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, use_cache)
486 attention_mask=attention_mask,
487 head_mask=head_mask[i],
--> 488 use_cache=use_cache,
489 )
490
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/transformers/modeling_gpt2.py in forward(self, x, layer_past, attention_mask, head_mask, use_cache)
240
241 x = x + a
--> 242 m = self.mlp(self.ln_2(x))
243 x = x + m
244
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/transformers/modeling_gpt2.py in forward(self, x)
215
216 def forward(self, x):
--> 217 h = self.act(self.c_fc(x))
218 h2 = self.c_proj(h)
219 return self.dropout(h2)
/anaconda/envs/py37_pytorch/lib/python3.7/site-packages/transformers/activations.py in gelu_new(x)
27 Also see https://arxiv.org/abs/1606.08415
28 """
---> 29 return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
30
31
RuntimeError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 11.17 GiB total capacity; 10.74 GiB already allocated; 320.00 KiB free; 10.89 GiB reserved in total by PyTorch)