PyTorch RAM is not freed after each epoch


I am training a multi-label text classifier with PyTorch, using RoBERTa as the encoder. During the second epoch, RAM fills up and the kernel crashes. I checked: RAM is not freed after each epoch. I have 64 GB of RAM and 8 CPU cores.
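To confirm that memory grows across epochs rather than within one, you can log the resident set size of the training process at every epoch boundary. A minimal sketch, assuming psutil is installed (the callback name is illustrative, and the hook signature is the one used by recent Lightning versions):

import psutil
import pytorch_lightning as pl

class MemoryMonitor(pl.Callback):
    # Print the process RSS at the end of each training epoch.
    def on_train_epoch_end(self, trainer, pl_module):
        rss_gb = psutil.Process().memory_info().rss / 1024 ** 3
        print(f"epoch {trainer.current_epoch}: RSS = {rss_gb:.2f} GB")

# Registered when building the trainer, e.g.:
# trainer = pl.Trainer(callbacks=[MemoryMonitor()])

In my case the RSS keeps climbing and never drops back once an epoch finishes.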
What could be the problem? Here is my PyTorch implementation:

import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from torchmetrics.functional import auroc
from transformers import (AdamW, RobertaModel, RobertaTokenizer,
                          get_linear_schedule_with_warmup)


class ReaderTextDataset(Dataset):

    def __init__(self,
                 data: pd.DataFrame,
                 tokenizer: RobertaTokenizer,
                 max_token_len: int = 512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]

        readerText = data_row.readerText
        labels = data_row[LABEL_COLUMNS]

        encoding = self.tokenizer.encode_plus(
            readerText,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return dict(readerText=readerText,
                    input_ids=encoding["input_ids"].flatten(),
                    attention_mask=encoding["attention_mask"].flatten(),
                    labels=torch.FloatTensor(labels))

train_dataset = ReaderTextDataset(train_df, tokenizer, max_token_len=512)

roberta_model = RobertaModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)

NUM_WORKERS = 6

class ReaderTextDataModule(pl.LightningDataModule):

    def __init__(self,
                 train_df,
                 test_df,
                 tokenizer,
                 batch_size=8,
                 max_token_len=512):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def setup(self, stage=None):
        self.train_dataset = ReaderTextDataset(self.train_df, self.tokenizer,
                                               self.max_token_len)

        self.test_dataset = ReaderTextDataset(self.test_df, self.tokenizer,
                                              self.max_token_len)

    def train_dataloader(self):
        return DataLoader(self.train_dataset,
                          batch_size=self.batch_size,
                          shuffle=True,
                          num_workers=NUM_WORKERS)

    def val_dataloader(self):
        return DataLoader(self.test_dataset,
                          batch_size=self.batch_size,
                          num_workers=NUM_WORKERS)

    def test_dataloader(self):
        return DataLoader(self.test_dataset,
                          batch_size=self.batch_size,
                          num_workers=NUM_WORKERS)

class ReaderTextTagger(pl.LightningModule):

    def __init__(self,
                 n_classes: int,
                 n_training_steps=None,
                 n_warmup_steps=None):
        super().__init__()
        self.bert = RobertaModel.from_pretrained(BERT_MODEL_NAME,
                                                 return_dict=True)
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.criterion = nn.BCELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.bert(input_ids, attention_mask=attention_mask)
        output = self.classifier(output.pooler_output)
        output = torch.sigmoid(output)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": labels}

    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss

    def training_epoch_end(self, outputs):
        labels = []
        predictions = []
        for output in outputs:
            for out_labels in output["labels"].detach().cpu():
                labels.append(out_labels)
            for out_predictions in output["predictions"].detach().cpu():
                predictions.append(out_predictions)

        labels = torch.stack(labels).int()
        predictions = torch.stack(predictions)

        for i, name in enumerate(LABEL_COLUMNS):
            class_roc_auc = auroc(predictions[:, i], labels[:, i])
            self.logger.experiment.add_scalar(f"{name}_roc_auc/Train",
                                              class_roc_auc,
                                              self.current_epoch)

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=1e-5)

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.n_warmup_steps,
            num_training_steps=self.n_training_steps)

        return dict(optimizer=optimizer,
                    lr_scheduler=dict(scheduler=scheduler, interval='step'))


1 Answer


You return outputs, which lives on the GPU, from every training_step. Lightning collects everything returned from training_step and hands it to training_epoch_end, so these tensors accumulate for the entire epoch, and because they are not detached, each one also keeps its autograd graph (and all the intermediate activations it references) alive. Detach them and move them to the CPU if you really need to store outputs:

return {"loss": loss, "predictions": outputs.detach().cpu(), "labels": labels}
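Applied to the training_step from the question, the fix could look like this (a sketch, not the only possible fix; detaching labels as well keeps everything that is cached for training_epoch_end off the GPU):

def training_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("train_loss", loss, prog_bar=True, logger=True)
    # loss keeps its graph so backpropagation still works; the tensors
    # cached for the epoch-end AUROC computation do not need one, so
    # detach them and move them to the CPU before returning.
    return {"loss": loss,
            "predictions": outputs.detach().cpu(),
            "labels": labels.detach().cpu()}

If you do not want to cache predictions at all, an alternative is to update a stateful torchmetrics.AUROC metric inside training_step and compute/reset it in training_epoch_end, so that training_step only has to return the loss.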