I am training a multi-label text classifier using PyTorch with RoBERTa. However, after the 2nd epoch the RAM fills up and the kernel crashes. I checked, and the RAM is not freed after each epoch. I have 64 GB of RAM and 8 CPU cores.
What could be the problem?
Here is my PyTorch implementation:
import pandas as pd
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from torchmetrics.functional import auroc
from transformers import (AdamW, RobertaModel, RobertaTokenizer,
                          get_linear_schedule_with_warmup)


class ReaderTextDataset(Dataset):
    def __init__(self,
                 data: pd.DataFrame,
                 tokenizer: RobertaTokenizer,
                 max_token_len: int = 512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]
        readerText = data_row.readerText
        labels = data_row[LABEL_COLUMNS]
        encoding = self.tokenizer.encode_plus(
            readerText,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return dict(readerText=readerText,
                    input_ids=encoding["input_ids"].flatten(),
                    attention_mask=encoding["attention_mask"].flatten(),
                    labels=torch.FloatTensor(labels))
train_dataset = ReaderTextDataset(train_df, tokenizer, max_token_len=512)
roberta_model = RobertaModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
NUM_WORKERS = 6
class ReaderTextDataModule(pl.LightningDataModule):
    def __init__(self,
                 train_df,
                 test_df,
                 tokenizer,
                 batch_size=8,
                 max_token_len=512):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def setup(self, stage=None):
        self.train_dataset = ReaderTextDataset(self.train_df, self.tokenizer,
                                               self.max_token_len)
        self.test_dataset = ReaderTextDataset(self.test_df, self.tokenizer,
                                              self.max_token_len)

    def train_dataloader(self):
        return DataLoader(self.train_dataset,
                          batch_size=self.batch_size,
                          shuffle=True,
                          num_workers=NUM_WORKERS)

    def val_dataloader(self):
        return DataLoader(self.test_dataset,
                          batch_size=self.batch_size,
                          num_workers=NUM_WORKERS)

    def test_dataloader(self):
        return DataLoader(self.test_dataset,
                          batch_size=self.batch_size,
                          num_workers=NUM_WORKERS)
class ReaderTextTagger(pl.LightningModule):
    def __init__(self,
                 n_classes: int,
                 n_training_steps=None,
                 n_warmup_steps=None):
        super().__init__()
        self.bert = RobertaModel.from_pretrained(BERT_MODEL_NAME,
                                                 return_dict=True)
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.criterion = nn.BCELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.bert(input_ids, attention_mask=attention_mask)
        output = self.classifier(output.pooler_output)
        output = torch.sigmoid(output)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": labels}

    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss

    def training_epoch_end(self, outputs):
        labels = []
        predictions = []
        for output in outputs:
            for out_labels in output["labels"].detach().cpu():
                labels.append(out_labels)
            for out_predictions in output["predictions"].detach().cpu():
                predictions.append(out_predictions)
        labels = torch.stack(labels).int()
        predictions = torch.stack(predictions)
        for i, name in enumerate(LABEL_COLUMNS):
            class_roc_auc = auroc(predictions[:, i], labels[:, i])
            self.logger.experiment.add_scalar(f"{name}_roc_auc/Train",
                                              class_roc_auc,
                                              self.current_epoch)

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=1e-5)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.n_warmup_steps,
            num_training_steps=self.n_training_steps)
        return dict(optimizer=optimizer,
                    lr_scheduler=dict(scheduler=scheduler, interval='step'))
You return output, which stays on the GPU, in every training_step. Because Lightning collects these returned dicts for training_epoch_end, the predictions for the whole epoch accumulate there. If you really need to store output, try moving it to the CPU by detaching it first, e.g. output.detach().cpu().
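A minimal sketch of that change, reusing the training_step from the question (only the returned dict is modified; everything else is as posted):

def training_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("train_loss", loss, prog_bar=True, logger=True)
    # Detach the predictions from the computation graph and move them off the
    # GPU before Lightning caches this dict for training_epoch_end; otherwise
    # every batch's graph and activations stay alive until the epoch ends.
    return {"loss": loss,
            "predictions": outputs.detach().cpu(),
            "labels": labels.detach().cpu()}

With the tensors already detached and on the CPU, the .detach().cpu() calls inside training_epoch_end become no-ops, so the rest of the module can stay as it is.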