I am using AverageMeter to calculate and sum the f1 score but it doesn't calculate correctly after each epoch is finished (or after evaluation). I used torchmetric to calculate the f1core.
from time import time
import torch
import tqdm
from utils import TBLoggerBuilder, setup_logger
import os
import numpy as np
from models.metrics import get_metric
from utils.average_meter import AverageMeter
from logging import Logger
from models.utils import save_model_check_points, save_model, best_model, early_stopping_model, init
from torchmetrics.classification import BinaryConfusionMatrix
from torchmetrics.classification import BinaryStatScores
import pandas as pd
import torchmetrics
def evaluation(conf,
model, criterion, val_loader, device,
inference, debug
):
print('\nValidating...')
confmat = BinaryConfusionMatrix()
stats = BinaryStatScores()
model.eval() # Optional when not using Model Specific layer
y_pred = []
y_true = []
f1_score = torchmetrics.classification.BinaryF1Score()
loss_avg = AverageMeter(name='loss')
f1_avg = AverageMeter(name='f1')
with torch.no_grad():
with tqdm.tqdm(val_loader, unit="batch") as tepoch:
for image, target in tepoch:
# Transfer Data to GPU if available
image = image.to(device)
target = target.to(device)
# out = model(image)['out']
# if unet == true => remove ['out']
out = model(image)
# Calculate Loss
target = target.unsqueeze(1)
target = target.float()
loss = criterion(out, target)
# Calculate Loss
loss_avg.update(loss.item(), image.size(0))
f1 = f1_score(out.cpu(), target.cpu())
f1_avg.update(f1.item())
y_pred.extend(out) # Save Prediction
y_true.extend(target)
tepoch.set_postfix(loss=loss_avg.avg,
f_score=f1_avg.avg)
if debug:
break
if inference:
probs = torch.sigmoid(torch.tensor(y_pred))
predicted_vals = probs > 0.5
ground_truth_vals = [False if i == 0 else True for i in np.array(y_true)]
df_result = pd.DataFrame({
'prediction': np.array(y_pred),
'predicted_probs': np.array(probs),
'ground_truth': np.array(y_true),
'predicted_vals': np.array(predicted_vals),
'ground_truth_vals': np.array(ground_truth_vals),
})
df_result.to_csv('../outputs/results.csv')
y_pred = torch.tensor(y_pred)
y_true = torch.tensor(y_true)
print(f'\nConfusionMatrix: \n{confmat(y_pred, y_true).cpu().numpy()}')
stats_scores = stats(y_pred, y_true)
print(f'\ntp: {stats_scores[0]}, fp: {stats_scores[1]},'
f' tn: {stats_scores[2]}, fn: {stats_scores[3]},'
f' sup: {stats_scores[4]}')
print(f1_score(y_pred,y_true),f1_avg.avg)
return loss_avg.avg, f1_avg.avgF
def train_one_epoch(conf, epoch, model, optimizer, criterion, train_loader, device, debug):
model.train()
# f1_score = get_metric(conf=conf.metric)
f1_score = torchmetrics.classification.BinaryF1Score()
loss_avg = AverageMeter(name='loss')
f1_avg = AverageMeter(name='f1')
with tqdm.tqdm(train_loader, unit="batch") as tepoch:
tepoch.set_description(f"Epoch {epoch}")
for image, target in tepoch:
# Transfer Data to GPU if available
image = image.to(device)
target = target.to(device)
# Forward Pass
# out = model(image)['out']
# if unet == true => remove ['out']
out = model(image)
# Find the Loss
target = target.unsqueeze(1)
target = target.float()
loss = criterion(out, target)
# Calculate Loss
loss_avg.update(loss.item(), image.size(0))
# Clear the gradients
optimizer.zero_grad()
# Calculate gradients
loss.backward()
# Update Weights
optimizer.step()
f1 = f1_score(out.cpu(), target.cpu()).item()
f1_avg.update(f1)
tepoch.set_postfix(loss=loss_avg.avg,
f_score=f1)
if debug:
break
return loss_avg.avg, f1_avg.avg
def fit(conf,
model,
optimizer,
scheduler,
criterion,
train_loader,
val_loader,
start_epoch=0,
device='cpu',
logger: Logger = None):
tb_logger = TBLoggerBuilder(cfg=conf.base).setup()
model_check_point_path = conf.base['model_check_point_path']
best_model_path = conf.base['best_model_path']
epochs, debug, early_stopping, checkpoint_step, best_loss = init(
cfg=conf)
model = model.to(device)
if logger is not None:
logger.info("\nStart training\n")
start_training_time = time()
for e in range(start_epoch, epochs):
start_training_epoch_time = time()
train_losses_avg, train_f_score_avg = train_one_epoch(conf=conf,
epoch=e,
model=model,
optimizer=optimizer,
criterion=criterion,
train_loader=train_loader,
device=device,
debug=debug)
if tb_logger is not None:
tb_logger.log(log_type='criterion/training', value=train_losses_avg, epoch=e)
tb_logger.log(log_type='f_score/training', value=train_f_score_avg, epoch=e)
end_training_epoch_time = time() - start_training_epoch_time
print('\n')
if logger is not None:
logger.info(
f'Training Results - [{end_training_epoch_time:.3f}s] Epoch: {e}:'
f' f_score: {train_f_score_avg:.3f},'
f' Loss: {train_losses_avg:.3f}\n')
# validation step
start_validation_epoch_time = time()
val_losses_avg, val_f_score_avg = evaluation(conf=conf,
model=model,
criterion=criterion,
val_loader=val_loader, device=device,
inference=False,
debug=debug)
if tb_logger is not None:
tb_logger.log(log_type='criterion/validation', value=val_losses_avg, epoch=e)
tb_logger.log(log_type='f_score/validation', value=val_f_score_avg, epoch=e)
end_validation_epoch_time = time() - start_validation_epoch_time
if logger is not None:
print('\n')
logger.info(
f'validation Results - [{end_validation_epoch_time:.3f}s] Epoch: {e}:'
f' f_score: {val_f_score_avg:.3f},'
f' Loss: {val_losses_avg:.3f}\n')
# early stopping
early_stopping, stop = early_stopping_model(epoch=e,
best_loss=best_loss,
val_loss=val_losses_avg,
early_stopping=early_stopping)
if stop:
break
# model check points
checkpoint_step = save_model_check_points(path=model_check_point_path,
checkpoint_step=checkpoint_step,
epoch=e,
model=model,
optimizer=optimizer,
scheduler=scheduler,
loss=criterion,
avg_loss=train_losses_avg)
# apply scheduler
if scheduler:
if conf.scheduler['name'] == 'reduce_lr_on_plateau':
scheduler.step(val_losses_avg)
else:
scheduler.step()
if logger is not None:
logger.info(f"Current learning rate is {optimizer.param_groups[0]['lr']}")
if tb_logger is not None:
tb_logger.log(log_type='lr', value=optimizer.param_groups[0]['lr'], epoch=e)
# save best model
if best_model_path:
best_loss = best_model(model=model, best_loss=best_loss,
val_loss=val_losses_avg, path=best_model_path)
if tb_logger is not None:
tb_logger.flush()
if debug:
break
# end of epocha
end_training_time = time() - start_training_time
print(f'Finished Training after {end_training_time:.3f}s')
if tb_logger is not None:
tb_logger.close()
this is the Averagemeter class
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self, name, fmt=':f'):
self.name = name
self.fmt = fmt
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def __str__(self):
fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
return fmtstr.format(**self.__dict__)
The result is for example:
Validating...
100%|██████████| 65/65 [01:07<00:00, 1.04s/batch, f_score=0.553, loss=0.0522]
ConfusionMatrix:
[[520 5]
[ 14 493]]
tp: 493, fp: 5, tn: 520, fn: 14, sup: 507
tensor(0.9811) 0.5534822574028602
As you see, the result in validation progress bar is f_score=0.553
while the f1score after the end of validation is tensor(0.9811)
So, it seems getting average from sum of f1score in batch loop is not working.
I solve the problem by using
f1_score.compute().item()
. I understand that when we are using torchmetrics, there is a method that compute the metric on all batches using custom accumulation. So, it doesn't need to use AverageMeter to hold the values and compute the average of scores.