I have this PyTorch Lightning code. It runs distributed training across 3 nodes with a fixed batch size and some metric logging.
import multiprocessing

import torch
import torch.nn.functional as F
import torchmetrics
import pytorch_lightning as pl
from torchvision.models import resnet18
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import DeviceStatsMonitor, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.profilers import PyTorchProfiler


class Resnet18Model(pl.LightningModule):
    def __init__(self, num_classes=10):
        super().__init__()
        self.model = resnet18(weights=None, num_classes=num_classes)
        self.train_acc = torchmetrics.classification.Accuracy(task="multiclass", num_classes=num_classes)
        self.valid_acc = torchmetrics.classification.Accuracy(task="multiclass", num_classes=num_classes)

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        images, labels = batch
        out = self(images)
        loss = F.cross_entropy(out, labels)
        self.train_acc(out, labels)  # update the metric before logging it
        self.log('train_acc', self.train_acc, on_step=True, on_epoch=False)
        return loss

    def validation_step(self, batch, batch_idx):
        images, labels = batch
        out = self(images)
        self.valid_acc(out, labels)
        self.log('valid_acc', self.valid_acc, on_step=False, on_epoch=True)

    ....
def main():
    model = Resnet18Model()
    early_stopping = EarlyStopping(monitor="valid_acc", mode="max", stopping_threshold=0.80, patience=3)
    gpus_available = torch.cuda.is_available()
    if gpus_available:
        devices = torch.cuda.device_count()
        accelerator = "gpu"
    else:
        devices = multiprocessing.cpu_count()
        accelerator = "cpu"
    exp_name = f"resnet18_cifar10_lightning_{accelerator}_{args.batch_size}"
    logger = TensorBoardLogger("logs/tb_logs", name=exp_name)
    profiler = PyTorchProfiler(dirpath="logs/profiler_logs", name=exp_name,
                               # emit_nvtx=True,
                               export_to_chrome=True)
    trainer = Trainer(
        max_epochs=args.epochs,
        accelerator=accelerator,
        devices=devices,
        strategy="ddp",
        logger=logger,
        enable_progress_bar=True,
        profiler=profiler,
        num_nodes=3,
        log_every_n_steps=10,
        callbacks=[DeviceStatsMonitor(cpu_stats=True), early_stopping]
    )
    trainer.fit(model)
I want to add a ThroughputMonitor callback so that its metrics are logged to TensorBoard alongside the existing ones, but I can't figure out how. I read the docs and tried several variations, but none of them worked.
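To make the question concrete, this is roughly the kind of wiring I have been trying. The import path and the batch_size_fn lambda are my own guesses from the docs, so treat this as a sketch rather than working code:

    from pytorch_lightning.callbacks import ThroughputMonitor

    # my batches are (images, labels) tuples, so take the batch size from the image tensor
    throughput = ThroughputMonitor(batch_size_fn=lambda batch: batch[0].size(0))

    trainer = Trainer(
        max_epochs=args.epochs,
        accelerator=accelerator,
        devices=devices,
        strategy="ddp",
        logger=logger,
        profiler=profiler,
        num_nodes=3,
        log_every_n_steps=10,
        callbacks=[DeviceStatsMonitor(cpu_stats=True), early_stopping, throughput]
    )

Is adding it to callbacks like this enough for its metrics to reach the TensorBoardLogger, or does it need extra logging calls inside the LightningModule?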
Best