I have encountered an issue with the following error message while running my code:

Traceback (most recent call last):
  File "train.py", line 76, in <module>
    model.optimize_parameters()
  File "/root/autodl-tmp/code/pytorch-CycleGAN-and-pix2pix-master/models/pix2pix_model.py", line 284, in optimize_parameters
    self.scaler.step(self.optimizer_G)
  File "/root/miniconda3/lib/python3.8/site-packages/torch/cuda/amp/grad_scaler.py", line 336, in step
    assert len(optimizer_state["found_inf_per_device"]) > 0, "No inf checks were recorded for this optimizer."
AssertionError: No inf checks were recorded for this optimizer.

I have made numerous attempts to resolve this issue, including checking for NaN or Inf values in the output, and using with torch.autograd.detect_anomaly(): to debug, but I couldn't find any errors. Strangely, when using detect_anomaly, no errors are reported, but the problem persists with the same error as before.

I suspect that the problem might be related to the usage of automatic mixed precision (AMP) and torch.cuda.amp.GradScaler() in my code. I am using AMP to speed up training on a GPU. The error occurs during the optimization step (self.scaler.step(self.optimizer_G)) of the Pix2Pix model.

I would greatly appreciate any assistance or insights into resolving this issue, as it has become quite frustrating. If needed, I can provide more code snippets or details about how I'm using AMP and the optimizer in my code.

import torch
from .base_model import BaseModel
from . import networks
from torch.cuda.amp import autocast, GradScaler


class Pix2PixModel(BaseModel):
    """ This class implements the pix2pix model, for learning a mapping from input images to output images given paired data.

    The model training requires '--dataset_mode aligned' dataset.
    By default, it uses a '--netG unet256' U-Net generator,
    a '--netD basic' discriminator (PatchGAN),
    and a '--gan_mode' vanilla GAN loss (the cross-entropy objective used in the orignal GAN paper).

    pix2pix paper: https://arxiv.org/pdf/1611.07004.pdf
    """
    @staticmethod
    def modify_commandline_options(parser, is_train=True):
        """Add new dataset-specific options, and rewrite default values for existing options.

        Parameters:
            parser          -- original option parser
            is_train (bool) -- whether training phase or test phase. You can use this flag to add training-specific or test-specific options.

        Returns:
            the modified parser.

        For pix2pix, we do not use image buffer
        The training objective is: GAN Loss + lambda_L1 * ||G(A)-B||_1
        By default, we use vanilla GAN loss, UNet with batchnorm, and aligned datasets.
        """
        # changing the default values to match the pix2pix paper (https://phillipi.github.io/pix2pix/)
        parser.set_defaults(norm='batch', netG='unet_256', dataset_mode='semi')
        if is_train:
            parser.set_defaults(pool_size=0, gan_mode='vanilla')
            parser.add_argument('--lambda_L1', type=float, default=100.0, help='weight for L1 loss')

        return parser

    def __init__(self, opt):
        """Initialize the pix2pix class.

        Parameters:
            opt (Option class)-- stores all the experiment flags; needs to be a subclass of BaseOptions
        """
        BaseModel.__init__(self, opt)
        
        
        # 开启混合精度计算
        self.scaler = GradScaler()
        # specify the training losses you want to print out. The training/test scripts will call <BaseModel.get_current_losses>
        # self.loss_names = ['G_GAN', 'G_L1', 'D_real', 'D_fake']
        self.loss_names = ['D_fake', 'D_real', 'PD_pred_fake', 'PD_pred_real', 'D_G_fake', 'PD_G_fake']
        self.loss_names_supervised = ['D_fake_supervised', 'D_real_supervised', 'PD_pred_fake_supervised', 'PD_pred_real_supervised', 'D_G_fake_supervised', 'PD_G_fake_supervised', 'G_L1']
        # specify the images you want to save/display. The training/test scripts will call <BaseModel.get_current_visuals>

        # self.visual_names = ['real_A', 'fake_B', 'real_B']
        self.visual_names = ['G_ulbed_rgb', 'ulbed_rgb']
        self.visual_names_supervised = ['G_lbed_rgb', 'lbed_rgb', 'lbed_gt']

        # get the images_path
        self.image_paths = ''       #################################这里需要修改##############################

        # specify the models you want to save to the disk. The training/test scripts will call <BaseModel.save_networks> and <BaseModel.load_networks>
        if self.isTrain:
            self.model_names = ['G', 'D',"PD"]
        else:  # during test time, only load G
            self.model_names = ['G']
        # define networks (both generator and discriminator)

        self.netG = networks.define_G_semi(init_type=opt.init_type, init_gain=opt.init_gain, gpu_ids=self.gpu_ids)
        # self.netG = networks.define_G(opt.input_nc, opt.output_nc, opt.ngf, opt.netG, opt.norm,
        #                               not opt.no_dropout, opt.init_type, opt.init_gain, self.gpu_ids)

        if self.isTrain:  # define a discriminator; conditional GANs need to take both input and output images; Therefore, #channels for D is input_nc + output_nc
            # self.netD = networks.define_D(opt.input_nc + opt.output_nc, opt.ndf, opt.netD,
            #                               opt.n_layers_D, opt.norm, opt.init_type, opt.init_gain, self.gpu_ids)
            self.netD = networks.define_D(opt.output_nc, opt.ndf, opt.netD,
                                          opt.n_layers_D, opt.norm, opt.init_type, opt.init_gain, self.gpu_ids)
            self.netPD = networks.define_D(4, opt.ndf*2, opt.netD,
                                          opt.n_layers_D, opt.norm, opt.init_type, opt.init_gain, self.gpu_ids)

        if self.isTrain:
            # define loss functions
            self.criterionGAN = networks.GANLoss(opt.gan_mode).to(self.device)
            self.criterionL1 = torch.nn.L1Loss()
            # initialize optimizers; schedulers will be automatically created by function <BaseModel.setup>.
            self.optimizer_G = torch.optim.Adam(self.netG.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
            self.optimizer_D = torch.optim.Adam(self.netD.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
            self.optimizer_PD = torch.optim.Adam(self.netPD.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
            self.optimizers.append(self.optimizer_G)
            self.optimizers.append(self.optimizer_D)
            self.optimizers.append(self.optimizer_PD)

    def set_input(self, input):
        """Unpack input data from the dataloader and perform necessary pre-processing steps.

        Parameters:
            input (dict): include the data itself and its metadata information.

        The option 'direction' can be used to swap images in domain A and domain B.
        """
        # AtoB = self.opt.direction == 'AtoB'
        # self.real_A = input['A' if AtoB else 'B'].to(self.device)
        # self.real_B = input['B' if AtoB else 'A'].to(self.device)
        # self.image_paths = input['A_paths' if AtoB else 'B_paths']

        # self.ulbed_depth = input['ulbed_depth'].to(self.device)
        self.ulbed_rgb = input['ulbed_rgb'].to(self.device)
        # self.lbed_depth = input['lbed_depth'].to(self.device)
        self.lbed_rgb = input['lbed_rgb'].to(self.device)
        self.lbed_gt = input['lbed_gt'].to(self.device)
        # self.ulbed_depth_rgb = torch.concatenate([self.ulbed_depth, self.ulbed_rgb],1)

    def forward(self):
        """Run forward pass; called by both functions <optimize_parameters> and <test>."""
        # self.fake_B = self.netG(self.ulbed_depth_rgb)  # G(A)
        with autocast():
            self.G_ulbed_rgb = self.netG(self.ulbed_rgb)

    def forward_supervised(self):
        # self.G_ulbed_rgb = self.netG(self.ulbed_rgb)
        with autocast():
            self.G_lbed_rgb = self.netG(self.lbed_rgb)

    def backward_D(self):
        """Calculate GAN loss for the discriminator"""
        # Fake; stop backprop to the generator by detaching fake_B
        # fake_AB = torch.cat((self.real_A, self.fake_B), 1)  # we use conditional GANs; we need to feed both input and output to the discriminator
        # pred_fake = self.netD(fake_AB.detach())
        with torch.autograd.detect_anomaly():
            with autocast():
                pred_fake = self.netD(self.G_ulbed_rgb.detach())
                self.loss_D_fake = self.criterionGAN(pred_fake, False)
                # Real
                # real_AB = torch.cat((self.real_A, self.real_B), 1)
                pred_real = self.netD(self.lbed_gt.detach())
                self.loss_D_real = self.criterionGAN(pred_real, True)
                # combine loss and calculate gradients
                self.loss_D = (self.loss_D_fake + self.loss_D_real) * 0.5
            self.scaler.scale(self.loss_D).backward()
        # self.loss_D.backward()
    
    def backward_D_supervised(self):
        """Calculate GAN loss for the discriminator"""
        # Fake; stop backprop to the generator by detaching fake_B
        # fake_AB = torch.cat((self.real_A, self.fake_B), 1)  # we use conditional GANs; we need to feed both input and output to the discriminator
        # pred_fake = self.netD(fake_AB.detach())
        with autocast():
            pred_fake = self.netD(self.G_lbed_rgb.detach())
            self.loss_D_fake_supervised = self.criterionGAN(pred_fake, False)
            
            # Real
            # real_AB = torch.cat((self.real_A, self.real_B), 1)
            pred_real = self.netD(self.lbed_gt.detach())
            self.loss_D_real_supervised = self.criterionGAN(pred_real, True)
            # combine loss and calculate gradients
            self.loss_D_supervised = (self.loss_D_fake_supervised + self.loss_D_real_supervised) * 0.5
        self.scaler.scale(self.loss_D_supervised).backward()
        # self.loss_D_supervised.backward()

    def backward_PD(self):
        # fake
        with torch.autograd.detect_anomaly():
            with autocast():
                fake = torch.cat((self.G_ulbed_rgb,  self.ulbed_rgb), 1)
                pred_fake = self.netPD(fake.detach())
                self.loss_PD_pred_fake = self.criterionGAN(pred_fake, False)
        
                # real
                real = torch.cat((self.lbed_gt, self.lbed_rgb), 1)
                pred_real = self.netPD(real.detach())
                self.loss_PD_pred_real = self.criterionGAN(pred_real, True)
                self.loss_PD = (self.loss_PD_pred_fake + self.loss_PD_pred_real) * 0.5
            self.scaler.scale(self.loss_PD).backward()
        if torch.isinf(self.loss_PD).any():
            print("self.loss_PD梯度中包含inf值")
        else:
            print("self.loss_PD梯度中没有inf值")
        if torch.isnan(self.loss_PD).any():
            print("self.loss_PD梯度中包含isnan值")
        else:
            print("self.loss_PD梯度中没有isnan值")
        # self.loss_PD.backward()

    def backward_PD_supervised(self):
        with autocast():
            fake = torch.cat((self.G_lbed_rgb, self.lbed_rgb), 1)
            pred_fake = self.netPD(fake.detach())
            self.loss_PD_pred_fake_supervised = self.criterionGAN(pred_fake, False)
    
            real = torch.cat((self.lbed_gt, self.lbed_rgb), 1)
            pred_real = self.netPD(real.detach())
            self.loss_PD_pred_real_supervised = self.criterionGAN(pred_real, True)
            self.loss_PD_supervised = (self.loss_PD_pred_fake_supervised + self.loss_PD_pred_real_supervised) * 0.5
        self.scaler.scale(self.loss_PD_supervised).backward()
        # self.loss_PD_supervised.backward()


    # def backward_G(self):
    #     """Calculate GAN and L1 loss for the generator"""
    #     # First, G(A) should fake the discriminator
    #     fake_AB = torch.cat((self.real_A, self.fake_B), 1)
    #     pred_fake = self.netD(fake_AB)
    #     self.loss_G_GAN = self.criterionGAN(pred_fake, True)
    #     # Second, G(A) = B
    #     self.loss_G_L1 = self.criterionL1(self.fake_B, self.real_B) * self.opt.lambda_L1
    #     # combine loss and calculate gradients
    #     self.loss_G = self.loss_G_GAN + self.loss_G_L1
    #     self.loss_G.backward()

    def backward_G(self):
        """Calculate GAN and L1 loss for the generator"""
        # First, G(A) should fake the discriminator
        with torch.autograd.detect_anomaly():
            with autocast():
                pred_fake_1 = self.netD(self.G_ulbed_rgb.detach())
                self.loss_D_G_fake = self.criterionGAN(pred_fake_1, True)
                fake = torch.cat((self.G_ulbed_rgb,  self.ulbed_rgb), 1)
                pred_fake_2 = self.netPD(fake.detach())
                self.loss_PD_G_fake = self.criterionGAN(pred_fake_2, True)
                self.loss_G = self.loss_D_G_fake + self.loss_PD_G_fake
                # Second, G(A) = B
            # combine loss and calculate gradients
            # self.loss_G = self.loss_G_GAN + self.loss_G_L1
            # self.loss_G.requires_grad = True
            self.scaler.scale(self.loss_G).backward()
        if torch.isinf(self.loss_G).any():
            print("self.loss_G梯度中包含inf值")
        else:
            print("self.loss_G梯度中没有inf值")
        if torch.isnan(self.loss_G).any():
            print("self.loss_G梯度中包含isnan值")
        else:
            print("self.loss_G梯度中没有isnan值")

        # self.loss_G.backward()

    def backward_G_supervised(self):
        """Calculate GAN and L1 loss for the generator"""
        # First, G(A) should fake the discriminator
        with autocast():
            pred_fake_1 = self.netD(self.G_lbed_rgb.detach())
            self.loss_D_G_fake_supervised = self.criterionGAN(pred_fake_1, True)
    
            fake = torch.cat((self.G_lbed_rgb,  self.lbed_rgb), 1)
            pred_fake_2 = self.netPD(fake.detach())
            self.loss_PD_G_fake_supervised = self.criterionGAN(pred_fake_2, True)
    
            self.loss_G_L1 = self.criterionL1(self.G_lbed_rgb, self.lbed_gt) * self.opt.lambda_L1
            self.loss_G_supervised = self.loss_D_G_fake_supervised + self.loss_PD_G_fake_supervised + self.loss_G_L1
        # Second, G(A) = B
        # combine loss and calculate gradients
        # self.loss_G = self.loss_G_GAN + self.loss_G_L1
        self.scaler.scale(self.loss_G_supervised).backward()
        # self.loss_G_supervised.backward()

    def optimize_parameters(self):
        self.forward()                   # compute fake images: G(A)
        # update D
        self.set_requires_grad(self.netD, True)  # enable backprop for D
        self.set_requires_grad(self.netPD, False)
        self.optimizer_D.zero_grad()     # set D's gradients to zero
        self.backward_D()                # calculate gradients for D
        self.scaler.step(self.optimizer_D)
        self.scaler.update()
        # self.optimizer_D.step()          # update D's weights
        # ipdate PD
        self.set_requires_grad(self.netPD, True)  # enable backprop for D
        self.set_requires_grad(self.netD, False)
        self.optimizer_PD.zero_grad()     # set D's gradients to zero
        self.backward_PD()                # calculate gradients for D
        self.scaler.step(self.optimizer_PD)
        self.scaler.update()
        # self.optimizer_PD.step()          # update D's weights
        # update G
        # self.set_requires_grad(self.netD, False)  # D requires no gradients when optimizing G
        # self.set_requires_grad(self.netPD, False)
        self.optimizer_G.zero_grad()        # set G's gradients to zero
        self.backward_G()                   # calculate graidents for G
        # self.optimizer_G.step()
        # self.scaler._has_inf_or_nan = True
        # print(len(self.optimizer_G.optimizer_state["found_inf_per_device"]))
        self.scaler.step(self.optimizer_G)
        self.scaler.update()
        # self.optimizer_G.step()             # update G's weights

    def optimize_parameters_supervised(self):
        self.forward_supervised()                 # compute fake images: G(A)
        # update D
        self.set_requires_grad(self.netD, True)   # enable backprop for D
        self.set_requires_grad(self.netPD, False)
        self.optimizer_D.zero_grad()     # set D's gradients to zero
        self.backward_D_supervised()              # calculate gradients for D
        self.scaler.step(self.optimizer_D)
        self.scaler.update()
        # self.optimizer_D.step()          # update D's weights
        # update PD
        self.set_requires_grad(self.netPD, True)  # enable backprop for D
        self.set_requires_grad(self.netD, False)
        self.optimizer_PD.zero_grad()     # set D's gradients to zero
        self.backward_PD_supervised()             # calculate gradients for D
        self.scaler.step(self.optimizer_PD)
        self.scaler.update()
        # self.optimizer_PD.step()          # update D's weights
        # update G
        self.set_requires_grad(self.netD, False)  # D requires no gradients when optimizing G
        self.set_requires_grad(self.netPD, False)
        self.optimizer_G.zero_grad()        # set G's gradients to zero
        self.backward_G_supervised()              # calculate graidents for G
        self.scaler.step(self.optimizer_G)
        self.scaler.update()
        # self.optimizer_G.step()             # update G's weights

I have made numerous attempts to resolve this issue, including checking for NaN or Inf values in the output, and using with torch.autograd.detect_anomaly(): to debug, but I couldn't find any errors. Strangely, when using detect_anomaly, no errors are reported, but the problem persists with the same error as before.

I suspect that the problem might be related to the usage of automatic mixed precision (AMP) and torch.cuda.amp.GradScaler() in my code. I am using AMP to speed up training on a GPU. The error occurs during the optimization step (self.scaler.step(self.optimizer_G)) of the Pix2Pix model

0

There are 0 best solutions below