Deep neural network training: why is the network not converging?


I'm using MatConvNet's DagNN wrapper with an AlexNet-style architecture. My network definition and training setup are as follows:

  net = dagnn.DagNN() ;
  imdb_32 = load('imdb_all_32_pd_norm.mat');
  imdb_32 = imdb_32.imdb;
  % some common options
  opts.train.batchSize = 100;
  opts.train.numEpochs = 100 ;
  opts.train.continue = true ;
  opts.train.gpus = [] ;
  opts.train.learningRate = 0.2; % [0.1*ones(1,30), 0.01*ones(1,30), 0.001*ones(1,30)] ; % 0.002 ; % [2e-1*ones(1,10), 2e-2*ones(1,5)] ;
  opts.train.momentum = 0.9;
  opts.train.expDir = expDir;
  opts.train.numSubBatches = 1;

  bopts.useGpu = 0; % numel(opts.train.gpus) > 0 ;

  %% NET
  net.addLayer('conv1', dagnn.Conv('size', [11 11 3 96], 'hasBias', true, 'stride', [4, 4], 'pad', [20 20 20 20]), {'input'}, {'conv1'}, {'conv1f' 'conv1b'});
  net.addLayer('relu1', dagnn.ReLU(), {'conv1'}, {'relu1'}, {});
  net.addLayer('lrn1', dagnn.LRN('param', [5 1 2.0000e-05 0.7500]), {'relu1'}, {'lrn1'}, {});
  net.addLayer('pool1', dagnn.Pooling('method', 'max', 'poolSize', [3, 3], 'stride', [2 2], 'pad', [0 0 0 0]), {'lrn1'}, {'pool1'}, {});

  net.addLayer('conv2', dagnn.Conv('size', [5 5 48 256], 'hasBias', true, 'stride', [1, 1], 'pad', [2 2 2 2]), {'pool1'}, {'conv2'}, {'conv2f' 'conv2b'});
  net.addLayer('relu2', dagnn.ReLU(), {'conv2'}, {'relu2'}, {});
  net.addLayer('lrn2', dagnn.LRN('param', [5 1 2.0000e-05 0.7500]), {'relu2'}, {'lrn2'}, {});
  net.addLayer('pool2', dagnn.Pooling('method', 'max', 'poolSize', [3, 3], 'stride', [2 2], 'pad', [0 0 0 0]), {'lrn2'}, {'pool2'}, {});
  net.addLayer('drop2', dagnn.DropOut('rate', 0.7), {'pool2'}, {'drop2'});

  net.addLayer('conv3', dagnn.Conv('size', [3 3 256 384], 'hasBias', true, 'stride', [1, 1], 'pad', [1 1 1 1]), {'drop2'}, {'conv3'}, {'conv3f' 'conv3b'});
  net.addLayer('relu3', dagnn.ReLU(), {'conv3'}, {'relu3'}, {});

  net.addLayer('conv4', dagnn.Conv('size', [3 3 192 384], 'hasBias', true, 'stride', [1, 1], 'pad', [1 1 1 1]), {'relu3'}, {'conv4'}, {'conv4f' 'conv4b'});
  net.addLayer('relu4', dagnn.ReLU(), {'conv4'}, {'relu4'}, {});

  net.addLayer('conv5', dagnn.Conv('size', [3 3 192 256], 'hasBias', true, 'stride', [1, 1], 'pad', [1 1 1 1]), {'relu4'}, {'conv5'}, {'conv5f' 'conv5b'});
  net.addLayer('relu5', dagnn.ReLU(), {'conv5'}, {'relu5'}, {});
  net.addLayer('pool5', dagnn.Pooling('method', 'max', 'poolSize', [3 3], 'stride', [2 2], 'pad', [0 0 0 0]), {'relu5'}, {'pool5'}, {});
  net.addLayer('drop5', dagnn.DropOut('rate', 0.5), {'pool5'}, {'drop5'});

  net.addLayer('fc6', dagnn.Conv('size', [1 1 256 4096], 'hasBias', true, 'stride', [1, 1], 'pad', [0 0 0 0]), {'drop5'}, {'fc6'}, {'conv6f' 'conv6b'});
  net.addLayer('relu6', dagnn.ReLU(), {'fc6'}, {'relu6'}, {});

  net.addLayer('fc7', dagnn.Conv('size', [1 1 4096 4096], 'hasBias', true, 'stride', [1, 1], 'pad', [0 0 0 0]), {'relu6'}, {'fc7'}, {'conv7f' 'conv7b'});
  net.addLayer('relu7', dagnn.ReLU(), {'fc7'}, {'relu7'}, {});
  classLabels = max(unique(imdb_32.images.labels));
  net.addLayer('classifier', dagnn.Conv('size', [1 1 4096 1], 'hasBias', true, 'stride', [1, 1], 'pad', [0 0 0 0]), {'relu7'}, {'prediction'}, {'conv8f' 'conv8b'});
  net.addLayer('prob', dagnn.SoftMax(), {'prediction'}, {'prob'}, {});
  net.addLayer('l2_loss', dagnn.L2Loss(), {'prob', 'label'}, {'objective'});
  net.addLayer('error', dagnn.Loss('loss', 'classerror'), {'prob', 'label'}, {'error'}) ;

  opts.colorDeviation = zeros(3) ;
  net.meta.augmentation.jitterFlip = true ;
  net.meta.augmentation.jitterLocation = true ;
  net.meta.augmentation.jitterBrightness = double(0.1 * opts.colorDeviation) ;
  net.meta.augmentation.jitterAspect = [3/4, 4/3] ;
  net.meta.augmentation.jitterScale  = [0.4, 1.1] ;
  net.meta.augmentation.jitterSaturation = 0.4 ;
  net.meta.augmentation.jitterContrast = 0.4 ;
  % net.meta.augmentation.jitterAspect = [2/3, 3/2] ;
  net.meta.normalization.averageImage = imdb_32.images.data_mean;
  initNet_He(net);

  info = cnn_train_dag(net, imdb_32, @(i,b) getBatch(bopts,i,b), opts.train, 'val', find(imdb_32.images.set == 2)) ;
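For reference, the getBatch function passed to cnn_train_dag is not shown in the question. A minimal DagNN batch loader for this setup would look roughly like the sketch below; the imdb field names follow the code above, but the body is an assumption, not the asker's actual function:

  function inputs = getBatch(opts, imdb, batch)
  % Sketch of a DagNN batch function (assumed implementation).
  images = imdb.images.data(:,:,:,batch) ;
  labels = imdb.images.labels(1,batch) ;
  if opts.useGpu > 0
    images = gpuArray(images) ;   % move the batch to the GPU when requested
  end
  % DagNN expects inputs as {name, value, ...} pairs
  inputs = {'input', images, 'label', labels} ;
  end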

The result of each epoch is shown in the attached plot. Why aren't the error and objective converging? The regression loss is the MSE loss.

[attached: training plot of objective and error per epoch]
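Note that dagnn.L2Loss is not a stock MatConvNet layer. For context, a minimal custom MSE loss layer might look like the following sketch; the forward/backward method signatures are the standard DagNN layer API, but the exact body is an assumption:

  classdef L2Loss < dagnn.Layer
    % Sketch of a custom mean-squared-error loss layer (assumed implementation).
    methods
      function outputs = forward(obj, inputs, params)
        % inputs{1}: predictions, inputs{2}: targets
        t = inputs{1} - inputs{2} ;
        outputs{1} = sum(t(:).^2) / max(1, size(inputs{1}, 4)) ; % batch-averaged MSE
      end
      function [derInputs, derParams] = backward(obj, inputs, params, derOutputs)
        t = inputs{1} - inputs{2} ;
        derInputs{1} = (2 / max(1, size(inputs{1}, 4))) * t * derOutputs{1} ;
        derInputs{2} = [] ;   % no gradient for the labels
        derParams = {} ;      % the layer has no parameters
      end
    end
  end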

2 Answers

Accepted answer:

The bias and weight initialization of each individual conv filter has to be chosen based on the application at hand. The result you are seeing is due to the signal fading as it passes through successive filters: with a poor initialization, the activations shrink layer by layer until the gradients become too small to drive learning.
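The question calls initNet_He(net) but does not show it. A typical He-style initializer over a DagNN's parameters looks roughly like this sketch; it assumes the parameter arrays have already been allocated (e.g. via net.initParams()) and that every parameter with three or more dimensions is a filter bank:

  function initNet_He(net)
  % Sketch of He initialization for DagNN parameters (assumed implementation).
  % Filters: N(0, sqrt(2/fan_in)); biases: zero. dagnn.DagNN is a handle
  % class, so the network is modified in place.
  for i = 1:numel(net.params)
    sz = size(net.params(i).value) ;
    if numel(sz) >= 3                          % filter bank: h x w x in x out
      fanIn = prod(sz(1:3)) ;
      net.params(i).value = randn(sz, 'single') * sqrt(2 / fanIn) ;
    else                                       % bias vector
      net.params(i).value = zeros(sz, 'single') ;
    end
  end
  end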

Answer:

Try decreasing the momentum, say, to 0.5. With a learning rate of 0.2, a momentum of 0.9 makes the effective step size large enough that the loss can oscillate instead of converging.
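In the options block from the question this is a one-line change; pairing it with a smaller, decaying learning rate is a common companion tweak (the schedule values below are illustrative, not tuned):

  opts.train.momentum = 0.5 ;   % was 0.9
  % optional: decaying schedule instead of a fixed 0.2
  opts.train.learningRate = [0.01*ones(1,50), 0.001*ones(1,50)] ;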