Why the same configuration network in caffe and pytorch behaves so differently?

Question

Why the same configuration network in caffe and pytorch behaves so differently?

465 Views Asked by 不爱吃猫的鱼 At 06 January 2018 at 09:44

the code in pytorch is here, I only used the vgg19 arch.

In order to make the cifar10 dataset preprocessed same in caffe and pytorch, I remove all the transforms in main.py but toTensor(). I found the cifar10 dataset's range is [0,1] in pytorch, but caffe is [0,255], so I scale 1/255 below. Any extra preprocessing is canceled to make things easier.

here is my caffe net definition prototxt:

layer {
  name: "data"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TRAIN
  }
  transform_param {
    scale: 0.00392156862745
  }
  data_param {
    source: "/home/snk/Documents/digits-jobs/20180106-135745-4cac/train_db"
    batch_size: 128
    backend: LMDB
  }
}
layer {
  name: "data"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TEST
  }
  transform_param {
    scale: 0.00392156862745
  }
  data_param {
    source: "/home/snk/Documents/digits-jobs/20180106-135745-4cac/val_db"
    batch_size: 128
    backend: LMDB
  }
}
layer {
  name: "conv1_1"
  type: "Convolution"
  bottom: "data"
  top: "conv1_1"
  convolution_param {
    num_output: 64
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "relu1_1"
  type: "ReLU"
  bottom: "conv1_1"
  top: "conv1_1"
}
layer {
  name: "conv1_2"
  type: "Convolution"
  bottom: "conv1_1"
  top: "conv1_2"
  convolution_param {
    num_output: 64
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "relu1_2"
  type: "ReLU"
  bottom: "conv1_2"
  top: "conv1_2"
}
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "conv1_2"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv2_1"
  type: "Convolution"
  bottom: "pool1"
  top: "conv2_1"
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "relu2_1"
  type: "ReLU"
  bottom: "conv2_1"
  top: "conv2_1"
}
layer {
  name: "conv2_2"
  type: "Convolution"
  bottom: "conv2_1"
  top: "conv2_2"
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "relu2_2"
  type: "ReLU"
  bottom: "conv2_2"
  top: "conv2_2"
}
layer {
  name: "pool2"
  type: "Pooling"
  bottom: "conv2_2"
  top: "pool2"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv3_1"
  type: "Convolution"
  bottom: "pool2"
  top: "conv3_1"
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "relu3_1"
  type: "ReLU"
  bottom: "conv3_1"
  top: "conv3_1"
}
layer {
  name: "conv3_2"
  type: "Convolution"
  bottom: "conv3_1"
  top: "conv3_2"
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "relu3_2"
  type: "ReLU"
  bottom: "conv3_2"
  top: "conv3_2"
}
layer {
  name: "conv3_3"
  type: "Convolution"
  bottom: "conv3_2"
  top: "conv3_3"
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "relu3_3"
  type: "ReLU"
  bottom: "conv3_3"
  top: "conv3_3"
}
layer {
  name: "conv3_4"
  type: "Convolution"
  bottom: "conv3_3"
  top: "conv3_4"
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "relu3_4"
  type: "ReLU"
  bottom: "conv3_4"
  top: "conv3_4"
}
layer {
  name: "pool3"
  type: "Pooling"
  bottom: "conv3_4"
  top: "pool3"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv4_1"
  type: "Convolution"
  bottom: "pool3"
  top: "conv4_1"
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "relu4_1"
  type: "ReLU"
  bottom: "conv4_1"
  top: "conv4_1"
}
layer {
  name: "conv4_2"
  type: "Convolution"
  bottom: "conv4_1"
  top: "conv4_2"
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "relu4_2"
  type: "ReLU"
  bottom: "conv4_2"
  top: "conv4_2"
}
layer {
  name: "conv4_3"
  type: "Convolution"
  bottom: "conv4_2"
  top: "conv4_3"
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "relu4_3"
  type: "ReLU"
  bottom: "conv4_3"
  top: "conv4_3"
}
layer {
  name: "conv4_4"
  type: "Convolution"
  bottom: "conv4_3"
  top: "conv4_4"
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "relu4_4"
  type: "ReLU"
  bottom: "conv4_4"
  top: "conv4_4"
}
layer {
  name: "pool4"
  type: "Pooling"
  bottom: "conv4_4"
  top: "pool4"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv5_1"
  type: "Convolution"
  bottom: "pool4"
  top: "conv5_1"
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "relu5_1"
  type: "ReLU"
  bottom: "conv5_1"
  top: "conv5_1"
}
layer {
  name: "conv5_2"
  type: "Convolution"
  bottom: "conv5_1"
  top: "conv5_2"
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "relu5_2"
  type: "ReLU"
  bottom: "conv5_2"
  top: "conv5_2"
}
layer {
  name: "conv5_3"
  type: "Convolution"
  bottom: "conv5_2"
  top: "conv5_3"
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "relu5_3"
  type: "ReLU"
  bottom: "conv5_3"
  top: "conv5_3"
}
layer {
  name: "conv5_4"
  type: "Convolution"
  bottom: "conv5_3"
  top: "conv5_4"
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "relu5_4"
  type: "ReLU"
  bottom: "conv5_4"
  top: "conv5_4"
}
layer {
  name: "pool5"
  type: "Pooling"
  bottom: "conv5_4"
  top: "pool5"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "drop6"
  type: "Dropout"
  bottom: "pool5"
  top: "drop_pool5"
  dropout_param {
    dropout_ratio: 0.5
  }
}
layer {
  name: "fc6"
  type: "InnerProduct"
  bottom: "drop_pool5"
  top: "fc6"
  inner_product_param {
    num_output: 512
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "relu6"
  type: "ReLU"
  bottom: "fc6"
  top: "fc6"
}
layer {
  name: "drop7"
  type: "Dropout"
  bottom: "fc6"
  top: "fc6"
  dropout_param {
    dropout_ratio: 0.5
  }
}
layer {
  name: "fc7"
  type: "InnerProduct"
  bottom: "fc6"
  top: "fc7"
  inner_product_param {
    num_output: 512
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "relu7"
  type: "ReLU"
  bottom: "fc7"
  top: "fc7"
}
layer {
  name: "fc8"
  type: "InnerProduct"
  bottom: "fc7"
  top: "fc8"
  inner_product_param {
    num_output: 10
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "acc"
  type: "Accuracy"
  bottom: "fc8"
  bottom: "label"
  top: "accuracy"
  include {
    phase: TEST
  }
}
layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "fc8"
  bottom: "label"
  top: "loss"
}

Here is my solver.prototxt

test_iter: 79
test_interval: 391
base_lr: 0.05
display: 40
max_iter: 117300
lr_policy: "step"
gamma: 0.5
momentum: 0.9
weight_decay: 0.0005
stepsize: 11730
snapshot: 3910
snapshot_prefix: "snapshot"
solver_mode: GPU
net: "train_val.prototxt"
solver_type: SGD

The fact is pytorch can train the model at a very quick speed, the acc after one epoch is about 20%, but in caffe the loss is always around 2.3XXX (about -log(0.1), random guess loss), I suspect it was because the different weight initialization, so I changed caffe's filler.hpp's xavier to 'efficient backprop'(i.e U(-std, std), std=1/(sqrt(fan_in))), but it didn't work.

Now, the only difference is the bias initialization method, in pytorch it uses the weight's fan_in, but in caffe I think it uses the output_num as fan_in (because its shape is [1,N], N is the number of output neurons, and in filler.hpp it use blob.count() / blob.num() as fan_in for xavier ).

Who can help me ? I think if all the configuration is the same, the training process is almost the same ,but it breaked my opinion.

Original Q&A

Why the same configuration network in caffe and pytorch behaves so differently?

There are 0 best solutions below

Related Questions in CAFFE

Related Questions in PYTORCH

Related Questions in NVIDIA-DIGITS

Trending Questions

Popular # Hahtags

Popular Questions