Two implementations of virtual batch norm lead to two different results


Introductory question:

Should both of these code snippets lead to different behavior during training (for any loss / any optimizer)?

# first code
inputs1 = tf.placeholder(shape=[16,1,32,32], dtype=tf.float32)
inputs2 = tf.placeholder(shape=[16,1,32,32], dtype=tf.float32)
full_inputs = tf.concat([inputs1, inputs2], axis=0)
with tf.variable_scope('convnet'):
    outputs = tf.contrib.layers.conv2d(full_inputs, num_outputs=1, kernel_size=[3,3],
                                       stride=[1,1], padding='VALID',
                                       data_format='NCHW', scope='Layer0')

# second code
inputs1 = tf.placeholder(shape=[16,1,32,32], dtype=tf.float32)
inputs2 = tf.placeholder(shape=[16,1,32,32], dtype=tf.float32)
full_inputs = tf.concat([inputs1, inputs2], axis=0)

with tf.variable_scope('convnet'):
    outputs1 = tf.contrib.layers.conv2d(inputs1, num_outputs=1, kernel_size=[3,3],
                                        stride=[1,1], padding='VALID',
                                        data_format='NCHW', scope='Layer0')

with tf.variable_scope('convnet', reuse=True):
    outputs2 = tf.contrib.layers.conv2d(inputs2, num_outputs=1, kernel_size=[3,3],
                                        stride=[1,1], padding='VALID',
                                        data_format='NCHW', scope='Layer0')

outputs = tf.concat([outputs1, outputs2], axis=0)
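
For what it's worth, here is a minimal sketch of how I would compare the two constructions numerically. The explicit scope='Layer0' names, the random feed values and the session run are additions for the sketch (not part of my training code), and NCHW assumes a GPU:

import numpy as np
import tensorflow as tf

inputs1 = tf.placeholder(shape=[16,1,32,32], dtype=tf.float32)
inputs2 = tf.placeholder(shape=[16,1,32,32], dtype=tf.float32)

# first construction: convolve the concatenated batch
with tf.variable_scope('convnet'):
    out_a = tf.contrib.layers.conv2d(tf.concat([inputs1, inputs2], axis=0),
                                     num_outputs=1, kernel_size=[3,3], stride=[1,1],
                                     padding='VALID', data_format='NCHW', scope='Layer0')

# second construction: convolve each half separately with the same weights, then concat
with tf.variable_scope('convnet', reuse=True):
    out_b1 = tf.contrib.layers.conv2d(inputs1, num_outputs=1, kernel_size=[3,3],
                                      stride=[1,1], padding='VALID',
                                      data_format='NCHW', scope='Layer0')
with tf.variable_scope('convnet', reuse=True):
    out_b2 = tf.contrib.layers.conv2d(inputs2, num_outputs=1, kernel_size=[3,3],
                                      stride=[1,1], padding='VALID',
                                      data_format='NCHW', scope='Layer0')
out_b = tf.concat([out_b1, out_b2], axis=0)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    feed = {inputs1: np.random.rand(16,1,32,32).astype(np.float32),
            inputs2: np.random.rand(16,1,32,32).astype(np.float32)}
    a, b = sess.run([out_a, out_b], feed)
    # convolution is applied per sample, so with shared weights this should print True
    print(np.allclose(a, b, atol=1e-5))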

My real case scenario:

I'm trying to implement virtual batch norm, and I have two implementations which don't behave the same way; both are heavily inspired by the improved GAN repository. The implementations shown here are simplified to keep mostly what differs between them.

First implementation:

@add_arg_scope
def vbn_single(x, epsilon=1e-5, scope=None):
    assert isinstance(epsilon, float)
    shape = x.get_shape().as_list()
    if shape[0] is None:
        # fall back to the dynamic shape when the static batch size is unknown
        half_size = tf.shape(x)[0] // 2
    else:
        half_size = shape[0] // 2
    needs_reshape = len(shape) != 4
    if needs_reshape:
        orig_shape = shape
        if len(shape) == 2:
            x = tf.reshape(x, [shape[0], shape[1], 1, 1])
        elif len(shape) == 1:
            x = tf.reshape(x, [shape[0], 1, 1, 1])
        else:
            assert False, shape
        shape = x.get_shape().as_list()
    batch_size = int(x.get_shape()[0])
    with tf.variable_scope(scope, 'VBN'):
        ref_half = tf.slice(x, [0,0,0,0], [half_size, shape[1], \
                            shape[2], shape[3]])
        gamma = tf.get_variable("gamma", [1,shape[1],1,1],
                    initializer=tf.constant_initializer(1.))
        beta = tf.get_variable("beta", [1,shape[1],1,1],
                    initializer=tf.constant_initializer(0.))
        ref_mean, ref_var = tf.nn.moments(ref_half, [0,2,3], \
                                          keep_dims=True)
        inv_std = tf.rsqrt(ref_var + epsilon)
        coeff = inv_std * gamma
    return (x * coeff) + (beta - ref_mean * coeff)

inputs = tf.placeholder(shape=[32, 1, 256, 256], dtype=tf.float32)
reference_batch = tf.get_variable('reference_batch', initializer=reference_array)
full_inputs = tf.concat([reference_batch, inputs], axis=0)
L = []
with tf.variable_scope('convnet'):
    L.append(tf.contrib.layers.conv2d(full_inputs, [...], \
                                      scope='Layer0'))
    L.append(vbn_single(L[-1], scope='Norm0'))
    L.append(tf.nn.relu(L[-1], name='Activ0'))
    L.append(tf.contrib.layers.conv2d(L[-1], [...], \
                                      scope='Layer1'))
    L.append(vbn_single(L[-1], scope='Norm1'))
    L.append(tf.nn.relu(L[-1], name='Activ1'))
    L.append(tf.contrib.layers.conv2d(L[-1], [...], \
                                      scope='Layer2'))
    L.append(vbn_single(L[-1], scope='Norm2'))
    L.append(tf.nn.relu(L[-1], name='Activ2'))
shape = L[-1].get_shape().as_list()
half_size = shape[0] // 2
L.append(tf.slice(L[-1], [half_size,0,0,0], \
              [half_size, shape[1], shape[2], shape[3]]))
L.append(tf.reduce_mean(L[-1], axis=[2,3]))
L.append(tf.contrib.layers.fully_connected(L[-1], num_outputs=2))
# loss accuracy and optimizer
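
The elided loss / accuracy / optimizer part isn't important for the question; roughly, it looks like this (the labels placeholder, the softmax cross-entropy and the Adam optimizer below are illustrative stand-ins, not my exact code):

labels = tf.placeholder(shape=[None], dtype=tf.int64)
logits = L[-1]   # shape [half_size, 2] after the fully connected layer
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))
accuracy = tf.reduce_mean(
    tf.cast(tf.equal(tf.argmax(logits, axis=1), labels), tf.float32))
train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)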

Everything seems to work fine: training and validation accuracy both converge, and the loss goes down.

Second implementation:

class Vbn_double(object):
    def __init__(self, x, epsilon=1e-5, scope=None, data_format='NCHW'):
        # data_format added so the reshape branches below are well-defined
        self.data_format = data_format
        shape = x.get_shape().as_list()
        needs_reshape = len(shape) != 4
        if needs_reshape:
            orig_shape = shape
            if len(shape) == 2:
                if self.data_format == 'NCHW':
                    x = tf.reshape(x, [shape[0], shape[1], 1, 1])
                else:
                    x = tf.reshape(x, [shape[0], 1, 1, shape[1]])
            elif len(shape) == 1:
                x = tf.reshape(x, [shape[0], 1, 1, 1])
            else:
                assert False, shape
            shape = x.get_shape().as_list()
        with tf.variable_scope(scope):
            self.epsilon = epsilon
            self.scope = scope
            self.mean, self.var = tf.nn.moments(x, [0,2,3], \
                                                keep_dims=True)
            self.inv_std = tf.rsqrt(self.var + epsilon)
            self.batch_size = int(x.get_shape()[0])
            out = self._normalize(x, self.mean, self.inv_std)
            if needs_reshape:
                out = tf.reshape(out, orig_shape)
            self.reference_output = out

    def __call__(self, x):
        shape = x.get_shape().as_list()
        needs_reshape = len(shape) != 4
        if needs_reshape:
            orig_shape = shape
            if len(shape) == 2:
                if self.data_format == 'NCHW':
                    x = tf.reshape(x, [shape[0], shape[1], 1, 1])
                else:
                    x = tf.reshape(x, [shape[0], 1, 1, shape[1]])
            elif len(shape) == 1:
                x = tf.reshape(x, [shape[0], 1, 1, 1])
            else:
                assert False, shape
        with tf.variable_scope(self.scope, reuse=True):
            out = self._normalize(x, self.mean, self.inv_std)
            if needs_reshape:
                out = tf.reshape(out, orig_shape)
        return out

    def _normalize(self, x, mean, inv_std):
        shape = x.get_shape().as_list()
        assert len(shape) == 4
        gamma = tf.get_variable("gamma", [1,shape[1],1,1],
                        initializer=tf.constant_initializer(1.))
        beta = tf.get_variable("beta", [1,shape[1],1,1],
                        initializer=tf.constant_initializer(0.))
        coeff = gamma * inv_std
        return (x * coeff) + (beta - mean * coeff)

inputs = tf.placeholder(shape=[32, 1, 256, 256], dtype=tf.float32)
reference_batch = tf.get_variable('reference_batch', initializer=reference_array)
L = []
vbn = {}
with tf.variable_scope('convnet'):
    L.append(tf.contrib.layers.conv2d(reference_batch, [...], \
                                      scope='Layer0'))
    vbn['Norm0'] = Vbn_double(L[-1], scope='Norm0')
    L.append(vbn['Norm0'].reference_output)
    L.append(tf.nn.relu(L[-1], name='Activ0'))
    L.append(tf.contrib.layers.conv2d(L[-1], [...], \
                                      scope='Layer1'))
    vbn['Norm1'] = Vbn_double(L[-1], scope='Norm1')
    L.append(vbn['Norm1'].reference_output)
    L.append(tf.nn.relu(L[-1], name='Activ1'))
    L.append(tf.contrib.layers.conv2d(L[-1], [...], \
                                      scope='Layer2'))
    vbn['Norm2'] = Vbn_double(L[-1], scope='Norm2')
    L.append(vbn['Norm2'].reference_output)
    L.append(tf.nn.relu(L[-1], name='Activ2'))

with tf.variable_scope('convnet', reuse=True):
    L.append(tf.contrib.layers.conv2d(inputs, [...], \
                                      scope='Layer0'))
    L.append(vbn['Norm0'](L[-1]))
    L.append(tf.nn.relu(L[-1], name='Activ0'))
    L.append(tf.contrib.layers.conv2d(L[-1], [...], \
                                      scope='Layer1'))
    L.append(vbn['Norm1'](L[-1]))
    L.append(tf.nn.relu(L[-1], name='Activ1'))
    L.append(tf.contrib.layers.conv2d(L[-1], [...], \
                                      scope='Layer2'))
    L.append(vbn['Norm2'](L[-1]))
    L.append(tf.nn.relu(L[-1], name='Activ2'))
L.append(tf.reduce_mean(L[-1], axis=[2,3]))
L.append(tf.contrib.layers.fully_connected(L[-1], num_outputs=2))
# loss accuracy and optimizer

Here only the training loss converges (with a slightly different curve than in the first implementation), while the validation loss increases and the validation accuracy stays at random guessing.

For the record, I'm running on GPU with TensorFlow 1.2.1 and XLA enabled. Any clue about what I'm doing wrong?

Edit:

So I tried to compare the outputs of both models, and also to look at the gradients (using compute_gradients). To avoid sharing the weights (and therefore the gradients), I built the two models in two different scopes and loaded the same weights (from a previously trained model) into each of them separately.
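
The weight loading was done roughly like this (the 'model1' / 'model2' scope names and the checkpoint path are just for illustration, and this assumes the checkpoint was saved without the extra scope prefix):

vars1 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model1')
vars2 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model2')
# map the checkpoint variable names (without the scope prefix) onto each copy
saver1 = tf.train.Saver({v.op.name.replace('model1/', ''): v for v in vars1})
saver2 = tf.train.Saver({v.op.name.replace('model2/', ''): v for v in vars2})
saver1.restore(sess, 'path/to/model.ckpt')
saver2.restore(sess, 'path/to/model.ckpt')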

I get the same outputs from both models if I just run:

sess.run([model.outputs, model2.outputs])

but if I also fetch the gradients (the first element of each tuple returned by Optimizer.compute_gradients(loss)) at the same time, using:

sess.run([model.outputs, model2.outputs, grads])

suddenly the model outputs are different... How can the model outputs change just by looking at the gradients, without calling apply_gradients? It doesn't seem to have changed the weights either, because if I run:

sess.run(grads)
sess.run([model.outputs, model2.outputs])

the model outputs are still the same...
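
To be explicit, the whole comparison is along these lines (opt, loss and the feed dict are stand-ins for my actual objects):

grad_tensors = [g for g, v in opt.compute_gradients(loss) if g is not None]

out_a, out_b = sess.run([model.outputs, model2.outputs], feed_dict=feed)
out_a2, out_b2, _ = sess.run([model.outputs, model2.outputs, grad_tensors],
                             feed_dict=feed)

print(np.allclose(out_a, out_b))    # True: the two copies agree on their own
print(np.allclose(out_a2, out_b2))  # False as soon as the gradients are fetched too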

1 Answer:

OK, it seems that XLA is buggy here, as I get consistent results after disabling it. Apparently XLA can't handle something in the second implementation...

I will raise an issue about this on the repository later; compute_gradients modifying the outputs is particularly disturbing...
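
For reference, a sketch of the global XLA JIT switch via the session config (assuming that's how XLA was enabled in the first place):

config = tf.ConfigProto()
# this line turns on the XLA JIT; removing it (or setting tf.OptimizerOptions.OFF)
# is what makes the two implementations give consistent results again
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
sess = tf.Session(config=config)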