Introductory question:
Should the two code examples below lead to different behavior during training (for any loss / any optimizer)?
# First code: a single convolution applied to the concatenation of both
# input batches.
inputs1 = tf.placeholder(shape=[16, 1, 32, 32], dtype=tf.float32)
inputs2 = tf.placeholder(shape=[16, 1, 32, 32], dtype=tf.float32)
full_inputs = tf.concat([inputs1, inputs2], axis=0)
with tf.variable_scope('convnet'):
    # `kernel_size` / `num_outputs` / `stride` are arguments of the
    # tf.contrib.layers.conv2d layer, not of the low-level tf.nn.conv2d op
    # (which takes an explicit filter tensor and `strides`).
    # The layer is fed the concatenated batch; the original referenced an
    # undefined name `inputs`.
    outputs = tf.contrib.layers.conv2d(
        full_inputs, kernel_size=[3, 3], num_outputs=1, stride=[1, 1],
        padding='VALID', data_format='NCHW')
# Second code: the same convolution applied to each input batch separately
# with shared weights, and the two results concatenated afterwards.
inputs1 = tf.placeholder(shape=[16, 1, 32, 32], dtype=tf.float32)
inputs2 = tf.placeholder(shape=[16, 1, 32, 32], dtype=tf.float32)
# NOTE(review): `full_inputs` is not used by this variant; kept as in the
# original snippet.
full_inputs = tf.concat([inputs1, inputs2], axis=0)
# `kernel_size` / `num_outputs` / `stride` are arguments of the
# tf.contrib.layers.conv2d layer, not of the low-level tf.nn.conv2d op.
# An explicit `scope` is given so that both calls resolve to the same variable
# names; with an auto-generated layer scope the second call would get a
# different (uniquified) name and `reuse=True` could not find the weights.
with tf.variable_scope('convnet'):
    outputs1 = tf.contrib.layers.conv2d(
        inputs1, kernel_size=[3, 3], num_outputs=1, stride=[1, 1],
        padding='VALID', data_format='NCHW', scope='conv')
with tf.variable_scope('convnet', reuse=True):
    outputs2 = tf.contrib.layers.conv2d(
        inputs2, kernel_size=[3, 3], num_outputs=1, stride=[1, 1],
        padding='VALID', data_format='NCHW', scope='conv')
outputs = tf.concat([outputs1, outputs2], axis=0)
My real case scenario:
I'm trying to implement virtual batch norm. I have two implementations, widely inspired by the improved-gan repository, which don't behave the same way. Both implementations shown here are simplified to keep mostly what differs between them.
First implementation:
@add_arg_scope
def vbn_single(x, epsilon=1e-5, scope=None):
    """Normalize `x` with statistics taken from the first half of its batch.

    The first half of the batch (axis 0) is used as the reference: its mean
    and variance are computed over axes [0, 2, 3] (i.e. per channel of an
    NCHW tensor) and then applied, together with a learned per-channel
    `gamma` / `beta`, to the *whole* batch.

    Args:
        x: Tensor of rank 1, 2 or 4. Rank-1 and rank-2 inputs are temporarily
            expanded to rank 4 (`[N, 1, 1, 1]` and `[N, C, 1, 1]`).
        epsilon: Python float added to the variance before `rsqrt`.
        scope: Optional variable scope; defaults to a unique 'VBN' scope.

    Returns:
        A tensor with the same shape as the input `x`.

    Raises:
        AssertionError: if `epsilon` is not a float or `x` has rank 0, 3 or
            greater than 4.
    """
    assert isinstance(epsilon, float)
    orig_shape = x.get_shape().as_list()
    # tf.reshape does not accept None; -1 lets it infer an unknown batch size.
    batch_dim = -1 if orig_shape[0] is None else orig_shape[0]

    needs_reshape = len(orig_shape) != 4
    if needs_reshape:
        if len(orig_shape) == 2:
            # Trailing spatial dims must be 1 (not 0) to keep every element.
            x = tf.reshape(x, [batch_dim, orig_shape[1], 1, 1])
        elif len(orig_shape) == 1:
            x = tf.reshape(x, [batch_dim, 1, 1, 1])
        else:
            assert False, orig_shape
    shape = x.get_shape().as_list()

    if shape[0] is None:
        # Batch size only known at run time: compute the split point in-graph.
        half_size = tf.shape(x)[0] // 2
    else:
        half_size = shape[0] // 2

    with tf.variable_scope(scope, 'VBN'):
        # -1 in `size` means "everything remaining along that axis".
        ref_half = tf.slice(x, [0, 0, 0, 0], [half_size, -1, -1, -1])
        gamma = tf.get_variable("gamma", [1, shape[1], 1, 1],
                                initializer=tf.constant_initializer(1.))
        beta = tf.get_variable("beta", [1, shape[1], 1, 1],
                               initializer=tf.constant_initializer(0.))
        ref_mean, ref_var = tf.nn.moments(ref_half, [0, 2, 3],
                                          keep_dims=True)
        inv_std = tf.rsqrt(ref_var + epsilon)
        coeff = inv_std * gamma
        # Equivalent to gamma * (x - ref_mean) * inv_std + beta.
        out = (x * coeff) + (beta - ref_mean * coeff)

    if needs_reshape:
        # Give rank-1 / rank-2 callers back the rank they passed in.
        out = tf.reshape(
            out, [-1 if dim is None else dim for dim in orig_shape])
    return out
# Graph for the first implementation: the reference batch and the real inputs
# go through the network together as one concatenated batch, and `vbn_single`
# takes its statistics from the first (reference) half.
inputs = tf.placeholder(shape=[32, 1, 256, 256], dtype=tf.float32)
# NOTE(review): created with default arguments, so this variable is trainable;
# confirm whether the reference batch is meant to be updated by the optimizer.
reference_batch = tf.get_variable('reference_batch',
                                  initializer=reference_array)
full_inputs = tf.concat([reference_batch, inputs], axis=0)
L = []
with tf.variable_scope('convnet'):
    # The network is fed `full_inputs`; the original passed `inputs`, which
    # left the reference batch entirely unused.
    L.append(tf.contrib.layers.conv2d(full_inputs, [...],
                                      scope='Layer0'))
    L.append(vbn_single(L[-1], scope='Norm0'))
    L.append(tf.nn.relu(L[-1], name='Activ0'))
    L.append(tf.contrib.layers.conv2d(L[-1], [...],
                                      scope='Layer1'))
    L.append(vbn_single(L[-1], scope='Norm1'))
    L.append(tf.nn.relu(L[-1], name='Activ1'))
    L.append(tf.contrib.layers.conv2d(L[-1], [...],
                                      scope='Layer2'))
    L.append(vbn_single(L[-1], scope='Norm2'))
    L.append(tf.nn.relu(L[-1], name='Activ2'))
# NOTE(review): the original indentation was lost; confirm whether the head
# below belongs inside the 'convnet' scope.
shape = L[-1].get_shape().as_list()
# Keep only the second half of the batch, i.e. drop the reference examples.
# Assumes the reference batch has the same batch size as `inputs` -- TODO
# confirm.
half_size = shape[0] // 2
L.append(tf.slice(L[-1], [half_size, 0, 0, 0],
                  [half_size, shape[1], shape[2], shape[3]]))
L.append(tf.reduce_mean(L[-1], axis=[2, 3]))
L.append(tf.contrib.layers.fully_connected(L[-1], num_outputs=2))
# loss accuracy and optimizer
Everything seems to work OK: validation and training accuracy both increase and converge, and the loss goes down.
Second implementation:
class Vbn_double(object):
    """Normalization whose statistics come from a fixed reference tensor.

    Constructing the object computes the per-channel mean / variance of the
    reference activations `x` and exposes the normalized reference as
    `reference_output`. Calling the object on another tensor normalizes it
    with those same reference statistics and the same `gamma` / `beta`
    variables.

    Args:
        x: Reference tensor of rank 1, 2 or 4 with a statically known batch
            size.
        epsilon: Value added to the variance before `rsqrt`.
        scope: Variable scope name, passed to `tf.variable_scope` both at
            construction and (with `reuse=True`) on every call.
            NOTE(review): the default `None` looks unusable with
            `tf.variable_scope` -- confirm and pass an explicit scope.
        data_format: 'NCHW' (default) puts channels on axis 1; any other value
            is treated as channels-last.
    """

    def __init__(self, x, epsilon=1e-5, scope=None, data_format='NCHW'):
        # `data_format` was read but never defined in the original code.
        self.data_format = data_format
        self.epsilon = epsilon
        self.scope = scope
        x, orig_shape = self._to_4d(x)
        with tf.variable_scope(scope):
            self.mean, self.var = tf.nn.moments(x, self._reduce_axes(),
                                                keep_dims=True)
            self.inv_std = tf.rsqrt(self.var + epsilon)
            self.batch_size = int(x.get_shape()[0])
            out = self._normalize(x, self.mean, self.inv_std)
        if orig_shape is not None:
            out = tf.reshape(out, orig_shape)
        self.reference_output = out

    def __call__(self, x):
        """Normalize `x` with the stored reference statistics."""
        x, orig_shape = self._to_4d(x)
        with tf.variable_scope(self.scope, reuse=True):
            out = self._normalize(x, self.mean, self.inv_std)
        if orig_shape is not None:
            out = tf.reshape(out, orig_shape)
        return out

    def _channel_axis(self):
        """Return the index of the channel axis of a rank-4 tensor."""
        return 1 if self.data_format == 'NCHW' else 3

    def _reduce_axes(self):
        """Return the rank-4 axes reduced over, i.e. all but the channel."""
        return [axis for axis in range(4) if axis != self._channel_axis()]

    def _to_4d(self, x):
        """Expand a rank-1 / rank-2 tensor to rank 4.

        Returns:
            `(x_4d, orig_shape)` where `orig_shape` is the shape to restore
            afterwards, or `None` when `x` already had rank 4.
        """
        shape = x.get_shape().as_list()
        if len(shape) == 4:
            return x, None
        # tf.reshape does not accept None; -1 lets it infer an unknown batch.
        batch_dim = -1 if shape[0] is None else shape[0]
        if len(shape) == 2:
            if self.data_format == 'NCHW':
                # Spatial dims must be 1 (not 0) to keep every element.
                x = tf.reshape(x, [batch_dim, shape[1], 1, 1])
            else:
                x = tf.reshape(x, [batch_dim, 1, 1, shape[1]])
        elif len(shape) == 1:
            x = tf.reshape(x, [batch_dim, 1, 1, 1])
        else:
            assert False, shape
        return x, [-1 if dim is None else dim for dim in shape]

    def _normalize(self, x, mean, inv_std):
        """Apply `gamma * (x - mean) * inv_std + beta` to a rank-4 tensor.

        `gamma` and `beta` are fetched with `tf.get_variable`, so they are
        created or reused according to the enclosing variable scope.
        """
        shape = x.get_shape().as_list()
        assert len(shape) == 4
        channel_axis = self._channel_axis()
        param_shape = [1, 1, 1, 1]
        param_shape[channel_axis] = shape[channel_axis]
        gamma = tf.get_variable("gamma", param_shape,
                                initializer=tf.constant_initializer(1.))
        beta = tf.get_variable("beta", param_shape,
                               initializer=tf.constant_initializer(0.))
        coeff = gamma * inv_std
        return (x * coeff) + (beta - mean * coeff)
# Graph for the second implementation: the reference batch goes through the
# network first (creating the variables and the reference statistics), then
# the real inputs go through the same layers with the variables reused.
inputs = tf.placeholder(shape=[32, 1, 256, 256], dtype=tf.float32)
# NOTE(review): created with default arguments, so this variable is trainable;
# confirm whether the reference batch is meant to be updated by the optimizer.
reference_batch = tf.get_variable('reference_batch',
                                  initializer=reference_array)
L = []
vbn = {}
# Reference pass.
with tf.variable_scope('convnet'):
    L.append(tf.contrib.layers.conv2d(reference_batch, [...],
                                      scope='Layer0'))
    vbn['Norm0'] = Vbn_double(L[-1], scope='Norm0')
    L.append(vbn['Norm0'].reference_output)
    L.append(tf.nn.relu(L[-1], name='Activ0'))
    L.append(tf.contrib.layers.conv2d(L[-1], [...],
                                      scope='Layer1'))
    vbn['Norm1'] = Vbn_double(L[-1], scope='Norm1')
    L.append(vbn['Norm1'].reference_output)
    L.append(tf.nn.relu(L[-1], name='Activ1'))
    L.append(tf.contrib.layers.conv2d(L[-1], [...],
                                      scope='Layer2'))
    vbn['Norm2'] = Vbn_double(L[-1], scope='Norm2')
    L.append(vbn['Norm2'].reference_output)
    L.append(tf.nn.relu(L[-1], name='Activ2'))
# Input pass: same layers, variables reused, normalized with the statistics
# captured during the reference pass.
with tf.variable_scope('convnet', reuse=True):
    L.append(tf.contrib.layers.conv2d(inputs, [...],
                                      scope='Layer0'))
    L.append(vbn['Norm0'](L[-1]))
    L.append(tf.nn.relu(L[-1], name='Activ0'))
    L.append(tf.contrib.layers.conv2d(L[-1], [...],
                                      scope='Layer1'))
    L.append(vbn['Norm1'](L[-1]))
    L.append(tf.nn.relu(L[-1], name='Activ1'))
    L.append(tf.contrib.layers.conv2d(L[-1], [...],
                                      scope='Layer2'))
    L.append(vbn['Norm2'](L[-1]))
    L.append(tf.nn.relu(L[-1], name='Activ2'))
# The head is built outside the `reuse=True` scope: the fully connected layer
# has to create new variables, which a reusing scope would not allow.
L.append(tf.reduce_mean(L[-1], axis=[2, 3]))
L.append(tf.contrib.layers.fully_connected(L[-1], num_outputs=2))
# loss accuracy and optimizer
Here only training converges (with a slightly different curve than the first implementation), while the validation loss increases and the validation accuracy stays at random-guess level.
As a matter of detail, I'm using a GPU and TensorFlow 1.2.1 with XLA enabled. Any clue about what I'm doing wrong?
Edit:
So I tried to compare the outputs of both models and also to look at the gradients (using compute_gradients). To avoid sharing weights (and therefore gradients), I built the models in two different scopes and separately loaded the same weights (from a previously trained model) into both models.
I get the same outputs if I just use:
# Fetch the outputs of both models in a single run call.
sess.run([model.outputs, model2.outputs])
but if I also look at the gradients (the first element of each tuple returned by Optimizer.compute_gradients(loss)) at the same time, using:
# Fetch both models' outputs together with `grads` in the same run call.
sess.run([model.outputs, model2.outputs, grads])
suddenly the model outputs are different... How can the model outputs change just by looking at the gradients, without using apply_gradients? Also, it doesn't seem to have changed the weights, because if I run:
# Evaluate the gradients on their own first, then fetch both models' outputs
# in a separate run call.
sess.run(grads)
sess.run([model.outputs, model2.outputs])
the model outputs are still the same...
OK, it seems that XLA is buggy here, as I get consistent results after disabling XLA. It seems that XLA can't handle something in the 2nd implementation...
I will raise an issue on the repository about this later; 'compute_gradients' modifying the outputs is particularly disturbing...