Hi stack overflow community,
I am fairly new to tensorflow and programming, hence there may be obvious errors I just overlook.
My dev environment consists of vs code running in a tensorflow docker container + venv (tf 2.5.3, python 3.8.10) running on a windows machine.
As I have read in countless posts, the error I encounter most likely stems from a disconnect in the TensorFlow graph. However, I can't pinpoint where it happens, or whether I am overlooking something really basic in my code.
UPDATE
There was a simple error in my code as pointed out by @xdurch0, where I didn't define the loss function inside the tape scope. However there was another problem:
There is a bug in the tf.concat method (bug report on github) when scalars are concatenated, which led to a cryptic division by zero error. This can be circumvented by expanding the scalar to a single entry list using e.g. tf.expand_dims.
Sorry for the wall of text; a detailed description follows:
Description of what I try to do:
I first train a model on MNIST, whose hidden layers I would like to examine with a method that relies on the deep kernel I try to learn later.
# this is the code from https://github.com/keras-team/keras/blob/master/examples/mnist_cnn.py
from __future__ import print_function
from tarfile import SYMTYPE
import tensorflow.keras as keras
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
import tensorflow.compat.v1.keras.backend as K
import tensorflow as tf
# Training hyper-parameters. batch_size must be defined because model.fit()
# below reads it; leaving it commented out raises a NameError.
batch_size = 128
num_classes = 10
epochs = 12
# input image dimensions
img_rows, img_cols = 28, 28
# the data, split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Add the single channel axis where the backend's image data format expects it.
if K.image_data_format() == 'channels_first':
    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)
# Convert to float32 and divide by 255 before feeding the network.
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')
# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
# Layer indices as used later via model.layers[...]:
#   0 Conv2D, 1 Conv2D, 2 MaxPooling2D, 3 Dropout, 4 Flatten,
#   5 Dense, 6 Dropout, 7 Dense (softmax)
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=input_shape))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))
# evaluate() returns [loss, accuracy] because 'accuracy' is the only metric.
score = model.evaluate(x_test, y_test, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
model.summary()
After the model is successfully trained (this works without a problem), I would like to train the deep kernel on it to give me a measure of similarity between probability distributions on the basis of single examples:
# semantic-aware kernel using the max-pooling features (model.layers[3] is the Dropout layer directly after max pooling)
# parameters w=(eps0, h1, h2), h1: bandwidth_sem_layer, h2: bandwidth
# @tf.function
def gaussian_kernel(x, y, h):
    '''
    Evaluate a Gaussian-shaped kernel element-wise between x and y.

    Each element is 1/sqrt(2*pi) * exp(-|x - y|^2 / (2*h + 1e-10)); the
    small constant keeps the denominator away from zero when h is zero.

    Returns:
        A pair (total, values): the sum over all elements of the kernel
        tensor, and the element-wise kernel tensor itself.
    '''
    normaliser = 1 / tf.sqrt(2 * tf.constant(np.pi))
    squared_distance = tf.abs(x - y) ** 2
    bandwidth_term = 2 * h + 1e-10
    kernel_values = normaliser * tf.exp(-squared_distance / bandwidth_term)
    return tf.math.reduce_sum(kernel_values), kernel_values
# @tf.function
def s_f(x, y, h, sem_layer=sem_layer, model=model):
    '''
    Gaussian kernel between the intermediate activations of x and y.

    A sub-model mapping the inputs of `model` to the output of
    `model.layers[sem_layer]` is built on every call, applied to x and y,
    and the two activations are passed to `gaussian_kernel` with bandwidth h.

    NOTE: the defaults for `sem_layer` and `model` are bound when this
    `def` statement executes, so both names must already exist then.

    Returns:
        The (sum, element-wise tensor) pair produced by `gaussian_kernel`.
    '''
    feature_extractor = keras.Model(model.inputs, model.layers[sem_layer].output)
    features_x = feature_extractor(x)
    features_y = feature_extractor(y)
    return gaussian_kernel(features_x, features_y, h)
# @tf.function
def k_w(x, y, h1, h2, eps0, sem_layer=sem_layer, model=model):
    '''
    Deep kernel value for the pair (x, y).

    Computes ((1 - eps0) * S + eps0) * Q, where S is the summed Gaussian
    kernel on the `sem_layer` activations (bandwidth h1, via `s_f`) and Q is
    the summed Gaussian kernel on the raw inputs (bandwidth h2).
    '''
    semantic_sum, _ = s_f(x, y, h1, sem_layer=sem_layer, model=model)
    input_sum, _ = gaussian_kernel(x, y, h2)
    mixed = (1 - eps0) * semantic_sum + eps0
    return mixed * input_sum
# @tf.function
def SAMMD2(x_data, y_data, h1, h2, eps0, sem_layer=sem_layer, model=model):
    '''
    Compute the SAMMD^2 U-statistic between two equally sized sample sets.

    For every ordered pair of distinct indices (i, j) the entry

        H[i, j] = k_w(x_i, x_j) + k_w(y_i, y_j)
                  - k_w(y_i, x_j) - k_w(x_i, y_j)

    is evaluated; the diagonal of H is zero. The cross terms are subtracted,
    as required for an MMD^2 estimate (identical sample sets give 0).

    Args:
        x_data, y_data: sample collections indexed along axis 0; their
            shapes must match.
        h1: bandwidth of the kernel on the `sem_layer` activations.
        h2: bandwidth of the kernel on the raw inputs.
        eps0: mixing weight passed through to `k_w`.
        sem_layer: index into `model.layers` used by `k_w`.
        model: the trained Keras model used by `k_w`.

    Returns:
        (M, H, n): M = sum(H) / (n * (n - 1)), H the (n, n) tensor of
        pairwise terms, and n the number of samples per set.

    Raises:
        ValueError: if the shapes differ or fewer than two samples are given
            (the normalisation n * (n - 1) would be zero).
    '''
    if x_data.shape != y_data.shape:
        raise ValueError(
            f"x_data and y_data must have the same shape, "
            f"got {x_data.shape} and {y_data.shape}")
    n = x_data.shape[0]
    if n is None or n < 2:
        raise ValueError("SAMMD2 needs at least two samples per set")

    # Add the batch axis once per sample instead of once per pair.
    xs = [tf.expand_dims(x_data[i], axis=0) for i in range(n)]
    ys = [tf.expand_dims(y_data[i], axis=0) for i in range(n)]

    def kernel(a, b):
        return k_w(a, b, h1, h2, eps0, sem_layer=sem_layer, model=model)

    zero = tf.constant(0, tf.float32)
    rows = []
    for i in range(n):
        row = []
        for j in range(n):
            if i == j:
                row.append(zero)
            else:
                row.append(kernel(xs[i], xs[j])
                           + kernel(ys[i], ys[j])
                           - kernel(ys[i], xs[j])
                           - kernel(xs[i], ys[j]))
        # tf.stack joins the scalar entries along a new axis; tf.concat
        # cannot be used here because it rejects rank-0 tensors.
        rows.append(tf.stack(row))
    H = tf.stack(rows)
    M = (1 / (n * (n - 1))) * tf.math.reduce_sum(H)
    return M, H, n
# @tf.function
def sigma2(H, n, reg_lambda=1e-8):
    '''
    Regularised variance term computed from the pairwise matrix H.

    Returns 4/n^3 * sum_i (sum_j H[i, j])^2 - 4/n^4 * (sum_ij H[i, j])^2
    + reg_lambda.
    '''
    row_totals = tf.math.reduce_sum(H, axis=1)
    sum_of_squared_rows = tf.math.reduce_sum(row_totals ** 2)
    squared_grand_total = tf.math.reduce_sum(H) ** 2
    first_term = (4 / n**3) * sum_of_squared_rows
    second_term = (4 / n**4) * squared_grand_total
    return first_term - second_term + tf.constant(reg_lambda)
epochs = 2
sem_layer = 3
# NOTE(review): x_data and y_data are the same ten test images, so both
# sample sets are identical -- presumably a placeholder; confirm.
x_data = x_test[0:10,...]
y_data = x_test[0:10,...]
# Optimisation setup
optimizer = keras.optimizers.Adam(learning_rate=0.0002)
h1      = tf.Variable(0.5)
h2      = tf.Variable(0.5)
eps0    = tf.Variable(0.01)
n = 2 # x_data.shape[0]  (not read by the loop below)
mod = keras.Model(model.inputs, model.layers[sem_layer].output)  # not read by the loop below
reg_lambda=1e-8
kernel_params = [h1, h2, eps0]
# Training loop: maximise J = M / sqrt(V) by minimising -J.
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))

    with tf.GradientTape() as tape:
        tape.watch(kernel_params)
        M, H, m = SAMMD2(x_data, y_data, h1, h2, eps0, sem_layer=sem_layer, model=model)
        V = sigma2(H, m, reg_lambda=reg_lambda)
        J = M / (tf.sqrt(V) + 1e-10)
        # The loss must be formed inside the tape context. Negating J after
        # the `with` block creates a tensor the tape never recorded, so
        # tape.gradient() returns None for every variable and
        # apply_gradients() raises "No gradients provided for any variable".
        loss = -J

    grads = tape.gradient(loss, kernel_params)

    optimizer.apply_gradients(zip(grads, kernel_params))
    print(f'epoch {epoch} J value = {J}')
When I run the code, an error is raised in the first epoch when apply_gradients is called:
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tf/SHAP/MNIST_LRP.py in <cell line: 2>()
     293     # cost_value = cost(x_data, y_data) #, h1, h2, eps0, sem_layer=sem_layer)
     297 grads = tape.gradient(-J, [h1, h2, eps0])
---> 299 optimizer.apply_gradients(zip(grads, [h1, h2, eps0]))
     301 print(f'epoch {epoch} J value = {J}')
File /tf/.env/shapvenv/lib/python3.8/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:630, in OptimizerV2.apply_gradients(self, grads_and_vars, name, experimental_aggregate_gradients)
    589 def apply_gradients(self,
    590                     grads_and_vars,
    591                     name=None,
    592                     experimental_aggregate_gradients=True):
    593   """Apply gradients to variables.
    594 
    595   This is the second part of `minimize()`. It returns an `Operation` that
   (...)
    628     RuntimeError: If called in a cross-replica context.
    629   """
--> 630   grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars)
    631   var_list = [v for (_, v) in grads_and_vars]
    633   with ops.name_scope_v2(self._name):
    634     # Create iteration if necessary.
File /tf/.env/shapvenv/lib/python3.8/site-packages/tensorflow/python/keras/optimizer_v2/utils.py:75, in filter_empty_gradients(grads_and_vars)
     72 filtered = tuple(filtered)
     74 if not filtered:
---> 75   raise ValueError("No gradients provided for any variable: %s." %
     76                    ([v.name for _, v in grads_and_vars],))
     77 if vars_with_empty_grads:
     78   logging.warning(
     79       ("Gradients do not exist for variables %s when minimizing the loss."),
     80       ([v.name for v in vars_with_empty_grads]))
ValueError: No gradients provided for any variable: ['Variable:0', 'Variable:0', 'Variable:0'].
I don't understand how to fix this empty gradient and would be very appreciative of your help.