I have trained a deep neural network for regression with 2 input neurons, 1 output neuron, and several hidden layers, as shown below (TensorFlow 2):
import numpy as np
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import losses
import tensorflow as tf
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
# Build a "synthetic" dataset of 2000 samples.
# x1 is an evenly spaced grid on [0, 6*pi]; x2 is exactly 1.5 * x1, so every
# sample lies on a single straight line in the (x1, x2) plane.
x1 = np.linspace(0.0, 6.0 * np.pi, num=2000)
x2 = 1.5 * x1
# Regression target evaluated on that grid.
y = np.sin(x1) + np.cos(x2)
# One row per sample; the target is the last column.
data = pd.DataFrame({'x1': x1, 'x2': x2, 'y': y})
# Hold out 20% of the rows as a test set (fixed seed for reproducibility),
# then adapt the normalization layer on the training inputs only, i.e. on
# every column of the training frame except the last one (the target).
train_df, test_df = train_test_split(data, random_state=0, test_size=0.2)
train_inputs = train_df.iloc[:, :-1]
normalizer = layers.Normalization(axis=-1)
normalizer.adapt(np.array(train_inputs))
# Definition of the DNN structure
def build_and_compile_model(norm):
    """Assemble and compile the fully connected regression network.

    The stack is: ``norm``, seven LeakyReLU dense layers of widths
    64, 64, 32, 32, 16, 16 and 8, and a single linear output unit.

    Args:
        norm: Layer placed first in the sequential stack (the caller passes
            an already adapted normalization layer).

    Returns:
        The compiled ``keras.Sequential`` model, using mean absolute error
        as loss and Adam(0.001) as optimizer.
    """
    # NOTE(review): `input_dim=2` is kept from the original call; this Dense
    # layer is not the first layer of the stack, so the argument is
    # presumably redundant -- confirm before removing.
    hidden = [layers.Dense(64, input_dim=2, activation='LeakyReLU')]
    hidden += [
        layers.Dense(width, activation='LeakyReLU')
        for width in (64, 32, 32, 16, 16, 8)
    ]
    output = layers.Dense(1, activation='linear')

    network = keras.Sequential([norm, *hidden, output])
    network.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss='mean_absolute_error',
    )
    return network
model = build_and_compile_model(normalizer)
# Train of the DNN
%%time
history = model.fit(
train_df.iloc[:, :-1],
train_df.iloc[:, -1],
validation_split=0.2,
verbose=2, epochs=100)
Now, if y is the prediction of the network, I want to compute the partial derivatives dy/dx1 and dy/dx2. To achieve this, I tried:
# Differentiate the network output with respect to its inputs.
# x holds the two input columns (x1, x2) of the full dataset.
x = tf.constant(data.iloc[:, :-1].values)
# A non-persistent tape is sufficient: gradient() is called exactly once, and
# a non-persistent tape releases what it recorded right after that call.
with tf.GradientTape() as t:
    # x is a constant tensor, not a tf.Variable, so it must be watched explicitly.
    t.watch(x)
    # NOTE: this rebinds `y` (previously the NumPy target array) to the
    # model output tensor; later cells rely on `y.numpy()`.
    y = model(x)
# For a non-scalar target, gradient() differentiates the sum of its elements.
# The layers of this model act on each input row independently, so row i of
# the result is [dy_i/dx1, dy_i/dx2] of the *network* at sample i.
dy_dx = t.gradient(y, x)
dy_dx.numpy()
If I plot y as a function of x1 (or of x2) and compare it with the analytical result from the definition given above, I get good agreement:
# Overlay the network prediction and the analytical target against x1.
# NOTE(review): `plt` is not among the imports shown; presumably
# `import matplotlib.pyplot as plt` was executed in another cell -- confirm.
fig, ax = plt.subplots(figsize=(5, 3), dpi=190)
ax.plot(x1, model.predict(x), label='model prediction')
ax.plot(x1, np.sin(x1) + np.cos(x2), label='analytical result')
ax.set_xlabel('$x_1$')
ax.legend()
plt.show()
In contrast, if I plot the first column of the array dy_dx and compare it with the analytical partial derivative (dy/dx1 = cos(x1)), they do not match (the situation is similar for the other partial derivative):
# Compare the first gradient column (derivative with respect to x1) with the
# analytical partial derivative cos(x1) of the target function.
fig, ax = plt.subplots(figsize=(5, 3), dpi=190)
ax.plot(x1, dy_dx[:, 0], label='autodiff result')
ax.plot(x1, np.cos(x1), label='analytical result')
ax.set_xlabel('$x_1$')
ax.legend()
plt.show()
If I compare this gradient with finite differences of the prediction, I get:
# Compare the first gradient column with a finite-difference quotient of the
# model output taken between consecutive samples.
plt.figure(figsize = (5, 3), dpi = 190)
plt.plot(x1, dy_dx[:, 0], label = 'autodiff result')
# Divide by the actual spacing of x1 (6*pi/1999, about 0.00943) instead of a
# hard-coded 0.01, which mis-scaled the curve.
# Consecutive samples advance x1 and x2 together (x2 = 1.5 * x1), so this
# quotient is the rate of change along the data line, not a derivative taken
# with x2 held fixed.
plt.plot(x1[0:-1], np.diff(y.numpy()[:, 0]) / np.diff(x1), label = 'finite differences')
plt.xlabel('$x_1$')
plt.legend()
plt.show()
Since the autodiff result and the finite-difference result are equal up to a scaling constant, this suggests that autodiff is not computing the partial derivative dy/dx1 but only the total derivative, plotted against one of the variables.
So my question remains: how can I compute the partial derivatives of the network output with respect to each input?