I am trying to train a variational autoencoder in TensorFlow and I am hitting a strange error on GPU that I cannot reproduce on CPU with the same code.
Relevant parts of my model -
```python
@tf.function
def sample(self, apply_clip=True):
    eps = tf.random.normal(shape=(100, int(self.encoder.output_shape / 2)))
    return self.decode(eps, apply_clip=apply_clip)

def encode(self, x):
    z_mean, z_log_var = tf.split(self.encoder(x), num_or_size_splits=2, axis=1)
    return z_mean, z_log_var

def reparameterize(self, z_mean, z_log_var):
    eps = tf.random.normal(shape=z_mean.shape)
    return eps * tf.exp(z_log_var * 0.5) + z_mean

def decode(self, z, apply_clip=True):
    yp = self.decoder(z)
    if apply_clip:
        yp = tf.clip_by_value(yp, 0, 1)
    return yp

def train(
    self,
    Dtr,
    Dte=None,
    epochs=10,
    plot_samples=True,
    xs=None,
    ys=None,
    y_ax_red=(1, 2),
):
    if plot_samples:
        self.plot_samples(xs, ys)
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        for d in tqdm(Dtr, desc="training_dataset"):
            if self.guide is not None:
                self.train_step(d[0], d[1], d[2], y_ax_red=y_ax_red)
            else:
                self.train_step(d[0], d[1], y_ax_red=y_ax_red)
        if Dte is not None:
            test_loss = 0
            num_test_batches = 0
            for d in tqdm(Dte, desc="test_dataset"):
                z_mean, z_log_var = self.encode(d[0])
                z = self.reparameterize(z_mean, z_log_var)
                reconstruction = self.decode(z)
                reconstruction_loss = tf.reduce_mean(
                    tf.reduce_sum(
                        tf.keras.losses.MSE(d[1], reconstruction), axis=y_ax_red
                    )
                )
                kl_loss = -0.5 * (
                    1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
                )
                kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
                total_loss = reconstruction_loss + kl_loss
                test_loss += total_loss.numpy()
                num_test_batches += 1
            avg_test_loss = test_loss / num_test_batches
        else:
            avg_test_loss = None
        end_time = time.time()
        display.clear_output(wait=False)
        print(f"Epoch: {epoch}, Loss_tr: {self.metrics}, Loss_te: {avg_test_loss}")
        self.reset_metrics()
        if plot_samples:
            self.plot_samples(xs, ys)
```
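For context, `train_step` applies the same reconstruction + KL loss as the test loop above, inside a `GradientTape`. A simplified sketch of it (the guide branch and the metric updates are omitted, and it assumes the model is a `tf.keras.Model` with an optimizer attached):
```python
@tf.function
def train_step(self, x, y, y_ax_red=(1, 2)):
    # Simplified sketch: same losses as in the evaluation loop above.
    with tf.GradientTape() as tape:
        z_mean, z_log_var = self.encode(x)
        z = self.reparameterize(z_mean, z_log_var)
        reconstruction = self.decode(z)
        reconstruction_loss = tf.reduce_mean(
            tf.reduce_sum(tf.keras.losses.MSE(y, reconstruction), axis=y_ax_red)
        )
        kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
        kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
        total_loss = reconstruction_loss + kl_loss
    grads = tape.gradient(total_loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
```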
The encoder and decoder are regular Keras Sequential models that I define externally, e.g. the decoder -
```python
decoder = tf.keras.Sequential(
    [
        tf.keras.layers.InputLayer(input_shape=(128,)),
        tf.keras.layers.Dense(units=28 * 28 * 128, activation="relu"),
        tf.keras.layers.Reshape(target_shape=(28, 28, 128)),
        tf.keras.layers.Conv2DTranspose(
            filters=64, kernel_size=3, strides=2, padding="same",
            activation="relu"),
        tf.keras.layers.Conv2DTranspose(
            filters=32, kernel_size=3, strides=2, padding="same",
            activation="relu"),
        tf.keras.layers.Conv2DTranspose(
            filters=3, kernel_size=3, strides=1, padding="same",
            activation="sigmoid"),
    ]
)
```
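I haven't included the encoder because it never appears in the error; it is just another Sequential whose final Dense layer has 2 × 128 units so that the `tf.split` in `encode` works. A rough sketch of that shape contract (the input shape and hidden layers below are placeholders, not my actual architecture):
```python
# Placeholder sketch only: the real encoder's input shape and layers differ.
latent_dim = 128
encoder = tf.keras.Sequential(
    [
        tf.keras.layers.InputLayer(input_shape=(440, 128)),  # placeholder EEG shape
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(units=512, activation="relu"),
        # Last layer holds z_mean and z_log_var concatenated; split in encode().
        tf.keras.layers.Dense(units=2 * latent_dim),
    ]
)
```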
When I try to train the model on GPU, the following error shows up -
```
InvalidArgumentError: Exception encountered when calling layer 'conv2d_transpose_1' (type Conv2DTranspose).

{{function_node __wrapped__StridedSlice_device_/job:localhost/replica:0/task:0/device:GPU:0}} Type mismatch: actual int32 vs. expect string [Op:StridedSlice] name: sequential_4/conv2d_transpose_1/strided_slice/

Call arguments received by layer 'conv2d_transpose_1' (type Conv2DTranspose):
  • inputs=tf.Tensor(shape=(16, 56, 56, 64), dtype=float32)
```
The weird thing is that the model trains for an arbitrary number of batches before this error suddenly shows up, and even the particular layer named in the error is sometimes different. No error is produced on the CPU whatsoever.
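For reference, `conv2d_transpose_1` is the second transposed convolution in the decoder above, and the `(16, 56, 56, 64)` input reported in the trace is exactly what that layer should receive for a batch of 16 (the Dense/Reshape gives 28×28×128 and the first stride-2 layer doubles it to 56×56), so the tensor shapes themselves look normal. The decoder's shape flow can be checked on its own with something like:
```python
# Random batch of latents through the decoder:
# (16, 128) -> Dense/Reshape (16, 28, 28, 128) -> (16, 56, 56, 64)
# -> (16, 112, 112, 32) -> (16, 112, 112, 3)
z = tf.random.normal((16, 128))
print(decoder(z).shape)  # TensorShape([16, 112, 112, 3])
```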
My data pipeline -
```python
def construct_dataset(mode="eic", batch_size=16, train_size=0.8, buffer_size=1000, d255=True):
    def p_load(p):
        p = p.numpy().decode()
        return np.load(p).astype(np.float32)

    def to_cat(c):
        cat_lab = np.zeros(len(use_classes), dtype=np.float32)
        cat_lab[use_classes.index(c)] = 1
        return cat_lab

    elements = []
    for e in mode:
        if e == "e":
            d = tf.data.Dataset.from_tensor_slices(use_eegs)
            d = d.map(
                lambda x: tf.py_function(p_load, inp=[x], Tout=tf.float32),
                num_parallel_calls=tf.data.AUTOTUNE,
                deterministic=True,
            )
            elements.append(d)
        elif e == "i":
            d = tf.data.Dataset.from_tensor_slices(use_imgs)
            d = d.map(
                lambda x: tf.py_function(p_load, inp=[x], Tout=tf.float32),
                num_parallel_calls=tf.data.AUTOTUNE,
                deterministic=True,
            )
            if d255:
                d = d.map(
                    lambda x: tf.py_function(
                        lambda h: h / 255, inp=[x], Tout=tf.float32
                    ),
                    num_parallel_calls=tf.data.AUTOTUNE,
                    deterministic=True,
                )
            elements.append(d)
        elif e == "c":
            d = tf.data.Dataset.from_tensor_slices(use_clsl)
            d = d.map(
                lambda x: tf.py_function(to_cat, inp=[x], Tout=tf.float32),
                num_parallel_calls=tf.data.AUTOTUNE,
                deterministic=True,
            )
            elements.append(d)
    if len(elements) > 1:
        D = tf.data.Dataset.zip(tuple(elements))
    else:
        D = elements[0]
    D.shuffle(buffer_size)
    D = D.ignore_errors()
    Dtr = D.take(int(len(use_eegs) * train_size)).batch(batch_size, True)
    Dtr = Dtr.prefetch(tf.data.AUTOTUNE)
    Dte = D.skip(int(len(use_eegs) * train_size)).batch(batch_size, True)
    Dte = Dte.prefetch(tf.data.AUTOTUNE)
    return Dtr, Dte
```
I tried training a classification model on the same dataset, ignoring the second element (the image) and using the class label it produces as the target instead, and that works on GPU, so I don't think the problem is in the dataset.
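Roughly, that sanity check looked like the sketch below (simplified; the exact classifier I used was different). The point is just that the same pipeline feeds a model on GPU without hitting the error:
```python
# Simplified version of the classification sanity check: same pipeline,
# but drop the image element and use the class label as the target.
Dtr, Dte = construct_dataset(mode="eic", batch_size=16)
Dtr_c = Dtr.map(lambda e, i, c: (e, c))
Dte_c = Dte.map(lambda e, i, c: (e, c))

clf = tf.keras.Sequential(
    [
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dense(len(use_classes), activation="softmax"),
    ]
)
clf.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
clf.fit(Dtr_c, validation_data=Dte_c, epochs=5)  # trains on GPU without the error
```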