TensorFlow Conv2DTranspose "Type mismatch: actual int32 vs. expect string" error on GPU


I am trying to train a variational autoencoder in TensorFlow, and on GPU it raises a weird error that I cannot reproduce on CPU with the same code.

Relevant parts of my model -

    @tf.function
    def sample(self, apply_clip=True):
        # the encoder outputs z_mean and z_log_var concatenated, so the latent size is half of the last output dim
        eps = tf.random.normal(shape=(100, int(self.encoder.output_shape[-1] / 2)))
        return self.decode(eps, apply_clip=apply_clip)

    def encode(self, x):
        z_mean, z_log_var = tf.split(self.encoder(x), num_or_size_splits=2, axis=1)
        return z_mean, z_log_var

    def reparameterize(self, z_mean, z_log_var):
        eps = tf.random.normal(shape=z_mean.shape)
        return eps * tf.exp(z_log_var * 0.5) + z_mean

    def decode(self, z, apply_clip=True):
        yp = self.decoder(z)
        if apply_clip:
            yp = tf.clip_by_value(yp, 0, 1)
        return yp

    def train(
        self,
        Dtr,
        Dte=None,
        epochs=10,
        plot_samples=True,
        xs=None,
        ys=None,
        y_ax_red=(1, 2),
    ):
        if plot_samples:
            self.plot_samples(xs, ys)
        for epoch in range(1, epochs + 1):
            start_time = time.time()
            for d in tqdm(Dtr, desc="training_dataset"):
                if self.guide is not None:
                    self.train_step(d[0], d[1], d[2], y_ax_red=y_ax_red)
                else:
                    self.train_step(d[0], d[1], y_ax_red=y_ax_red)
            if Dte is not None:
                test_loss = 0
                num_test_batches = 0
                for d in tqdm(Dte, desc="test_dataset"):
                    z_mean, z_log_var = self.encode(d[0])
                    z = self.reparameterize(z_mean, z_log_var)
                    reconstruction = self.decode(z)
                    reconstruction_loss = tf.reduce_mean(
                        tf.reduce_sum(
                            tf.keras.losses.MSE(d[1], reconstruction), axis=y_ax_red
                        )
                    )
                    kl_loss = -0.5 * (
                        1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
                    )
                    kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
                    total_loss = reconstruction_loss + kl_loss
                    test_loss += total_loss.numpy()
                    num_test_batches += 1
                avg_test_loss = test_loss / num_test_batches
            else:
                avg_test_loss = None
            end_time = time.time()

            display.clear_output(wait=False)
            print(f"Epoch: {epoch}, Loss_tr: {self.metrics}, Loss_te: {avg_test_loss}")
            self.reset_metrics()

            if plot_samples:
                self.plot_samples(xs, ys)

The encoder and decoder are regular Sequential models that I define externally; the decoder, for example -

    decoder = tf.keras.Sequential(
        [
            tf.keras.layers.InputLayer(input_shape=(128,)),
            tf.keras.layers.Dense(units=28 * 28 * 128, activation="relu"),
            tf.keras.layers.Reshape(target_shape=(28, 28, 128)),
            tf.keras.layers.Conv2DTranspose(
                filters=64, kernel_size=3, strides=2, padding="same",
                activation="relu"),
            tf.keras.layers.Conv2DTranspose(
                filters=32, kernel_size=3, strides=2, padding="same",
                activation="relu"),
            tf.keras.layers.Conv2DTranspose(
                filters=3, kernel_size=3, strides=1, padding="same",
                activation="sigmoid"),
        ]
    )
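
As a sanity check on the shape arithmetic, running the decoder eagerly on random latents gives the expected output size (this is only a quick check, not part of the training code):

    z = tf.random.normal((2, 128))
    print(decoder(z).shape)  # (2, 112, 112, 3): 28 -> 56 -> 112 after the two stride-2 layers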

When I try to train the model on GPU, the following error shows up -

    InvalidArgumentError: Exception encountered when calling layer 'conv2d_transpose_1' (type Conv2DTranspose).

    {{function_node __wrapped__StridedSlice_device_/job:localhost/replica:0/task:0/device:GPU:0}} Type mismatch: actual int32 vs. expect string [Op:StridedSlice] name: sequential_4/conv2d_transpose_1/strided_slice/

    Call arguments received by layer 'conv2d_transpose_1' (type Conv2DTranspose):
      • inputs=tf.Tensor(shape=(16, 56, 56, 64), dtype=float32)

The weird thing is that the model trains for an arbitrary number of batches and then this error suddenly shows up; even the particular layer named in the error is sometimes different. No error is produced on the CPU whatsoever.
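
In case it helps narrow things down, this is the kind of stripped-down loop I can run to see whether the decoder alone ever triggers the error on GPU; the Adam optimizer and the dummy loss here are only placeholders for isolation, not my actual train_step:

    opt = tf.keras.optimizers.Adam()
    with tf.device("/GPU:0"):
        for step in range(1000):
            z = tf.random.normal((16, 128))
            with tf.GradientTape() as tape:
                y = decoder(z, training=True)
                loss = tf.reduce_mean(tf.square(y))  # dummy loss, only used to get gradients
            grads = tape.gradient(loss, decoder.trainable_variables)
            opt.apply_gradients(zip(grads, decoder.trainable_variables))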

My data pipeline (use_eegs, use_imgs, use_clsl and use_classes are lists defined earlier) -

    def construct_dataset(
        mode="eic",
        batch_size=16,
        train_size=0.8,
        buffer_size=1000,
        d255=True,
    ):
        def p_load(p):
            p = p.numpy().decode()
            return np.load(p).astype(np.float32)

        def to_cat(c):
            cat_lab = np.zeros(len(use_classes), dtype=np.float32)
            cat_lab[use_classes.index(c)] = 1
            return cat_lab

        elements = []
        for e in mode:
            if e == "e":
                d = tf.data.Dataset.from_tensor_slices(use_eegs)
                d = d.map(
                    lambda x: tf.py_function(p_load, inp=[x], Tout=tf.float32),
                    num_parallel_calls=tf.data.AUTOTUNE,
                    deterministic=True,
                )
                elements.append(d)

            elif e == "i":
                d = tf.data.Dataset.from_tensor_slices(use_imgs)
                d = d.map(
                    lambda x: tf.py_function(p_load, inp=[x], Tout=tf.float32),
                    num_parallel_calls=tf.data.AUTOTUNE,
                    deterministic=True,
                )
                if d255:
                    d = d.map(
                        lambda x: tf.py_function(
                            lambda h: h / 255, inp=[x], Tout=tf.float32
                        ),
                        num_parallel_calls=tf.data.AUTOTUNE,
                        deterministic=True,
                    )
                elements.append(d)

            elif e == "c":
                d = tf.data.Dataset.from_tensor_slices(use_clsl)
                d = d.map(
                    lambda x: tf.py_function(to_cat, inp=[x], Tout=tf.float32),
                    num_parallel_calls=tf.data.AUTOTUNE,
                    deterministic=True,
                )
                elements.append(d)

        if len(elements) > 1:
            D = tf.data.Dataset.zip(tuple(elements))
        else:
            D = elements[0]
        D.shuffle(buffer_size)
        D = D.ignore_errors()
        Dtr = D.take(int(len(use_eegs) * train_size)).batch(batch_size, True)
        Dtr = Dtr.prefetch(tf.data.AUTOTUNE)
        Dte = D.skip(int(len(use_eegs) * train_size)).batch(batch_size, True)
        Dte = Dte.prefetch(tf.data.AUTOTUNE)
        return Dtr, Dte
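
For what it's worth, the dtypes that the pipeline advertises can be inspected like this (a debugging snippet using the default arguments, not something the model itself relies on):

    Dtr, Dte = construct_dataset(mode="eic", batch_size=16)
    print(Dtr.element_spec)  # should show float32 TensorSpecs for the (eeg, image, class) tuple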

I tried using the same dataset to train a classification model instead, ignoring its second element (the image) and using the class label it produces, and that works on GPU, so I don't think the problem is in the dataset.
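
To double-check that, one can also iterate the pipeline on its own and assert that every component of every batch really comes out as float32 (again, a standalone debugging loop, separate from training):

    for batch in Dtr.take(50):
        for t in batch:
            assert t.dtype == tf.float32, t.dtype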
