Encountering a ptxas.exe warning while trying to run TensorFlow with GPU

138 Views Asked by At

I have the following:

  • windows 11 Home
  • GPU NVIDIA GeForce RTX 4090 Laptop
  • CUDA v11.2
  • cudnn 8.6
  • tensorflow 2.10.0

I have a custom training loop with a custom generator as follows:

@tf.function
def loss(model, x, y, loss_fn, training):
    """Run a forward pass and score it with *loss_fn*.

    Args:
        model: callable Keras model.
        x: batch of inputs fed to the model.
        y: ground-truth targets for the batch.
        loss_fn: callable taking (y_true, y_pred) and returning a scalar loss.
        training: forwarded to the model call (enables/disables layers such
            as dropout and batch norm updates).

    Returns:
        Tuple of (loss value, model predictions).
    """
    predictions = model(x, training=training)
    return loss_fn(y, predictions), predictions


@tf.function
def grad(model, inputs, targets, loss_fn):
    """Compute the loss, predictions, and gradients for one batch.

    Args:
        model: callable Keras model whose trainable variables are differentiated.
        inputs: batch of model inputs.
        targets: ground-truth targets for the batch.
        loss_fn: callable taking (y_true, y_pred) and returning a scalar loss.

    Returns:
        Tuple of (loss value, predictions, gradients w.r.t.
        model.trainable_variables).
    """
    with tf.GradientTape() as tape:
        loss_value, y_pred = loss(model, inputs, targets, loss_fn, training=True)
    # Fix: call tape.gradient() OUTSIDE the `with` block. Inside the context,
    # the gradient computation itself would be recorded onto the tape,
    # wasting memory and compute (TF docs recommend computing gradients
    # after exiting the context).
    return loss_value, y_pred, tape.gradient(loss_value, model.trainable_variables)


def train(model, train_gen, val_gen, n_epochs, class_balancing=True):
    """Custom training loop with focal loss, TensorBoard logging and checkpointing.

    Args:
        model: callable Keras model to train.
        train_gen: training batch generator; must be iterable over (x, y)
            batches and support len(), get_full_xy() and on_epoch_end().
        val_gen: validation generator; must support get_full_xy() (returns the
            full validation set in one shot).
        n_epochs: number of training epochs.
        class_balancing: when True, derive the focal-loss `alpha` from the
            negative/positive class ratio of the full training set.

    Returns:
        Tuple of (model, None, checkpoint_path) where checkpoint_path points
        at the best-loss weights saved during training.
    """

    # Timestamp used to name this run's logs and checkpoint.
    model_name = datetime.now().strftime('%Y%m%d-%H%M%S')

    optimizer = tf.keras.optimizers.Adam()
    if class_balancing:
        # Weight the positive class by the share of negatives in the data.
        _, full_y = train_gen.get_full_xy()
        ratio_zero_to_ones = np.sum(full_y == 0) / np.sum(full_y == 1)
        alpha = 1 - 1 / (1 + ratio_zero_to_ones)
    else:
        # NOTE(review): alpha=None is still passed to BinaryFocalCrossentropy
        # below; presumably ignored when apply_class_balancing=False — confirm.
        alpha = None

    # gamma=0 reduces focal loss to (optionally alpha-weighted) binary
    # cross-entropy.
    loss_fn = tf.keras.losses.BinaryFocalCrossentropy(apply_class_balancing=class_balancing, alpha=alpha, gamma=0)

    project_abs_path = get_project_parent_path()
    train_log_dir = f'{project_abs_path}/logs/gradient_tape/{model_name}/train'
    val_log_dir = f'{project_abs_path}/logs/gradient_tape/{model_name}/val'

    # Separate TensorBoard writers so train/val curves appear as two runs.
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)
    val_summary_writer = tf.summary.create_file_writer(val_log_dir)
    checkpoint_path = f'{project_abs_path}/training_ckpts/model_{model_name}.ckpt'
    train_loss_results = []
    train_acc_results = []
    val_loss_results = []
    val_acc_results = []

    best_loss = np.inf
    best_acc = 0

    num_batches = len(train_gen)

    for epoch in tqdm(range(n_epochs)):

        # Running sums; divided by num_batches after the loop to get means.
        epoch_loss_avg = 0
        epoch_acc_avg = 0
        for x, y in train_gen:
            loss_val, y_pred, grads = grad(model, x, y, loss_fn)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            epoch_loss_avg += loss_val
            # Threshold probabilities at 0.5 for binary accuracy.
            epoch_acc_avg += accuracy_score(y, y_pred > 0.5)
        epoch_loss_avg /= num_batches
        epoch_acc_avg /= num_batches

        # Evaluate on the entire validation set in a single forward pass.
        x, y = val_gen.get_full_xy()
        loss_val, y_pred = loss(model, x, y, loss_fn, False)
        loss_val = loss_val.numpy()
        acc_val = accuracy_score(y, y_pred > 0.5)

        train_loss_results.append(epoch_loss_avg)
        train_acc_results.append(epoch_acc_avg)
        val_loss_results.append(loss_val)
        val_acc_results.append(acc_val)

        # NOTE(review): checkpointing is keyed on *training* loss, not
        # validation loss — confirm this is intended (validation loss is the
        # more common criterion).
        if epoch_loss_avg <= best_loss:
            model.save_weights(checkpoint_path)
            best_loss = epoch_loss_avg

        if epoch_acc_avg >= best_acc:
            best_acc = epoch_acc_avg

        with train_summary_writer.as_default():
            tf.summary.scalar('loss', epoch_loss_avg, step=epoch)
            tf.summary.scalar('acc', epoch_acc_avg, step=epoch)

        with val_summary_writer.as_default():
            tf.summary.scalar('loss', loss_val, step=epoch)
            tf.summary.scalar('acc', acc_val, step=epoch)
        # Let the generator reshuffle/rebatch between epochs.
        train_gen.on_epoch_end()
    return model, None, checkpoint_path

When I train, I get this warning printed:

Begin Training
  0%|          | 0/3000 [00:00<?, ?it/s]2023-10-31 16:28:44.879587: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8600
2023-10-31 16:28:45.533824: I tensorflow/stream_executor/gpu/asm_compiler.cc:189] Using ptxas.exe
2023-10-31 16:28:45.535161: I tensorflow/stream_executor/gpu/asm_compiler.cc:262] ptx written to: C:\Users\***\AppData\Local\Temp\/tempfile-RADAR_2-5174-14576-60903f8e4673d
2023-10-31 16:28:45.535362: I tensorflow/stream_executor/gpu/asm_compiler.cc:291] ptxas.exe C:\Users\***\AppData\Local\Temp\/tempfile-RADAR_2-5174-14576-60903f8e4673d -o C:\Users\***\AppData\Local\Temp\/tempfile-RADAR_2-5174-14576-60903f8e46c5f -arch=sm_89 --warn-on-spills -v
2023-10-31 16:28:45.594853: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] INTERNAL: ptxas exited with non-zero error code -1, output: 
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.
2023-10-31 16:28:45.604158: I tensorflow/stream_executor/cuda/cuda_blas.cc:1614] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.

I believe the correct paths are set in my environment variables.

img attached

It seems like it is using my GPU during training, so what does this error mean and how can I solve it?

1

There are 1 best solutions below

0
On

Try running this: `conda install -c nvidia cuda-nvcc`.

It will add the required packages to your environment.