I have the following setup:
- Windows 11 Home
- GPU: NVIDIA GeForce RTX 4090 Laptop
- CUDA v11.2
- cuDNN 8.6
- TensorFlow 2.10.0
I have a custom training loop with a custom generator, as follows:
@tf.function
def loss(model, x, y, loss_fn, training):
    """Run a forward pass through ``model`` and score it with ``loss_fn``.

    Args:
        model: Callable invoked as ``model(x, training=training)``.
        x: Inputs handed to ``model`` unchanged.
        y: Targets, passed as the first argument of ``loss_fn``.
        loss_fn: Callable invoked as ``loss_fn(y, predictions)``.
        training: Forwarded to ``model`` as its ``training`` keyword.

    Returns:
        A ``(loss_value, predictions)`` tuple.
    """
    predictions = model(x, training=training)
    return loss_fn(y, predictions), predictions
@tf.function
def grad(model, inputs, targets, loss_fn):
    """Compute the loss, predictions and gradients for one batch.

    The forward pass is delegated to ``loss`` with ``training=True`` and is
    recorded on a ``tf.GradientTape`` so the loss can be differentiated with
    respect to ``model.trainable_variables``.

    Args:
        model: Model whose ``trainable_variables`` are differentiated.
        inputs: Batch inputs forwarded to ``loss``.
        targets: Batch targets forwarded to ``loss``.
        loss_fn: Loss callable forwarded to ``loss``.

    Returns:
        A ``(loss_value, predictions, gradients)`` tuple, where ``gradients``
        is the result of ``tape.gradient`` for ``model.trainable_variables``.
    """
    with tf.GradientTape() as tape:
        batch_loss, predictions = loss(
            model, inputs, targets, loss_fn, training=True
        )
    # The tape only needs to be open while the forward pass is recorded.
    gradients = tape.gradient(batch_loss, model.trainable_variables)
    return batch_loss, predictions, gradients
def train(model, train_gen, val_gen, n_epochs, class_balancing=True):
    """Train ``model`` with a hand-written loop, logging and checkpointing.

    Each epoch iterates over ``train_gen``, applies Adam updates using the
    gradients from ``grad``, evaluates once on the full validation set
    returned by ``val_gen.get_full_xy()``, writes ``loss``/``acc`` scalars to
    two ``tf.summary`` writers, and saves the weights whenever the mean
    training loss of the epoch is less than or equal to the best seen so far.

    Args:
        model: Model to optimise; must expose ``trainable_variables`` and
            ``save_weights``.
        train_gen: Iterable of ``(x, y)`` batches that also provides
            ``__len__``, ``get_full_xy()`` and ``on_epoch_end()``.
        val_gen: Object providing ``get_full_xy()`` for validation data.
        n_epochs: Number of epochs to run.
        class_balancing: When true, ``alpha`` for the focal loss is derived
            from the ratio of 0-labels to 1-labels in the full training
            targets; otherwise ``alpha`` is ``None``.

    Returns:
        ``(model, None, checkpoint_path)``. The per-epoch metric lists built
        inside the loop are not returned.
    """
    model_name = datetime.now().strftime('%Y%m%d-%H%M%S')
    optimizer = tf.keras.optimizers.Adam()

    # Class weighting: alpha = 1 - 1 / (1 + #zeros / #ones).
    alpha = None
    if class_balancing:
        _, all_targets = train_gen.get_full_xy()
        n_zeros = np.sum(all_targets == 0)
        n_ones = np.sum(all_targets == 1)
        ratio_zero_to_ones = n_zeros / n_ones
        alpha = 1 - 1 / (1 + ratio_zero_to_ones)

    loss_fn = tf.keras.losses.BinaryFocalCrossentropy(
        apply_class_balancing=class_balancing,
        alpha=alpha,
        gamma=0,
    )

    # Log directories and the checkpoint all live under the project root and
    # are keyed by the timestamp-based run name.
    project_abs_path = get_project_parent_path()
    log_root = f'{project_abs_path}/logs/gradient_tape/{model_name}'
    train_summary_writer = tf.summary.create_file_writer(f'{log_root}/train')
    val_summary_writer = tf.summary.create_file_writer(f'{log_root}/val')
    checkpoint_path = f'{project_abs_path}/training_ckpts/model_{model_name}.ckpt'

    # Per-epoch history; collected here but not part of the return value.
    train_losses = []
    train_accs = []
    val_losses = []
    val_accs = []

    best_loss = np.inf
    best_acc = 0
    num_batches = len(train_gen)

    for epoch in tqdm(range(n_epochs)):
        # --- training pass -------------------------------------------------
        loss_total = 0
        acc_total = 0
        for batch_x, batch_y in train_gen:
            batch_loss, batch_pred, grads = grad(model, batch_x, batch_y, loss_fn)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            loss_total += batch_loss
            # Predictions are thresholded at 0.5 to obtain class labels.
            acc_total += accuracy_score(batch_y, batch_pred > 0.5)
        epoch_loss = loss_total / num_batches
        epoch_acc = acc_total / num_batches

        # --- validation pass (whole validation set in one call) -------------
        val_x, val_y = val_gen.get_full_xy()
        val_loss, val_pred = loss(model, val_x, val_y, loss_fn, False)
        val_loss = val_loss.numpy()
        val_acc = accuracy_score(val_y, val_pred > 0.5)

        train_losses.append(epoch_loss)
        train_accs.append(epoch_acc)
        val_losses.append(val_loss)
        val_accs.append(val_acc)

        # Checkpointing is driven by the training loss, not the validation loss.
        if epoch_loss <= best_loss:
            model.save_weights(checkpoint_path)
            best_loss = epoch_loss
        if epoch_acc >= best_acc:
            best_acc = epoch_acc

        # Same two scalars for both writers: training first, then validation.
        for writer, loss_metric, acc_metric in (
            (train_summary_writer, epoch_loss, epoch_acc),
            (val_summary_writer, val_loss, val_acc),
        ):
            with writer.as_default():
                tf.summary.scalar('loss', loss_metric, step=epoch)
                tf.summary.scalar('acc', acc_metric, step=epoch)

        train_gen.on_epoch_end()

    return model, None, checkpoint_path
When I train, I get this warning printed:
Begin Training
0%| | 0/3000 [00:00<?, ?it/s]2023-10-31 16:28:44.879587: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8600
2023-10-31 16:28:45.533824: I tensorflow/stream_executor/gpu/asm_compiler.cc:189] Using ptxas.exe
2023-10-31 16:28:45.535161: I tensorflow/stream_executor/gpu/asm_compiler.cc:262] ptx written to: C:\Users\***\AppData\Local\Temp\/tempfile-RADAR_2-5174-14576-60903f8e4673d
2023-10-31 16:28:45.535362: I tensorflow/stream_executor/gpu/asm_compiler.cc:291] ptxas.exe C:\Users\***\AppData\Local\Temp\/tempfile-RADAR_2-5174-14576-60903f8e4673d -o C:\Users\***\AppData\Local\Temp\/tempfile-RADAR_2-5174-14576-60903f8e46c5f -arch=sm_89 --warn-on-spills -v
2023-10-31 16:28:45.594853: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] INTERNAL: ptxas exited with non-zero error code -1, output:
Relying on driver to perform ptx compilation.
Modify $PATH to customize ptxas location.
This message will be only logged once.
2023-10-31 16:28:45.604158: I tensorflow/stream_executor/cuda/cuda_blas.cc:1614] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
I think I have the right paths in my environment variables.
It seems to be using my GPU during training, so what does this warning mean, and how can I resolve it?
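The warning means that TensorFlow could not run `ptxas` successfully and fell back to letting the GPU driver compile the PTX; training still runs on the GPU. To resolve it, try this:
conda install -c nvidia cuda-nvcc
It will add the required packages to your environment.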