I want to train a 3D model with TensorFlow. I'm using a cluster node with 8 GPUs (GeForce GTX 1080 Ti, with 1045 MB of memory each). My inputs are 3D CT images. When I reduce the image dimensions to 64×64×64, the model trains without problems. However, the following error appears when the input dimensions are 128×128×128:
2023-10-18 07:08:18.796407: W tensorflow/tsl/framework/bfc_allocator.cc:497] *****************************************___________________________________________________________
Traceback (most recent call last):
File "/my_codes/my_model.py", line 419, in <module>
model_unet3D.fit(x_train, y_train,
File "/.local/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/.local/lib/python3.11/site-packages/tensorflow/python/framework/constant_op.py", line 98, in convert_to_eager_tensor
return ops.EagerTensor(value, ctx.device_name, dtype)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
tensorflow.python.framework.errors_impl.InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.
I use `tf.distribute` to run on multiple GPUs:
# TensorFlow distribution strategy (for running on multiple GPUs).
mirrored_strategy = tf.distribute.MirroredStrategy()
input_shape = (n_slices, width, height, n_channels)

# Build and compile the network inside the strategy scope.
with mirrored_strategy.scope():
    model_unet3D = unet3D(input_shape)
    model_unet3D.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
        # Alternative loss that was tried: dice_plus_focal_loss
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=['acc', mean_iou, dice_coef],
    )

model_unet3D.summary()

# Callbacks: per-epoch CSV log plus a checkpoint that keeps only the best model.
checkpoint_path = "/model/my_model.ckpt"
my_callbacks = [
    tf.keras.callbacks.CSVLogger(
        'train_log_3D.csv',
        separator=",",
        append=False,
    ),
    # Other ModelCheckpoint options considered:
    # save_freq=5, save_weights_only=True, verbose=1
    tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path,
        save_best_only=True,
    ),
    # tf.keras.callbacks.TensorBoard() is currently disabled.
]

# The complete NumPy arrays are handed directly to fit().
model_unet3D.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=20,
    batch_size=1,
    verbose=1,
    callbacks=my_callbacks,
)
How can I resolve the problem?
I have tried increasing the number of nodes and GPUs, but the same error appeared. I have also changed the batch size, but the memory limitation still remains even for a batch size of 1.
Update:
I have modified the code by using a custom data generator as follows:
class DataGenerator(Sequence):
    """Serve ``(data, labels)`` mini-batches one at a time.

    Samples are selected through ``self.indexes``, a permutation of
    ``range(len(data))``. It is shuffled on construction and after every
    epoch when ``shuffle`` is true, so the shuffled order is what the
    batches actually follow.

    Args:
        data: Indexable collection of samples (``data[i]`` is one sample).
        labels: Indexable collection of labels, aligned with ``data``.
        batch_size: Number of samples per batch; the last batch may be
            smaller.
        shuffle: Whether to randomise the sample order.
    """

    def __init__(self, data, labels, batch_size, shuffle=True):
        # Let the base class run any initialisation of its own.
        super().__init__()
        self.data = data
        self.labels = labels
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indexes = np.arange(len(self.data))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __len__(self):
        """Return the number of batches per epoch (last one may be partial)."""
        return int(np.ceil(len(self.data) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number ``index`` as ``(batch_data, batch_labels)``.

        Raises:
            IndexError: If ``index`` is outside ``range(len(self))``.
        """
        if not 0 <= index < len(self):
            raise IndexError(
                f"batch index {index} out of range for {len(self)} batches"
            )
        start_idx = index * self.batch_size
        # Go through self.indexes so that shuffling takes effect; slicing
        # data/labels directly would ignore the shuffled order.
        batch_indexes = self.indexes[start_idx:start_idx + self.batch_size]
        # Load and preprocess your 3D image data here if necessary
        # e.g., using NumPy, PIL, or other image processing libraries
        batch_data = np.stack([self.data[i] for i in batch_indexes])
        batch_labels = np.stack([self.labels[i] for i in batch_indexes])
        return batch_data, batch_labels

    def on_epoch_end(self):
        """Reshuffle the sample order between epochs when shuffling is on."""
        if self.shuffle:
            np.random.shuffle(self.indexes)
batch_size = 1  # Adjust the batch size as needed

# The training generator uses the default shuffle=True; validation keeps a
# fixed sample order.
train_gen = DataGenerator(x_train, y_train, batch_size)
val_gen = DataGenerator(x_test, y_test, batch_size, shuffle=False)

# Train the compiled network, which is named model_unet3D in this script
# (no bare `model` is defined in the code shown).
model_unet3D.fit(
    train_gen,
    validation_data=val_gen,
    epochs=20,
    verbose=1,
)
Now, I have encountered the following error:
Traceback (most recent call last):
File "/my_codes/model.py", line 468, in <module>
model_unet3D.fit(train_gen,
File "/.local/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/.local/lib/python3.11/site-packages/tensorflow/python/eager/execute.py", line 53, in quick_execute
tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
tensorflow.python.framework.errors_impl.UnknownError: Graph execution error