I am training a model on Kaggle; here is the code. I am having a problem encoding my data, and after multiple attempts I am stuck here:
https://codefile.io/f/bl9Smlnxtp
# Imports used below; df (the metadata CSV loaded as a DataFrame) and data_dir
# are assumed to be defined earlier in the notebook
import os
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Define transformations
train_datagen = ImageDataGenerator(
    rescale=1./255,  # Normalize pixel values
    rotation_range=20,  # Random rotations
    width_shift_range=0.2,  # Random horizontal shifts
    height_shift_range=0.2,  # Random vertical shifts
    shear_range=0.2,  # Random shearing transformations
    zoom_range=0.2,  # Random zoom
    horizontal_flip=True,  # Random horizontal flipping
    fill_mode='nearest'  # How to fill in new pixels after transformations
)
# Define datasets based on gender and view position
datasets = {
    'male_pa': df[(df['Patient Gender'] == 'M') & (df['View Position'] == 'PA')],
    'male_ap': df[(df['Patient Gender'] == 'M') & (df['View Position'] == 'AP')],
    'female_pa': df[(df['Patient Gender'] == 'F') & (df['View Position'] == 'PA')],
    'female_ap': df[(df['Patient Gender'] == 'F') & (df['View Position'] == 'AP')],
}
# Function to encode a pipe-separated label string into a multi-hot vector
def process_labels(label):
    labels = label.split('|')
    encoded_labels = np.zeros(num_classes, dtype=int)  # Create a zero array
    for idx, disease in enumerate(all_diseases):
        if disease in labels:
            encoded_labels[idx] = 1
    return encoded_labels
# Get all unique diseases (sorted so the multi-hot encoding has a stable order)
all_diseases = sorted(set('|'.join(df['Finding Labels']).split('|')))
num_classes = len(all_diseases)  # Number of classes
# Create and process generators for each dataset
train_datasets = {}
validation_datasets = {}
for name, subset_df in datasets.items():
    # Determine all possible image folders based on the data
    image_folders = [os.path.join(data_dir, f"images_{i:03d}", "images")
                     for i in range(1, 13)]
    print(image_folders)

    def get_image_path(image_index):
        for folder in image_folders:
            full_path = os.path.join(folder, image_index)
            if os.path.exists(full_path):
                return full_path
        return None  # Image not found in any folder

    # Construct the 'image_path' column, searching for files across folders
    subset_df = subset_df.copy()  # Make an explicit copy before modifying
    subset_df.loc[:, 'image_path'] = subset_df['Image Index'].apply(get_image_path)

    # Filter out missing images (if needed)
    subset_df = subset_df[subset_df['image_path'].notnull()]

    # Split data into training and test sets
    train_df, test_df = train_test_split(subset_df, test_size=0.2, random_state=42)

    # Training dataset generator
    train_dataset_generator = train_datagen.flow_from_dataframe(
        dataframe=train_df,
        directory=None,
        target_size=(224, 224),
        batch_size=32,
        class_mode='raw',
        x_col="image_path",
        y_col="Finding Labels",
        y_col_preprocessor=process_labels,
        shuffle=False,
        interpolation="nearest",
        validate_filenames=True,
        dtype='int32'  # <-- Set the data type for the labels
    )
    train_datasets[name] = train_dataset_generator
    print("hi")

    # Test dataset generator (no augmentation for test data)
    test_datagen = ImageDataGenerator(rescale=1./255)
    test_dataset_generator = test_datagen.flow_from_dataframe(
        dataframe=test_df,
        directory=None,
        target_size=(224, 224),
        batch_size=32,
        class_mode='raw',
        x_col="image_path",
        y_col="Finding Labels",
        y_col_preprocessor=process_labels,
        shuffle=False,
        interpolation="nearest",
        validate_filenames=True,
        dtype='int32'  # <-- Set the data type for the labels
    )
    validation_datasets[name] = test_dataset_generator
# Print the number of classes
print("Number of classes:", num_classes)
for dataset_name, train_dataset in train_datasets.items():
    print(f"Training model for dataset: {dataset_name}")

    # Load the ResNet50 model without the top classification layer
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

    # Freeze the base model's layers
    for layer in base_model.layers:
        layer.trainable = False

    # Add new classification layers on top of the base model
    x = base_model.output
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Dense(1024, activation='relu')(x)
    predictions = tf.keras.layers.Dense(num_classes, activation='sigmoid')(x)

    # Create the final model
    model = tf.keras.Model(inputs=base_model.input, outputs=predictions)

    # Compile the model
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # Train the model
    epochs = 10
    model.fit(train_dataset,
              epochs=epochs,
              verbose=1,
              validation_data=train_dataset,
              validation_freq=1)  # Validate after each epoch

    # Save the trained model
    model.save(f'chest_xray_model_{dataset_name}.h5')
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[91], line 28
26 # Train the model
27 epochs = 10
---> 28 model.fit(train_dataset,
29 epochs=epochs,
30 verbose=1,
31 validation_data=train_dataset,
32 validation_freq=1) # Validate after each epoch
34 # Save the trained model
35 model.save(f'chest_xray_model_{dataset_name}.h5')
File /opt/conda/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py:123, in filter_traceback.<locals>.error_handler(*args, **kwargs)
120 filtered_tb = _process_traceback_frames(e.__traceback__)
121 # To get the full stack trace, call:
122 # `keras.config.disable_traceback_filtering()`
--> 123 raise e.with_traceback(filtered_tb) from None
124 finally:
125 del filtered_tb
File /opt/conda/lib/python3.10/site-packages/tree/__init__.py:435, in map_structure(func, *structures, **kwargs)
432 for other in structures[1:]:
433 assert_same_structure(structures[0], other, check_types=check_types)
434 return unflatten_as(structures[0],
--> 435 [func(*args) for args in zip(*map(flatten, structures))])
File /opt/conda/lib/python3.10/site-packages/tree/__init__.py:435, in <listcomp>(.0)
432 for other in structures[1:]:
433 assert_same_structure(structures[0], other, check_types=check_types)
434 return unflatten_as(structures[0],
--> 435 [func(*args) for args in zip(*map(flatten, structures))])
ValueError: Invalid dtype: object
I tried encoding in different places, from encoding right when importing the CSV, to while creating the data generator, to right before fitting the model, and it still does not work.
"ValueError: Invalid dtype: object" suggest that there are arrays (perhaps, numpy arrays) that contain object type data. To verify that, it is better to try passing to fit separately features and labels dataset for each name. While doing so, dataset can be converted to tensors first with use of standard TF functions (use tf.constant or create dataset as tf.data.Dataset.from_tensor_slices). If successfully converted, problem should disappear.