ValueError: Invalid dtype: object in Tensorflow model.fit


I am training a model on Kaggle; here is the code. I am having problems encoding my data, and after multiple attempts I am stuck here:

https://codefile.io/f/bl9Smlnxtp



# Define transformations
train_datagen = ImageDataGenerator(
    rescale=1./255,  # Normalize pixel values
    rotation_range=20,  # Random rotations
    width_shift_range=0.2,  # Random horizontal shifts
    height_shift_range=0.2,  # Random vertical shifts
    shear_range=0.2,  # Random shearing transformations
    zoom_range=0.2,  # Random zoom
    horizontal_flip=True,  # Random horizontal flipping
    fill_mode='nearest'  # How to fill in new pixels after transformations
)

# Define datasets based on gender and view position
datasets = {
    'male_pa': df[(df['Patient Gender'] == 'M') & (df['View Position'] == 'PA')],
    'male_ap': df[(df['Patient Gender'] == 'M') & (df['View Position'] == 'AP')],
    'female_pa': df[(df['Patient Gender'] == 'F') & (df['View Position'] == 'PA')],
    'female_ap': df[(df['Patient Gender'] == 'F') & (df['View Position'] == 'AP')],
}

# Function to process labels
def process_labels(label):
    labels = label.split('|')
    encoded_labels = np.zeros(num_classes, dtype=int)  # Create a zero array
    for idx, disease in enumerate(all_diseases):
        if disease in labels:
            encoded_labels[idx] = 1
    return encoded_labels
# Get all unique diseases
all_diseases = set('|'.join(df['Finding Labels']).split('|'))
num_classes = len(all_diseases)  # Number of classes

# Create and process generators for each dataset
train_datasets = {}
validation_datasets = {}
for name, subset_df in datasets.items():


    # Determine all possible image folders based on the data
    image_folders = [os.path.join(data_dir, f"images_{i:03d}", "images")
                     for i in range(1, 13)]
    print(image_folders)

    def get_image_path(image_index):
        for folder in image_folders:
            full_path = os.path.join(folder, image_index)
            if os.path.exists(full_path):
                return full_path
        # Debugging print for missing images
        return None

    # Construct the 'image_path' column, searching for files across folders
    subset_df = subset_df.copy()  # Make explicit copies to ensure modification
    subset_df.loc[:, 'image_path'] = subset_df['Image Index'].apply(get_image_path)

    # Filter out missing images (if needed)
    subset_df = subset_df[subset_df['image_path'].notnull()]

    # Split data into training and test sets
    train_df, test_df = train_test_split(subset_df, test_size=0.2, random_state=42)

    # Training dataset generator
    train_dataset_generator = train_datagen.flow_from_dataframe(
        dataframe=train_df,
        directory=None,
        target_size=(224, 224),
        batch_size=32,
        class_mode='raw',
        x_col="image_path",
        y_col="Finding Labels",
        y_col_preprocessor=process_labels,
        shuffle=False,
        interpolation="nearest",
        validate_filenames=True,
        dtype='int32'  # <-- Set the data type for the labels
    )

    train_datasets[name] = train_dataset_generator
    print("hi")
    # Test dataset generator
    test_datagen = ImageDataGenerator(rescale=1./255)  # No augmentation for test
    test_dataset_generator = test_datagen.flow_from_dataframe(
        dataframe=test_df,
        directory=None,
        target_size=(224, 224),
        batch_size=32,
        class_mode='raw',
        x_col="image_path",
        y_col="Finding Labels",
        y_col_preprocessor=process_labels,
        shuffle=False,
        interpolation="nearest",
        validate_filenames=True,
        dtype='int32'  # <-- Set the data type for the labels
    )

    validation_datasets[name] = test_dataset_generator

# Print the number of classes
print("Number of classes:", num_classes)

for dataset_name, train_dataset in train_datasets.items():

    print(f"Training model for dataset: {dataset_name}")

    # Load the ResNet50 model without the top classification layer
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

    # Freeze the base model's layers
    for layer in base_model.layers:
        layer.trainable = False

    # Add new classification layers on top of the base model
    x = base_model.output
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Dense(1024, activation='relu')(x)
    predictions = tf.keras.layers.Dense(num_classes, activation='sigmoid')(x)

    # Create the final model
    model = tf.keras.Model(inputs=base_model.input, outputs=predictions)

    # Compile the model
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # Train the model
    epochs = 10
    model.fit(train_dataset,
              epochs=epochs,
              verbose=1,
              validation_data=train_dataset,
              validation_freq=1)  # Validate after each epoch

    # Save the trained model
    model.save(f'chest_xray_model_{dataset_name}.h5')

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[91], line 28
     26 # Train the model
     27 epochs = 10
---> 28 model.fit(train_dataset,
     29           epochs=epochs,
     30           verbose=1,
     31           validation_data=train_dataset,
     32           validation_freq=1)  # Validate after each epoch
     34 # Save the trained model
     35 model.save(f'chest_xray_model_{dataset_name}.h5')

File /opt/conda/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py:123, in filter_traceback.<locals>.error_handler(*args, **kwargs)
    120     filtered_tb = _process_traceback_frames(e.__traceback__)
    121     # To get the full stack trace, call:
    122     # `keras.config.disable_traceback_filtering()`
--> 123     raise e.with_traceback(filtered_tb) from None
    124 finally:
    125     del filtered_tb

File /opt/conda/lib/python3.10/site-packages/tree/__init__.py:435, in map_structure(func, *structures, **kwargs)
    432 for other in structures[1:]:
    433   assert_same_structure(structures[0], other, check_types=check_types)
    434 return unflatten_as(structures[0],
--> 435                     [func(*args) for args in zip(*map(flatten, structures))])

File /opt/conda/lib/python3.10/site-packages/tree/__init__.py:435, in <listcomp>(.0)
    432 for other in structures[1:]:
    433   assert_same_structure(structures[0], other, check_types=check_types)
    434 return unflatten_as(structures[0],
--> 435                     [func(*args) for args in zip(*map(flatten, structures))])

ValueError: Invalid dtype: object

I tried encoding in different places, from encoding right when importing the CSV, to while creating the datagen, to right before fitting the model, but I am still not able to get it to work.

1 Answer

Answer by Maksym Stetsenko:

"ValueError: Invalid dtype: object" suggest that there are arrays (perhaps, numpy arrays) that contain object type data. To verify that, it is better to try passing to fit separately features and labels dataset for each name. While doing so, dataset can be converted to tensors first with use of standard TF functions (use tf.constant or create dataset as tf.data.Dataset.from_tensor_slices). If successfully converted, problem should disappear.