I'm training an autoencoder to detect anomalies among pictures based on the decoder reconstruction error. I have tried different kinds of image preprocessing, NN architectures, losses, activation functions, image normalisations, augmentations, etc. The optimal model seems a bit unintuitive to me because it uses ReLU on the last layer and MSE loss. I'd expect a sigmoid on the last (decoder) layer and binary cross-entropy to win the competition. Can you advise whether my solution is alright, or how to adjust it to follow the standard approach for this kind of task? (I'm worried that I'm making some basic errors.)
# Loading the dataset
import os
import numpy as np
from tensorflow.keras.preprocessing import image

def load_and_preprocess_image(img_path, target_size=(256, 256)):
    img = image.load_img(img_path, target_size=target_size)  # , color_mode='grayscale'
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = img_array / 255.0  # Scale pixel values to [0, 1]
    return img_array

image_directory = 'images'
image_paths = [os.path.join(image_directory, img) for img in os.listdir(image_directory) if img in task_images0.StoragePath.to_list()]
img_size = 256
images = np.vstack([load_and_preprocess_image(img_path, target_size=(img_size, img_size)) for img_path in image_paths])
# Model architecture
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
# Define the level of L2 regularization
l2_reg = l2(0.01)
img_height, img_width = images[0].shape[:2]
channels = 3
input_img = Input(shape=(img_height, img_width, channels))
# Encoder
x = Conv2D(128, (3, 3), activation='relu', padding='same', kernel_regularizer=l2_reg)(input_img)
x = MaxPooling2D((2, 2), padding='same')(x)
x = Dropout(0.1)(x) # Dropout layer
x = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_regularizer=l2_reg)(x)
encoded = MaxPooling2D((2, 2), padding='same')(x)
# Decoder
x = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_regularizer=l2_reg)(encoded)
x = UpSampling2D((2, 2))(x)
x = Dropout(0.1)(x) # Dropout layer
x = Conv2D(128, (3, 3), activation='relu', padding='same', kernel_regularizer=l2_reg)(x)
x = UpSampling2D((2, 2))(x)
decoded = Conv2D(3, (3, 3), activation='relu', padding='same')(x)  # 3 output channels to match the RGB input
# Model training.
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.backend import clear_session
clear_session()
# Build and compile the model
autoencoder = Model(input_img, decoded)
autoencoder.compile(optimizer=Adam(learning_rate=0.005), loss='mse')
# Callback for early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='min', restore_best_weights=True)
# Callback to reduce learning rate
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=0, mode='min')
# Callback to save the model with the lowest validation loss
model_checkpoint = ModelCheckpoint('best_outlier_model_waug_RGB256_b64.h5', monitor='val_loss', mode='min', save_best_only=True, verbose=0)
X = images.astype(np.float16)  # To save memory
# Split the dataset into a training and a test set
X_train, X_test = train_test_split(X, test_size=0.1, random_state=42)
del X
# Data Augmentation
from PIL import Image
import random
def random_stretch(image, target_size=(256, 256)):
    img = Image.fromarray((image * 255).astype('uint8'))  # Convert from array to PIL Image
    original_size = img.size  # (width, height)
    stretch_factor = 1.1  # Define stretch factor
    axis = random.choice(['width', 'height'])
    if axis == 'width':
        new_size = (int(original_size[0] * stretch_factor), original_size[1])
    else:
        new_size = (original_size[0], int(original_size[1] * stretch_factor))
    stretched_img = img.resize(new_size, Image.Resampling.LANCZOS)
    resized_img = stretched_img.resize(target_size, Image.Resampling.LANCZOS)
    return np.array(resized_img) / 255.0  # Convert back to array and scale to [0, 1]
# Define separate ImageDataGenerators for training and test sets
#datagen_params = dict(zca_whitening=True,)
datagen_params = dict()
train_datagen = ImageDataGenerator(
    **datagen_params,
    # No rescale here: the images are already scaled to [0, 1] in load_and_preprocess_image,
    # so rescaling again would shrink the inputs while the targets stay in [0, 1]
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest',
    preprocessing_function=lambda x: random_stretch(x, target_size=(img_size, img_size))  # Apply random stretching and resize
)
test_datagen = ImageDataGenerator(**datagen_params)
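As a quick sanity check (an illustrative snippet, not part of the pipeline), I can pull one batch from the generator and confirm that the augmented inputs and the reconstruction targets both stay in [0, 1]:
# Inspect one augmented batch: inputs and targets should both lie in [0, 1]
batch_x, batch_y = next(train_datagen.flow(X_train, X_train, batch_size=4))
print(batch_x.min(), batch_x.max(), batch_y.min(), batch_y.max())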
autoencoder.fit(train_datagen.flow(X_train, X_train, batch_size=64),
                epochs=100,
                shuffle=True,
                validation_data=test_datagen.flow(X_test, X_test, batch_size=64),
                callbacks=[early_stopping, reduce_lr, model_checkpoint])
Then I make predictions on an external sample:
reconstructed_images = autoencoder.predict(images_to_predict)
errors = np.mean(np.abs(images_to_predict - reconstructed_images), axis=(1, 2, 3))
threshold = np.percentile(errors, 90) # Set threshold as the 90th percentile of error
anomalies = errors > threshold
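Since the model is trained with MSE, one option I considered (a minimal sketch, not what I currently run) is to score reconstructions with squared error instead of absolute error, so the anomaly score matches the training objective:
# Per-image mean squared reconstruction error, matching the MSE training loss
mse_errors = np.mean(np.square(images_to_predict - reconstructed_images), axis=(1, 2, 3))
mse_threshold = np.percentile(mse_errors, 90)
mse_anomalies = mse_errors > mse_threshold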
The model works pretty well. When I switch to binary cross-entropy and a sigmoid activation on the last layer, it performs poorly. I'm sorry for asking such a broad question, but do you have any hints as to why my intuition is wrong?
I was expecting better performance with binary cross-entropy loss and a sigmoid activation on the last layer.
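For reference, this is roughly what the sigmoid/BCE variant I compared against looks like. Only the output layer and the compile call change; x here is the output of the last UpSampling2D layer in the decoder above, and the names decoded_sigmoid and autoencoder_bce are just illustrative:
# Same encoder/decoder as above, but with a sigmoid output and binary cross-entropy loss
decoded_sigmoid = Conv2D(3, (3, 3), activation='sigmoid', padding='same')(x)
autoencoder_bce = Model(input_img, decoded_sigmoid)
autoencoder_bce.compile(optimizer=Adam(learning_rate=0.005), loss='binary_crossentropy')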