I have a tabular dataset (very small for ML modelling) that consists of 19 samples divided into three classes (Class_1 = 4, Class_2 = 10, Class_3 = 5), with 1828 numeric features. I want to perform multi-class classification on the data. To deal with the class imbalance, I used SMOTE to oversample the minority classes up to the size of the majority class, applying it exclusively to the training set. For the classification, I used a simple model, Linear Discriminant Analysis (LDA), and trained it with cross-validation. But I am still unable to build an appropriate model with reliable metric scores.
- For the simpler model, I used Linear Discriminant Analysis and tried adding some noise to the data to avoid overfitting due to the small size of the data. I get very optimistic values (1.00) for all the metrics (precision, recall, F1-score and AUC) after cross-validation of the LDA model. On the test dataset I get similar results. I used the following code:
# Data preparation: encode labels, split, scale (fitted on the training rows
# only) and oversample the training split with SMOTE.

# Features are every column after the first; the first column is the label.
x = df.iloc[:, 1:].values
y_data = df.iloc[:, 0].values

# Encode the class labels as integers.
labelencoder = LabelEncoder()
y_encode = labelencoder.fit_transform(y_data)

# Indicator matrix of the encoded labels; its column count is kept as
# n_classes.
y_binarized = label_binarize(y_encode, classes=np.unique(y_encode))
n_classes = y_binarized.shape[1]

# Split the *raw* features first so the scaler below never sees the held-out
# rows. The partition depends only on the labels and random_state, so the same
# samples land in each half as when splitting already-scaled features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    x, y_encode, test_size=0.5, random_state=0, stratify=y_encode
)

# Fit the min-max scaler on the training rows only and reuse that fitted
# transformation for everything else. Fitting on the full data set would leak
# the held-out minima/maxima into training.
scaler_xlsx = MinMaxScaler(feature_range=(0, 1))
X_train = scaler_xlsx.fit_transform(X_train_raw)
X_test = scaler_xlsx.transform(X_test_raw)
# Full matrix under the train-fitted scaler, kept so the name stays available;
# held-out rows may fall slightly outside [0, 1].
X_scaled = scaler_xlsx.transform(x)

# Size of the rarest class in the training split.
_, train_class_counts = np.unique(y_train, return_counts=True)
min_class_count = int(train_class_counts.min())

# SMOTE interpolates between a sample and its k nearest same-class neighbours,
# so k must stay below the rarest class size; never go below 1.
# NOTE(review): a class with a single training sample has no neighbour at all
# and SMOTE is expected to fail for it - confirm class counts before relying
# on this.
k_neighbors = max(min_class_count - 1, 1)

# Oversample the minority classes of the training split only.
smote = SMOTE(k_neighbors=k_neighbors, random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
#Add noise to the training dataset
def add_gaussian_noise(X, noise_level=0.5, random_state=None):
    """Return *X* with zero-mean Gaussian noise added element-wise.

    Args:
        X: Numeric array-like; its shape determines the shape of the noise.
        noise_level: Standard deviation of the noise.
        random_state: Optional seed (or ``numpy.random.Generator``) used to
            draw the noise reproducibly. When ``None`` (the default) the noise
            comes from NumPy's global random state, so ``np.random.seed``
            keeps controlling it exactly as before.

    Returns:
        ``X + noise``; *X* itself is not modified.
    """
    mean = 0
    # np.shape also works for plain nested lists, not only for ndarrays.
    shape = np.shape(X)
    if random_state is None:
        noise = np.random.normal(mean, noise_level, shape)
    else:
        rng = np.random.default_rng(random_state)
        noise = rng.normal(mean, noise_level, shape)
    return X + noise
# Standard deviation handed to add_gaussian_noise for both matrices.
noise_level = 0.5

# Perturb the SMOTE-resampled training matrix first and the held-out matrix
# second; the two calls are evaluated left to right, each drawing its own
# noise.
X_train_noisy, X_test_noisy = (
    add_gaussian_noise(X_train_res, noise_level=noise_level),
    add_gaussian_noise(X_test, noise_level=noise_level),
)
# Cross-validated ROC/AUC of LDA on the resampled, noisy training data.

# Number of folds: no more than the smallest class size, capped at 5 and
# never below 2.
unique, counts = np.unique(y_train_res, return_counts=True)
min_class_size = int(min(counts))
n_splits = max(min(min_class_size, 5), 2)
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# NOTE(review): the folds are drawn from data that was already oversampled
# with SMOTE, so a synthetic row in a training fold can be an interpolation of
# a row sitting in the validation fold. Scores computed here are therefore
# likely optimistic; resampling inside each fold would avoid that but needs at
# least two samples per class in every training fold - confirm feasibility.

# ROC results keyed by class index plus "micro" and "macro".
fpr = dict()
tpr = dict()
roc_auc = dict()

classes = np.unique(y_encode)
column_of = {label: col for col, label in enumerate(classes)}

# Out-of-fold class probabilities: every row is scored exactly once, by the
# fold model that was not trained on it. Pooling them gives one ROC curve per
# class over all rows instead of keeping only the last fold's curve.
y_score = np.zeros((len(y_train_res), len(classes)))
for train_idx, test_idx in cv.split(X_train_noisy, y_train_res):
    fold_lda = LDA()
    fold_lda.fit(X_train_noisy[train_idx], y_train_res[train_idx])
    fold_score = fold_lda.predict_proba(X_train_noisy[test_idx])
    # predict_proba columns follow fold_lda.classes_; store each one under the
    # column of the matching overall class.
    for col, label in enumerate(fold_lda.classes_):
        y_score[test_idx, column_of[label]] = fold_score[:, col]

# One-vs-rest ROC curve and AUC for each class on the pooled predictions.
y_test_bin = label_binarize(y_train_res, classes=classes)
n_classes = y_test_bin.shape[1]
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Micro-average: treat every (row, class) pair as one binary decision.
fpr["micro"], tpr["micro"], _ = roc_curve(y_test_bin.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Macro-average: interpolate each class curve onto the union of all false
# positive rates, then average the true positive rates.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Refit on the complete training matrix so that `lda` is a model trained on
# every training row rather than whichever subset the last fold happened to
# use.
lda = LDA()
lda.fit(X_train_noisy, y_train_res)
# Held-out evaluation: per-class, micro- and macro-averaged ROC/AUC of `lda`
# on X_test.

# Class probabilities for the held-out rows and the matching indicator matrix.
y_test_pred = lda.predict_proba(X_test)
y_test_binarized = label_binarize(y_test, classes=np.unique(y_encode))
n_classes_1 = y_test_binarized.shape[1]

fpr_test_1, tpr_test_1, roc_auc_test_1 = {}, {}, {}

# One-vs-rest curve and area for each class column.
for class_idx in range(n_classes_1):
    class_fpr, class_tpr, _ = roc_curve(
        y_test_binarized[:, class_idx], y_test_pred[:, class_idx]
    )
    fpr_test_1[class_idx] = class_fpr
    tpr_test_1[class_idx] = class_tpr
    roc_auc_test_1[class_idx] = auc(class_fpr, class_tpr)

# Micro-average over the flattened indicator and probability matrices.
micro_fpr, micro_tpr, _ = roc_curve(
    y_test_binarized.ravel(), y_test_pred.ravel()
)
fpr_test_1["micro"] = micro_fpr
tpr_test_1["micro"] = micro_tpr
roc_auc_test_1["micro"] = auc(micro_fpr, micro_tpr)

# Macro-average: common grid of false positive rates, per-class curves
# interpolated onto it, summed in class order and divided by the class count.
all_fpr_1 = np.unique(
    np.concatenate(tuple(fpr_test_1[c] for c in range(n_classes_1)))
)
mean_tpr_1 = np.zeros_like(all_fpr_1)
for class_idx in range(n_classes_1):
    mean_tpr_1 = mean_tpr_1 + np.interp(
        all_fpr_1, fpr_test_1[class_idx], tpr_test_1[class_idx]
    )
mean_tpr_1 = mean_tpr_1 / n_classes_1

fpr_test_1["macro"] = all_fpr_1
tpr_test_1["macro"] = mean_tpr_1
roc_auc_test_1["macro"] = auc(all_fpr_1, mean_tpr_1)