I need some help in solving a class imbalance problem while trying to perform binary classification on ECG and EEG data

28 Views Asked by At

I have a dataset that contains ECG and EEG values for 23 patients, with 18 videos each. Each video is linked to a target emotion that I am trying to predict. The dataset defines 8 target emotions, but I have reclassified them into two classes: 0 (not fear) and 1 (fear). This leads to a class imbalance with a ratio of 1:7 (fear : not fear). Because of this imbalance, I am getting misleadingly high accuracies in the 90% range. I would really appreciate some help in fixing this.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, accuracy_score
from sklearn.model_selection import KFold
import seaborn as sns
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

def evaluate_cv_model(model, data, target, kFolds, scoring='accuracy'):
    """Return the mean cross-validated score of ``model``.

    Args:
        model: Estimator handed to ``cross_val_score``.
        data: Feature matrix.
        target: Labels aligned with ``data``.
        kFolds: Value forwarded as ``cv`` to ``cross_val_score``.
        scoring: Metric name forwarded to ``cross_val_score``. Defaults to
            ``'accuracy'`` so existing callers are unaffected. Accuracy can
            look high on imbalanced labels simply by favouring the majority
            class, so callers may pass another scorer name instead.

    Returns:
        The mean of the per-fold scores.
    """
    fold_scores = cross_val_score(model, data, target, cv=kFolds, scoring=scoring)
    return fold_scores.mean()

def plot_confusionMatrix(clf, y_test, X_test):
    """Show a row-normalised confusion-matrix heatmap and return a text report.

    Args:
        clf: Fitted classifier; only its ``predict`` method is used.
        y_test: True labels for ``X_test``.
        X_test: Samples to predict on.

    Returns:
        The output of ``classification_report`` for the predictions.
    """
    predicted = clf.predict(X_test)

    # Divide each row by its own total so every row sums to 1.
    counts = confusion_matrix(y_test, predicted).astype('float')
    row_totals = counts.sum(axis=1, keepdims=True)
    normalised = counts / row_totals

    sns.heatmap(normalised, annot=True, fmt='.2f', cmap="Blues")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    report = classification_report(y_test, predicted)
    plt.show()

    return report

def KNN(X_train, y_train, X_test, y_test, num_neighbors):
    """Fit a k-nearest-neighbours classifier and score it on both splits.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
        num_neighbors: Value passed as ``n_neighbors`` to the classifier.

    Returns:
        A tuple ``(test_accuracy, train_accuracy, prediction, model)`` where
        ``prediction`` holds the predicted labels for ``X_test`` and
        ``model`` is the fitted classifier.
    """
    classifier = KNeighborsClassifier(n_neighbors=num_neighbors)
    classifier.fit(X_train, y_train)

    # Scores come from the estimator's own ``score`` method.
    held_out_accuracy = classifier.score(X_test, y_test)
    fitted_accuracy = classifier.score(X_train, y_train)

    held_out_prediction = classifier.predict(X_test)

    return held_out_accuracy, fitted_accuracy, held_out_prediction, classifier

def SVM(X_train, y_train, X_test, y_test, kernel, class_weight=None):
    """Fit a support-vector classifier and score it on both splits.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
        kernel: Kernel name passed to ``svm.SVC``.
        class_weight: Forwarded to ``svm.SVC``. The default ``None`` keeps
            the previous behaviour (all classes weighted equally); pass
            ``'balanced'`` or a ``{label: weight}`` dict to penalise
            mistakes on a minority class more heavily.

    Returns:
        A tuple ``(test_accuracy, train_accuracy, prediction, model)`` where
        ``prediction`` holds the predicted labels for ``X_test`` and
        ``model`` is the fitted classifier.
    """
    # decision_function_shape is kept as in the original call.
    # NOTE(review): it presumably has no effect on a two-class target;
    # confirm against the SVC documentation before removing it.
    model = svm.SVC(
        kernel=kernel,
        C=1,
        decision_function_shape='ovo',
        class_weight=class_weight,
    )

    model.fit(X_train, y_train)

    test_accuracy = model.score(X_test, y_test)
    train_accuracy = model.score(X_train, y_train)

    prediction = model.predict(X_test)

    return test_accuracy, train_accuracy, prediction, model
def Logistic_Regression(X_train, y_train, X_test, y_test, max_iter=1000,
                        class_weight=None):
    """Fit a logistic-regression classifier and score it on both splits.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
        max_iter: Iteration cap passed to ``LogisticRegression``. Defaults
            to 1000, the value that was previously hard-coded.
        class_weight: Forwarded to ``LogisticRegression``. The default
            ``None`` keeps the previous behaviour; pass ``'balanced'`` or a
            ``{label: weight}`` dict to up-weight a minority class.

    Returns:
        A tuple ``(test_accuracy, train_accuracy, prediction, model)`` where
        ``prediction`` holds the predicted labels for ``X_test`` and
        ``model`` is the fitted classifier.
    """
    # NOTE(review): `multi_class` is reported as deprecated in recent
    # scikit-learn releases; it is kept here so the fitted model does not
    # change. Confirm against the installed version before removing it.
    log_reg = LogisticRegression(
        multi_class='multinomial',
        solver='lbfgs',
        max_iter=max_iter,
        class_weight=class_weight,
    )

    log_reg.fit(X_train, y_train)

    test_accuracy = log_reg.score(X_test, y_test)
    train_accuracy = log_reg.score(X_train, y_train)

    prediction = log_reg.predict(X_test)

    return test_accuracy, train_accuracy, prediction, log_reg

# Load the ECG table, drop the index and video-name columns, and separate
# the label column from the features.
ECG_data = pd.read_csv('/kaggle/input/ecgdata/binary_ECG.csv')
ECG_data.drop(['Unnamed: 0','video_name'], axis=1, inplace=True)
y_ECG = ECG_data.target
X_ECG = ECG_data.drop('target' , axis = 1)

kf = KFold(n_splits=8, random_state=42 , shuffle = True)

# Split BEFORE resampling: the train/test variables must exist before SMOTE
# can use them, and the held-out test set must stay free of synthetic samples.
X_train_ECG, X_test_ECG, y_train_ECG, y_test_ECG = train_test_split(X_ECG, y_ECG, test_size = 0.2, random_state = 42)
y_test_ECG = np.array(y_test_ECG)

# Apply SMOTE to the training split only to handle the class imbalance.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_ECG, y_train_ECG)

def Evaluate(y_test, prediction, average='weighted'):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    Args:
        y_test: True labels.
        prediction: Predicted labels aligned with ``y_test``.
        average: Averaging mode forwarded to ``precision_score``,
            ``recall_score`` and ``f1_score``. Defaults to ``'weighted'``,
            the value that was previously hard-coded. Weighted averages
            are weighted by class support, so on imbalanced labels callers
            may prefer ``'binary'`` or ``'macro'`` to see minority-class
            performance.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``.
    """
    accuracy = accuracy_score(y_test, prediction)
    precision = precision_score(y_test, prediction, average=average)
    recall = recall_score(y_test, prediction, average=average)
    f1 = f1_score(y_test, prediction, average=average)
    return accuracy, precision, recall, f1

I have tried training with SVM, KNN and logistic regression. While trying to apply SMOTE, I am getting a convergence error from the logistic regression that simply does not go away, even if I increase max_iter to the maximum permissible value in Python.

0

There are 0 best solutions below