I am working on a classification problem where I try to generate synthetic data for both the Majority and Minority classes,as i want to train my model on synthetic data and test on actual data, i am using below code, but i am unable to generate the synthetic data for majority class, how can I do it?
Code:
# columns for each type
continuous_cols = ['amt', 'BAL', 'MOB', 'TPOP']
boolean_cols = ['EVER_L3M','EVER_L3M', 'EVER_L6M']
target_col = 'Target'
import pandas as pd
from imblearn.over_sampling import SMOTENC, RandomOverSampler
def oversample_both_classes(df, continuous_cols, boolean_cols, target_col):
# Separate features and target
X = df.drop(target_col, axis=1)
y = df[target_col]
# Calculate boolean and continuous indices
boolean_indices = [X.columns.get_loc(col) for col in boolean_cols]
continuous_indices = [X.columns.get_loc(col) for col in continuous_cols]
categorical_indices = boolean_indices + continuous_indices
# Oversample minority class with SMOTENC
smotenc = SMOTENC(random_state=42, categorical_features=boolean_indices)
X_resampled_minority, y_resampled_minority = smotenc.fit_resample(X, y)
# Oversample majority class with RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_resampled_majority, y_resampled_majority = ros.fit_resample(X, y)
X_resampled = pd.concat([pd.DataFrame(X_resampled_minority, columns=X.columns),
pd.DataFrame(X_resampled_majority, columns=X.columns)])
y_resampled = pd.concat([pd.Series(y_resampled_minority), pd.Series(y_resampled_majority)])
resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
resampled_df[target_col] = y_resampled
return resampled_df
oversampled_df_PL = oversample_both_classes(df, continuous_cols, boolean_cols, target_col)