Generate synthetic data for majority and minority classes

61 Views Asked by At

I am working on a classification problem where I try to generate synthetic data for both the Majority and Minority classes,as i want to train my model on synthetic data and test on actual data, i am using below code, but i am unable to generate the synthetic data for majority class, how can I do it?

Code:

# columns for each type
continuous_cols = ['amt', 'BAL', 'MOB', 'TPOP']
boolean_cols = ['EVER_L3M','EVER_L3M', 'EVER_L6M']
target_col = 'Target'  

import pandas as pd
from imblearn.over_sampling import SMOTENC, RandomOverSampler

def oversample_both_classes(df, continuous_cols, boolean_cols, target_col):
    # Separate features and target
    X = df.drop(target_col, axis=1)
    y = df[target_col]

    # Calculate boolean and continuous indices
    boolean_indices = [X.columns.get_loc(col) for col in boolean_cols]
    continuous_indices = [X.columns.get_loc(col) for col in continuous_cols]

    
    categorical_indices = boolean_indices + continuous_indices

    # Oversample minority class with SMOTENC
    smotenc = SMOTENC(random_state=42, categorical_features=boolean_indices)
    X_resampled_minority, y_resampled_minority = smotenc.fit_resample(X, y)

    # Oversample majority class with RandomOverSampler
    ros = RandomOverSampler(random_state=42)
    X_resampled_majority, y_resampled_majority = ros.fit_resample(X, y)

    
    X_resampled = pd.concat([pd.DataFrame(X_resampled_minority, columns=X.columns),
                            pd.DataFrame(X_resampled_majority, columns=X.columns)])
    y_resampled = pd.concat([pd.Series(y_resampled_minority), pd.Series(y_resampled_majority)])

    
    resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
    resampled_df[target_col] = y_resampled

    return resampled_df


oversampled_df_PL = oversample_both_classes(df, continuous_cols, boolean_cols, target_col)
0

There are 0 best solutions below