Random Oversampling with Stratified KFold - Value Error

205 Views Asked by At

I have a data frame that looks like this. The data set is standardized using Standard scaler and dummy variables added for all categorical variables. It is now broken into train and test sets.

            amt    gender   city_pop    birth_year  distance        
153118  -0.786537   0.0    -0.318571    0.913779    -0.400876   
153226  -0.488455   0.0    -0.322397    0.741579     1.384297   
153228  0.437970    0.0    -0.329167    1.774776    -0.658839   
153303  -0.877627   0.0    -0.329656    1.258177    -1.100713   
153313  0.462143    1.0    -0.313817    1.372977     0.038791   

I am now trying to create a few models with this data (such as Logistic Regression, Decision Tree and Random Forest), using RandomOverSampler and StratifiedKFold cross-validation. This is because the minority class of my target variable makes up only 0.3% of the data.

I have already created models with the unbalanced data and it worked fine. But when I am trying to do the sampling, I'm getting the below error. Also included my code here.

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import RandomOverSampler

# Stratified 5-fold splitter (no random_state set).
skf = StratifiedKFold(n_splits=5, random_state=None)

# For each fold (numbered from 1), build the train/test subsets and
# oversample the training part.
for fold, (train_index, test_index) in enumerate(skf.split(X,y), 1):
    # NOTE(review): skf.split() yields positional row numbers, whereas
    # reindex() looks rows up by index *label*. The frame shown in the
    # question carries labels such as 153118, so any requested label that
    # is not present comes back as an all-NaN row -- the likely source of
    # the "Input contains NaN" ValueError reported below. Positional
    # selection (.iloc[train_index]) is probably what is intended; confirm
    # against the real index of X and y.
    X_train = X.reindex(index = train_index)
    y_train = y.reindex(index = train_index) 
    X_test = X.reindex(index = test_index)
    y_test = y.reindex(index = test_index)
    # Float strategy: resample the minority class up to a 0.5
    # minority/majority ratio.
    ROS = RandomOverSampler(sampling_strategy=0.5)
    X_over, y_over= ROS.fit_resample(X_train, y_train)
  
# Wrap the resampled features in a DataFrame with the training columns.
# This statement sits outside the loop, so it only sees the values left
# over from the final fold.
X_over = pd.DataFrame(data=X_over,   columns=X_train.columns)

I am getting the below error.

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-90-372645e869d1> in <module>
      4 oversample = RandomOverSampler(sampling_strategy=1)
      5 # fit and apply the transform
----> 6 X_over, y_over = oversample.fit_resample(X_train, y_train)

~\anaconda3\lib\site-packages\imblearn\base.py in fit_resample(self, X, y)
     73             The corresponding label of `X_resampled`.
     74         """
---> 75         check_classification_targets(y)
     76         arrays_transformer = ArraysTransformer(X, y)
     77         X, y, binarize_y = self._check_X_y(X, y)

~\anaconda3\lib\site-packages\sklearn\utils\multiclass.py in check_classification_targets(y)
    178     y : array-like
    179     """
--> 180     y_type = type_of_target(y)
    181     if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
    182                       'multilabel-indicator', 'multilabel-sequences']:

~\anaconda3\lib\site-packages\sklearn\utils\multiclass.py in type_of_target(y)
    301     if y.dtype.kind == 'f' and np.any(y != y.astype(int)):
    302         # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
--> 303         _assert_all_finite(y)
    304         return 'continuous' + suffix
    305 

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in _assert_all_finite(X, allow_nan, msg_dtype)
    104                     msg_err.format
    105                     (type_err,
--> 106                      msg_dtype if msg_dtype is not None else X.dtype)
    107             )
    108     # for object dtype data, we only check for NaNs (GH-13254)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
2

There are 2 best solutions below

0
On

It would be easier to answer after seeing the data, but I would suggest oversampling before the cross-validation step. Please try it.

0
On

You can do it this way:


# Per-fold result containers (declared here; this short snippet does not
# fill them).
auc_scores = []
best_clf = []

# `cv` is the splitter, `clf` the classifier and `params` its search grid;
# all three must already be defined before this loop runs.
for i, (train_index, test_index) in enumerate(cv.split(X_train, y_train)):
    print("TRAIN:", train_index, "TEST:", test_index)

    # Select the rows of this fold by position.
    X_train_cv = X_train.iloc[train_index]
    X_test_cv = X_train.iloc[test_index]
    y_train_cv = y_train.iloc[train_index]
    y_test_cv = y_train.iloc[test_index]

    # Oversample the training part of the fold only; the held-out part
    # is left untouched.
    ros = RandomOverSampler(random_state=42)
    X_train_ros, y_train_ros = ros.fit_resample(X_train_cv, y_train_cv)
    print(X_train_ros.shape, y_train_ros.shape)

    # Grid-search the classifier on the resampled training data.
    grid_clf_acc = GridSearchCV(
        clf,
        param_grid=params,
        scoring='roc_auc',
        verbose=2,
    )
    grid_clf_acc.fit(X_train_ros, y_train_ros)

To train the model:



# Per-fold ROC AUC on the held-out part, and the estimator that produced it.
auc_scores = []
best_clf = []

# The estimator and its search grid do not depend on the fold, so they are
# built once. GridSearchCV clones `clf` for every fit, so sharing it across
# folds is safe.
# solver="saga" is used because the default "lbfgs" solver rejects
# penalty='l1', which would make a third of the grid fail to fit; "saga"
# accepts every penalty listed below. max_iter is raised because saga
# typically needs more iterations than the default to converge.
clf = LogisticRegression(solver="saga", max_iter=1000)

# NOTE(review): recent scikit-learn releases spell the "no penalty" option
# as None instead of the string 'none' -- adjust to the installed version.
params = {
    'penalty': ['l1', 'l2', 'none'],
    'C': [10, 1.0, 0.1, 0.01],
}

for i, (train_index, test_index) in enumerate(cv.split(X_train, y_train)):

    print("TRAIN:", train_index, "TEST:", test_index)

    # The splitter yields positional indices, hence .iloc rather than a
    # label-based lookup.
    X_train_cv, X_test_cv = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_cv, y_test_cv = y_train.iloc[train_index], y_train.iloc[test_index]

    # Oversample the training part of the fold only, so the held-out part
    # keeps its original class balance.
    ros = RandomOverSampler(random_state=42)
    X_train_ros, y_train_ros = ros.fit_resample(X_train_cv, y_train_cv)
    print(X_train_ros.shape, y_train_ros.shape)

    # Grid search for the best penalty / C on the resampled training data.
    grid_clf_acc = GridSearchCV(clf, param_grid=params, scoring='roc_auc', verbose=2)
    grid_clf_acc.fit(X_train_ros, y_train_ros)

    # Score the held-out part with the positive-class probability: ROC AUC
    # ranks samples by score, and hard 0/1 predictions collapse the curve
    # to a single operating point. Column 1 assumes a binary target whose
    # positive class is the second entry of classes_.
    y_pred_prob = grid_clf_acc.best_estimator_.predict_proba(X_test_cv)[:, 1]
    acc = roc_auc_score(y_test_cv, y_pred_prob)

    # Record this fold's score and the estimator that achieved it.
    auc_scores.append(acc)
    best_clf.append(grid_clf_acc.best_estimator_)

    print(auc_scores[i])
    print(best_clf[i])