using SMOTE to balance the dataset with ktrain package

64 Views Asked by At

I have a classification project for this i am using ktrain & Bert with an imbalance dataset for this i used SMOTE after splitting the dataset into training and test the problem is that when i tried to oversample the dataset it crash and display the below error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-27-780eef8e5a1f> in <module>()
      7 
      8 oversample = SMOTE(random_state = 42)
----> 9 x_smote, y_smote = oversample.fit_sample(x_train, y_train)
     10 # x_smote,y_smote = oversample.fit_resample(vect_df, y_train["Sentiment"])
     11 print("shape x before SMOTE: {}".format(x_train.shape))

5 frames
/usr/local/lib/python3.7/dist-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
    639         if not allow_nd and array.ndim >= 3:
    640             raise ValueError("Found array with dim %d. %s expected <= 2."
--> 641                              % (array.ndim, estimator_name))
    642 
    643         if force_all_finite:

ValueError: Found array with dim 3. Estimator expected <= 2.

code:

import pandas as pd 
import numpy as np

import six
import sys
sys.modules['sklearn.externals.six'] = six
from imblearn.over_sampling import SMOTE# for inbalance dataset

import ktrain
from ktrain import text
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn import metrics

data_train = df.sample(frac=0.85, random_state=42)
data_test = df.drop(data_train.index)
len(data_train), len(data_test)


(x_train,y_train),(x_test,y_test),preprocess =text.texts_from_df(train_df=data_train,
                   text_column="Tweet",
                   label_columns = "Sentiment",
                   val_df = data_test,
                   maxlen = 400,
                   preprocess_mode = 'bert')

# Solving inbalanced dataset using SMOTE 
# Note: just oversampling the training data 

oversample = SMOTE(random_state = 42)
x_smote, y_smote = oversample.fit_sample(x_train, y_train)
print("shape x before SMOTE: {}".format(x_train.shape))
print("shape x after SMOTE: {}".format(x_smote.shape))
print("balance of targets feild %")
type(y_smote)# it seams that this is numpy array 
y_smote = pd.DataFrame(y_smote)# convert it to dataframe 
type(y_smote) # check the new type 
y_smote.value_counts(normalize = True)*100
0

There are 0 best solutions below