LGBMClassifier roc_auc problem using gridSearchCV and early_stopping via BaseEstimator

381 Views Asked by At

i want to integrate an LGBMClassifier to existing code. the code call fit(X,y), while LGBMClassifier will need fit(X, y, eval_set, callbacks, eval_metric). i'm trying to encapsulate eval_set, callbacks, eval_metric in a BaseEstimator to expose a uniform api

without encapsulation, i got this working :

from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold, GridSearchCV

X, y = load_breast_cancer(return_X_y=True)
print(X.shape, y.shape)
# (442, 10) (442,)

pipe = Pipeline([
    ('pca', PCA(5)),
    ('lgbm', LGBMClassifier())
])

param_grid = {
    'pca__n_components': [3, 5],
    'lgbm__learning_rate': [0.05, 0.1]
}

X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.1)
lgbm__param_fit = {
    'lgbm__eval_set' : [(X_eval, y_eval)], 
    'lgbm__callbacks' : [early_stopping(50, first_metric_only=True)],
    'lgbm__eval_metric' : 'auc',
}

cv = StratifiedKFold(n_splits= 5, shuffle=True, random_state=0)
grid = GridSearchCV(pipe, param_grid, cv=cv, scoring='roc_auc', n_jobs=1, verbose=1, error_score='raise')
grid.fit(X_train, y_train, **lgbm__param_fit)
print(grid.best_params_)

i try to encapsulate this logic in this class :

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier, early_stopping

class EarlyStopEstimator(BaseEstimator, ClassifierMixin):
    def __init__(self, estimator=LGBMClassifier(), stopping_rounds=50, eval_frac=0.1, eval_metric='auc', **estimator_params):
        self.estimator = estimator
        self.set_params(**estimator_params)
        self.stopping_rounds = stopping_rounds
        self.eval_frac = eval_frac
        self.eval_metric=eval_metric

    def set_params(self, **params):
        self.estimator.set_params(**params)
        return self

    def get_params(self, **params):
        return self.estimator.get_params(**params)

    def fit(self, X, y):
        if self.eval_frac > 0:
            X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=self.eval_frac)
            param_eval =  {
                'eval_set' : [(X_eval, y_eval)], 
                'callbacks' : [early_stopping(self.stopping_rounds, first_metric_only=True)],
                'eval_metric' :self.eval_metric,
            }
        else:
            X_train, y_train = X, y
            param_eval = {}
        self.estimator.fit(X_train, y_train, **param_eval)
        return self

    def predict(self, X):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

that i try to run with :


from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold, GridSearchCV

x, y = load_breast_cancer(return_X_y=True)
print(x.shape, y.shape)
# (442, 10) (442,)

pipe = Pipeline([
    ('pca', PCA(5)),
    ('lgbm', EarlyStopEstimator())
])

param_grid = {
    'pca__n_components': [3, 5],
    'lgbm__learning_rate': [0.05, 0.01]
}

cv = StratifiedKFold(n_splits= 5, shuffle=True, random_state=0)
grid = GridSearchCV(pipe, param_grid, cv=cv, scoring='roc_auc', n_jobs=1, verbose=1, error_score='raise')
grid.fit(x, y) # <-- that's what i want to do
print(grid.best_params_)

this code work for the first fit, then fail at second one of 20 :

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[27]    valid_0's auc: 0.99619  valid_0's binary_logloss: 0.233344
Evaluated only: auc

---------------------------------------------------------------------------
 AttributeError
'EarlyStopEstimator' object has no attribute 'decision_function'
 [...]
 During handling of the above exception, another exception occurred:
 --> 716     return self.steps[-1][1].classes_
'EarlyStopEstimator' object has no attribute 'classes_'

this code work if i switch scoring='roc_auc' to scoring='neg_mean_absolute_error'. if i try to add this code to the class EarlyStopEstimator :

    def decision_function(self, X):
        return self.estimator.decision_function(X)

i get the error :

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[27]    valid_0's auc: 0.99619  valid_0's binary_logloss: 0.233344
Evaluated only: auc

---------------------------------------------------------------------------
 AttributeError
'LGBMClassifier' object has no attribute 'decision_function'
 [...]
 During handling of the above exception, another exception occurred:
 --> 716     return self.steps[-1][1].classes_
'EarlyStopEstimator' object has no attribute 'classes_'

What should i change in EarlyStopEstimator so i can use with grid.fit(x, y)

1

There are 1 best solutions below

0
On

You can just set an attribute classes_ for EarlyStopEstimator to delegate out to the underlying estimator. Either

    def fit(self, X, y):
        ...
        self.estimator.fit(X_train, y_train, **param_eval)
        self.classes_ = self.estimator.classes_
        return self

or add

    @property
    def classes_(self):
        return self.estimator.classes_