i want to integrate an LGBMClassifier to existing code. the code call fit(X,y), while LGBMClassifier will need fit(X, y, eval_set, callbacks, eval_metric). i'm trying to encapsulate eval_set, callbacks, eval_metric in a BaseEstimator to expose a uniform api
without encapsulation, i got this working :
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold, GridSearchCV
X, y = load_breast_cancer(return_X_y=True)
print(X.shape, y.shape)
# (442, 10) (442,)
pipe = Pipeline([
    ('pca', PCA(5)),
    ('lgbm', LGBMClassifier())
])
param_grid = {
    'pca__n_components': [3, 5],
    'lgbm__learning_rate': [0.05, 0.1]
}
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.1)
lgbm__param_fit = {
    'lgbm__eval_set' : [(X_eval, y_eval)], 
    'lgbm__callbacks' : [early_stopping(50, first_metric_only=True)],
    'lgbm__eval_metric' : 'auc',
}
cv = StratifiedKFold(n_splits= 5, shuffle=True, random_state=0)
grid = GridSearchCV(pipe, param_grid, cv=cv, scoring='roc_auc', n_jobs=1, verbose=1, error_score='raise')
grid.fit(X_train, y_train, **lgbm__param_fit)
print(grid.best_params_)
i try to encapsulate this logic in this class :
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier, early_stopping
class EarlyStopEstimator(BaseEstimator, ClassifierMixin):
    def __init__(self, estimator=LGBMClassifier(), stopping_rounds=50, eval_frac=0.1, eval_metric='auc', **estimator_params):
        self.estimator = estimator
        self.set_params(**estimator_params)
        self.stopping_rounds = stopping_rounds
        self.eval_frac = eval_frac
        self.eval_metric=eval_metric
    def set_params(self, **params):
        self.estimator.set_params(**params)
        return self
    def get_params(self, **params):
        return self.estimator.get_params(**params)
    def fit(self, X, y):
        if self.eval_frac > 0:
            X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=self.eval_frac)
            param_eval =  {
                'eval_set' : [(X_eval, y_eval)], 
                'callbacks' : [early_stopping(self.stopping_rounds, first_metric_only=True)],
                'eval_metric' :self.eval_metric,
            }
        else:
            X_train, y_train = X, y
            param_eval = {}
        self.estimator.fit(X_train, y_train, **param_eval)
        return self
    def predict(self, X):
        return self.estimator.predict(X)
    def predict_proba(self, X):
        return self.estimator.predict_proba(X)
that i try to run with :
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold, GridSearchCV
x, y = load_breast_cancer(return_X_y=True)
print(x.shape, y.shape)
# (442, 10) (442,)
pipe = Pipeline([
    ('pca', PCA(5)),
    ('lgbm', EarlyStopEstimator())
])
param_grid = {
    'pca__n_components': [3, 5],
    'lgbm__learning_rate': [0.05, 0.01]
}
cv = StratifiedKFold(n_splits= 5, shuffle=True, random_state=0)
grid = GridSearchCV(pipe, param_grid, cv=cv, scoring='roc_auc', n_jobs=1, verbose=1, error_score='raise')
grid.fit(x, y) # <-- that's what i want to do
print(grid.best_params_)
this code work for the first fit, then fail at second one of 20 :
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[27]    valid_0's auc: 0.99619  valid_0's binary_logloss: 0.233344
Evaluated only: auc
---------------------------------------------------------------------------
 AttributeError
'EarlyStopEstimator' object has no attribute 'decision_function'
 [...]
 During handling of the above exception, another exception occurred:
 --> 716     return self.steps[-1][1].classes_
'EarlyStopEstimator' object has no attribute 'classes_'
this code work if i switch scoring='roc_auc' to scoring='neg_mean_absolute_error'. if i try to add this code to the class EarlyStopEstimator :
    def decision_function(self, X):
        return self.estimator.decision_function(X)
i get the error :
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[27]    valid_0's auc: 0.99619  valid_0's binary_logloss: 0.233344
Evaluated only: auc
---------------------------------------------------------------------------
 AttributeError
'LGBMClassifier' object has no attribute 'decision_function'
 [...]
 During handling of the above exception, another exception occurred:
 --> 716     return self.steps[-1][1].classes_
'EarlyStopEstimator' object has no attribute 'classes_'
What should i change in EarlyStopEstimator so i can use with grid.fit(x, y)
 
                        
You can just set an attribute
classes_forEarlyStopEstimatorto delegate out to the underlying estimator. Eitheror add