i want to integrate an LGBMClassifier to existing code. the code call fit(X,y), while LGBMClassifier will need fit(X, y, eval_set, callbacks, eval_metric). i'm trying to encapsulate eval_set, callbacks, eval_metric in a BaseEstimator to expose a uniform api
without encapsulation, i got this working :
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold, GridSearchCV
X, y = load_breast_cancer(return_X_y=True)
print(X.shape, y.shape)
# (442, 10) (442,)
pipe = Pipeline([
('pca', PCA(5)),
('lgbm', LGBMClassifier())
])
param_grid = {
'pca__n_components': [3, 5],
'lgbm__learning_rate': [0.05, 0.1]
}
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.1)
lgbm__param_fit = {
'lgbm__eval_set' : [(X_eval, y_eval)],
'lgbm__callbacks' : [early_stopping(50, first_metric_only=True)],
'lgbm__eval_metric' : 'auc',
}
cv = StratifiedKFold(n_splits= 5, shuffle=True, random_state=0)
grid = GridSearchCV(pipe, param_grid, cv=cv, scoring='roc_auc', n_jobs=1, verbose=1, error_score='raise')
grid.fit(X_train, y_train, **lgbm__param_fit)
print(grid.best_params_)
i try to encapsulate this logic in this class :
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier, early_stopping
class EarlyStopEstimator(BaseEstimator, ClassifierMixin):
def __init__(self, estimator=LGBMClassifier(), stopping_rounds=50, eval_frac=0.1, eval_metric='auc', **estimator_params):
self.estimator = estimator
self.set_params(**estimator_params)
self.stopping_rounds = stopping_rounds
self.eval_frac = eval_frac
self.eval_metric=eval_metric
def set_params(self, **params):
self.estimator.set_params(**params)
return self
def get_params(self, **params):
return self.estimator.get_params(**params)
def fit(self, X, y):
if self.eval_frac > 0:
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=self.eval_frac)
param_eval = {
'eval_set' : [(X_eval, y_eval)],
'callbacks' : [early_stopping(self.stopping_rounds, first_metric_only=True)],
'eval_metric' :self.eval_metric,
}
else:
X_train, y_train = X, y
param_eval = {}
self.estimator.fit(X_train, y_train, **param_eval)
return self
def predict(self, X):
return self.estimator.predict(X)
def predict_proba(self, X):
return self.estimator.predict_proba(X)
that i try to run with :
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold, GridSearchCV
x, y = load_breast_cancer(return_X_y=True)
print(x.shape, y.shape)
# (442, 10) (442,)
pipe = Pipeline([
('pca', PCA(5)),
('lgbm', EarlyStopEstimator())
])
param_grid = {
'pca__n_components': [3, 5],
'lgbm__learning_rate': [0.05, 0.01]
}
cv = StratifiedKFold(n_splits= 5, shuffle=True, random_state=0)
grid = GridSearchCV(pipe, param_grid, cv=cv, scoring='roc_auc', n_jobs=1, verbose=1, error_score='raise')
grid.fit(x, y) # <-- that's what i want to do
print(grid.best_params_)
this code work for the first fit, then fail at second one of 20 :
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[27] valid_0's auc: 0.99619 valid_0's binary_logloss: 0.233344
Evaluated only: auc
---------------------------------------------------------------------------
AttributeError
'EarlyStopEstimator' object has no attribute 'decision_function'
[...]
During handling of the above exception, another exception occurred:
--> 716 return self.steps[-1][1].classes_
'EarlyStopEstimator' object has no attribute 'classes_'
this code work if i switch scoring='roc_auc'
to scoring='neg_mean_absolute_error'
. if i try to add this code to the class EarlyStopEstimator :
def decision_function(self, X):
return self.estimator.decision_function(X)
i get the error :
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[27] valid_0's auc: 0.99619 valid_0's binary_logloss: 0.233344
Evaluated only: auc
---------------------------------------------------------------------------
AttributeError
'LGBMClassifier' object has no attribute 'decision_function'
[...]
During handling of the above exception, another exception occurred:
--> 716 return self.steps[-1][1].classes_
'EarlyStopEstimator' object has no attribute 'classes_'
What should i change in EarlyStopEstimator
so i can use with grid.fit(x, y)
You can just set an attribute
classes_
forEarlyStopEstimator
to delegate out to the underlying estimator. Eitheror add