Tuning hyperparameters after multiple runs


I wrote a classifier that uses LGBMClassifier. This is the code:

from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.model_selection import GroupShuffleSplit
from skopt import BayesSearchCV

# Search space handed to BayesSearchCV in tune_hyper_parameters.
DEFAULT_PARAMS = {
    'class_weight': [None, 'balanced'],
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'num_leaves': list(range(30, 150)),
    'learning_rate': [0.01, 0.05, 0.1, 0.5],
    'subsample_for_bin': [20000, 50000, 100000, 120000, 150000],
    'min_child_samples': [20, 50, 100, 200, 500],
    'colsample_bytree': [0.6, 0.8, 1],
    'max_depth': (5, 100, 'uniform'),
    'lambda_l1': (0.7, 1),
    'lambda_l2': (0.7, 1),
}

class LightgbmClassifier:
    def __init__(self, train_df, prediction_column, test_size=0.3):
        self.search = None
        # Hold out a test split; the rest is used for the hyperparameter search.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            train_df, train_df[prediction_column], test_size=test_size, random_state=7)

        self.X_train = self.X_train.drop([prediction_column], axis=1)
        self.X_test = self.X_test.drop([prediction_column], axis=1)

    def tune_hyper_parameters(self, params=DEFAULT_PARAMS, scoring='neg_mean_squared_error', kfold=10, n_iter=25):
        kf = KFold(n_splits=kfold)
        lgbm = LGBMClassifier()
        self.search = BayesSearchCV(estimator=lgbm,
                                    search_spaces=params,
                                    scoring=scoring,
                                    n_iter=n_iter,
                                    cv=kf,
                                    verbose=0)

    def train(self):
        result = self.search.fit(self.X_train, self.y_train)
        print("Best parameters:", self.search.best_params_)
        print("Best CV score:", self.search.best_score_)

        return result
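
According to the skopt docs, search-space entries can also be given as explicit skopt.space dimension objects instead of plain lists and tuples, which makes it unambiguous which parameters are treated as discrete choices and which as sampled ranges. A rough sketch for a few of the entries in DEFAULT_PARAMS above (EXPLICIT_PARAMS is just an illustrative name, not something I use):

from skopt.space import Categorical, Integer, Real

# Illustrative only: the same idea as DEFAULT_PARAMS, with explicit dimension types.
EXPLICIT_PARAMS = {
    'boosting_type': Categorical(['gbdt', 'goss', 'dart']),  # discrete choices
    'colsample_bytree': Categorical([0.6, 0.8, 1.0]),        # discrete choices
    'max_depth': Integer(5, 100, prior='uniform'),           # integer sampled from a range
    'lambda_l1': Real(0.7, 1.0),                             # real sampled from a range
    'lambda_l2': Real(0.7, 1.0),
}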
    

I call it with:

lgbm_classifier = LightgbmClassifier(train_df=df.copy(), prediction_column='type')
lgbm_classifier.tune_hyper_parameters(scoring='accuracy')
lgbm_model = lgbm_classifier.train()
# Evaluate test score
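
For reference, the evaluation behind that last comment is roughly this (a sketch that scores BayesSearchCV's refit best_estimator_ on the held-out split, reusing the accuracy_score import from above):

best_model = lgbm_classifier.search.best_estimator_       # refit on the full training split
test_preds = best_model.predict(lgbm_classifier.X_test)
test_accuracy = accuracy_score(lgbm_classifier.y_test, test_preds)
print("Test accuracy:", test_accuracy)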

When I run it a few times, BayesSearchCV selects different hyperparameters and reports different best scores on each run.

Also, changing the values in the DEFAULT_PARAMS dict (for example, replacing 'subsample_for_bin': [20000, 50000, 100000, 120000, 150000] with 'subsample_for_bin': [15000, 50000, 100000, 120000, 140000]) and running it again gives different scores. Changing other settings, such as test_size, n_iter, or the number of CV folds, has the same effect.
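
For what it's worth, I assume at least part of the run-to-run variation comes from the components I never seed. A sketch of what a fully seeded search could look like (same class and search space, seeds picked arbitrarily):

# Sketch only: seed every random component (the train/test split already uses random_state=7).
kf = KFold(n_splits=10, shuffle=True, random_state=7)
seeded_search = BayesSearchCV(estimator=LGBMClassifier(random_state=7),
                              search_spaces=DEFAULT_PARAMS,
                              scoring='accuracy',
                              n_iter=25,
                              cv=kf,
                              random_state=7,   # seeds the Bayesian optimizer's sampling
                              verbose=0)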

My questions are:

  1. Is it a good strategy to run the search multiple times, pick the hyperparameters that achieved the best scores, and then train with them? For example, after several runs I got the best score with test_size=0.2, subsample_for_bin=50000, min_child_samples=20, etc. Would it be reasonable to stop using BayesSearchCV at that point and simply hard-code those values?
  2. What should I do if there is an improvement only in the test score, or only in the cross-validation score?