I'm trying to create a Python script for feature selection using PyGAD
My code is shown below, nonetheless, it is returning that all the features are the best subset. How can I be sure it is correct?
import pygad
import numpy
from sklearn.model_selection import train_test_split, cross_val_score
from src.learner_params import target_column, model_features
from sklearn.datasets import load_breast_cancer
from lightgbm import LGBMClassifier as lgbm
from sklearn.metrics import roc_auc_score
bc = load_breast_cancer()
bst = lgbm(random_state = 42,n_estimators=1, max_depth=2)
function_inputs = bc.feature_names
X, y = bc.data,bc.target
X = pd.DataFrame(X, columns=bc.feature_names)
X_train, X_test, y_train, y_test = train_test_split(X,
y,
random_state=42)
def fitness_func(ga_instance, solution, solution_idx):
mask = np.where(solution ==1, True, False)
selected_features = np.array(bc.feature_names)[mask]
X_tmp = X_train.loc[:,selected_features]
score = cross_val_score(bst, X_tmp, y_train, scoring="roc_auc", cv = 2).mean()
fitness = score
return fitness
m = len(bc.feature_names)
fitness_function = fitness_func
gene_space = np.full(m,1)
num_generations = 100
num_parents_mating = 4
sol_per_pop = 8
num_genes = m
init_range_low = -2
init_range_high = 5
parent_selection_type = "sss"
keep_parents = 2
crossover_type = "single_point"
mutation_type = "random"
mutation_percent_genes = 100
ga_instance = pygad.GA(gene_space=gene_space,
num_generations=num_generations,
num_parents_mating=num_parents_mating,
fitness_func=fitness_function,
sol_per_pop=sol_per_pop,
num_genes=num_genes,
keep_parents=keep_parents,
crossover_type=crossover_type,
mutation_type=mutation_type,
mutation_percent_genes=mutation_percent_genes)
ga_instance.run()
solution, solution_fitness, solution_idx = ga_instance.best_solution()
print("Parameters of the best solution : {solution}".format(solution=solution))
print("Fitness value of the best solution = {solution_fitness}".format(solution_fitness=solution_fitness))
I have found the solution.
The thing I added was a random initialisation of the subset of features. After evaluating the model with the subset and comparing it with the model trained on the complete set of features, we even observed an increase in performance.