Using PyGAD for feature selection

Question

Using PyGAD for feature selection

368 Views Asked by SimplyhumanRight At 17 August 2025 at 22:59

I'm trying to create a Python script for feature selection using PyGAD.

My code is shown below. However, it reports that the best subset consists of all the features. How can I check whether this result is correct?

import pygad
import numpy

from sklearn.model_selection import train_test_split, cross_val_score
from src.learner_params import target_column, model_features

from sklearn.datasets import load_breast_cancer
from lightgbm import LGBMClassifier as lgbm
from sklearn.metrics import roc_auc_score


# The import section above binds NumPy only as ``numpy`` and never imports
# pandas, yet this script uses the ``np`` and ``pd`` aliases; bind them here
# so the listing runs as written.
import numpy as np
import pandas as pd

# Load the breast-cancer dataset and build the classifier that scores each
# candidate feature subset (n_estimators=1, max_depth=2).
bc = load_breast_cancer()
bst = lgbm(random_state=42, n_estimators=1, max_depth=2)

function_inputs = bc.feature_names

# Wrap the feature matrix in a DataFrame so columns can be selected by name.
X, y = bc.data, bc.target
X = pd.DataFrame(X, columns=bc.feature_names)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)



def fitness_func(ga_instance, solution, solution_idx):
    """Score a candidate feature subset by cross-validated ROC AUC.

    Args:
        ga_instance: GA instance supplied by the caller (unused here).
        solution: One chromosome; a gene equal to 1 selects the feature at
            the same position in ``bc.feature_names``.
        solution_idx: Index of the solution in the population (unused here).

    Returns:
        Mean ROC AUC of ``bst`` over 2-fold cross-validation on the selected
        columns of ``X_train``, or 0.0 when no feature is selected.
    """
    # ``asarray`` lets plain lists work too; the comparison already yields a
    # boolean mask, so no ``np.where`` is needed.
    mask = np.asarray(solution) == 1
    if not mask.any():
        # A model cannot be fitted on zero columns; give the empty subset the
        # lowest score instead of letting cross-validation fail.
        return 0.0
    selected_features = np.asarray(bc.feature_names)[mask]
    X_tmp = X_train.loc[:, selected_features]
    return cross_val_score(bst, X_tmp, y_train, scoring="roc_auc", cv=2).mean()



# Configure and run the genetic algorithm, then report the best chromosome.
# One gene per feature name.
m = len(bc.feature_names)
fitness_function = fitness_func
# An array of m ones.
# NOTE(review): the PyGAD docs describe a flat gene_space as the set of values
# every gene may take. With only the value 1 available, each gene is presumably
# pinned to 1, which would explain why every feature ends up selected - confirm
# against the PyGAD documentation.
gene_space = np.full(m,1)

num_generations = 100
num_parents_mating = 4

sol_per_pop = 8
num_genes = m

# Assigned here but not passed to pygad.GA below.
init_range_low = -2
init_range_high = 5

# parent_selection_type is assigned here but not passed to pygad.GA below.
parent_selection_type = "sss"
keep_parents = 2

crossover_type = "single_point"

mutation_type = "random"
mutation_percent_genes = 100

ga_instance = pygad.GA(gene_space=gene_space,
                       num_generations=num_generations,
                       num_parents_mating=num_parents_mating,
                       fitness_func=fitness_function,
                       sol_per_pop=sol_per_pop,
                       num_genes=num_genes,
                       keep_parents=keep_parents,
                       crossover_type=crossover_type,
                       mutation_type=mutation_type,
                       mutation_percent_genes=mutation_percent_genes)

ga_instance.run()


# Retrieve the best chromosome and its fitness, then print both.
solution, solution_fitness, solution_idx = ga_instance.best_solution()
print("Parameters of the best solution : {solution}".format(solution=solution))
print("Fitness value of the best solution = {solution_fitness}".format(solution_fitness=solution_fitness))

Original Q&A

There is 1 best solution below

**SimplyhumanRight** · Accepted Answer

I have found the solution.

The thing I added was a random initialisation of the subset of features. After evaluating the model with the subset and comparing it with the model trained on the complete set of features, we even observed an increase in performance.

# Performance with subset of features:
# 0.9440559440559441

# Performance with all the features:
# 0.9370629370629371

import pygad
import numpy

from sklearn.model_selection import train_test_split, cross_val_score
from src.learner_params import target_column, model_features

from sklearn.datasets import load_breast_cancer
from lightgbm import LGBMClassifier as lgbm
from sklearn.metrics import roc_auc_score

from numpy.random import RandomState
# The import section above binds NumPy only as ``numpy`` and never imports
# pandas, yet this script uses the ``np`` and ``pd`` aliases; bind them here
# so the listing runs as written.
import numpy as np
import pandas as pd

# One seed drives the NumPy generator, the classifier and the data split.
seed = 1234
state = RandomState(seed)

# Load the breast-cancer dataset and build the classifier that scores each
# candidate feature subset.
bc = load_breast_cancer()
bst = lgbm(random_state=seed)

function_inputs = bc.feature_names

# Wrap the feature matrix in a DataFrame so columns can be selected by name.
X, y = bc.data, bc.target
X = pd.DataFrame(X, columns=bc.feature_names)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)



def fitness_func(ga_instance, solution, solution_idx):
    """Score a candidate feature subset by cross-validated ROC AUC.

    Args:
        ga_instance: GA instance supplied by the caller (unused here).
        solution: One chromosome; every non-zero gene selects the feature at
            the same position in ``bc.feature_names``.
        solution_idx: Index of the solution in the population (unused here).

    Returns:
        Mean ROC AUC of ``bst`` over 5-fold cross-validation on the selected
        columns of ``X_train``, or 0.0 when no feature is selected.
    """
    mask = np.array(solution, dtype=bool)
    if not mask.any():
        # The gene pool contains 0, so an all-zero chromosome is possible.
        # A model cannot be fitted on zero columns; give the empty subset the
        # lowest score instead of letting cross-validation fail.
        return 0.0
    selected_features = np.array(bc.feature_names)[mask]
    X_tmp = X_train.loc[:, selected_features]
    return cross_val_score(bst, X_tmp, y_train, scoring="roc_auc", cv=5).mean()



# Configure and run the genetic algorithm, then compare a model trained on
# the selected features with one trained on all features.
m = len(bc.feature_names)
fitness_function = fitness_func

# Random vector of 0s and 1s used as the gene space.
# NOTE(review): per the PyGAD docs a flat gene_space is the pool of values
# every gene may take, not an initial chromosome, so this presumably behaves
# like the binary space ``[0, 1]`` - confirm against the PyGAD documentation.
# ``randint(0, 2, m)`` draws from the same inclusive {0, 1} range as the
# deprecated ``random_integers(0, 1, m)``.
gene_space = state.randint(0, 2, m)

num_generations = 30
num_parents_mating = 2

# NOTE(review): a population of two leaves very little room for search;
# consider a larger ``sol_per_pop``.
sol_per_pop = 2
num_genes = m

parent_selection_type = "sss"
keep_parents = 2
crossover_type = "single_point"
mutation_type = "random"
mutation_percent_genes = 15

ga_instance = pygad.GA(
    gene_space=gene_space,
    num_generations=num_generations,
    num_parents_mating=num_parents_mating,
    fitness_func=fitness_function,
    sol_per_pop=sol_per_pop,
    num_genes=num_genes,
    # Previously assigned but never passed on; now passed explicitly.
    parent_selection_type=parent_selection_type,
    keep_parents=keep_parents,
    crossover_type=crossover_type,
    mutation_type=mutation_type,
    mutation_percent_genes=mutation_percent_genes,
    random_seed=seed,
)

ga_instance.run()

solution, solution_fitness, solution_idx = ga_instance.best_solution()

# Decode the best chromosome the same way fitness_func does: every non-zero
# gene selects the feature at that position.
selected_mask = np.array(solution, dtype=bool)
selected_ = np.array(bc.feature_names)[selected_mask]

print("Parameters of the best solution : {solution}".format(solution=solution))
print("Fitness value of the best solution = {solution_fitness}".format(solution_fitness=solution_fitness))
# Report the count as an integer rather than a float sum of genes.
print(f"Number of features selected = {int(selected_mask.sum())}")

# Baseline: accuracy on the held-out split with every feature. The score is
# printed explicitly; a bare expression prints nothing when run as a script.
model = bst.fit(X_train, y_train)
print("Performance with all the features:")
print(model.score(X_test, y_test))

# Same estimator refitted on the selected columns only.
model = bst.fit(X_train.loc[:, selected_], y_train)
print("Performance with subset of features:")
print(model.score(X_test.loc[:, selected_], y_test))

Using PyGAD for feature selection

There is 1 best solution below

Related Questions in PYTHON

Related Questions in MACHINE-LEARNING

Related Questions in GENETIC-ALGORITHM

Related Questions in FEATURE-SELECTION

Related Questions in PYGAD

Trending Questions

Popular # Hashtags

Popular Questions