TypeError: Singleton array in nested stratified cross-validation


I am running a nested CV with stratification at both the inner and outer loops on a binary classification task with a small sample. These models run fine without stratification on a version of the dataset with a balanced class distribution (Task A), but on the version with an unbalanced class distribution (Task B), where I introduced stratification, it seems that an empty 'y' is being passed to split.

Here is the error:

File "/xxxxxxxx/main.py", line 68, in <module>
all models = run model selection (model name, X scaled df, y encoded,
feature_sets, model_config, training_strategy, scoring)
xxxxxxxx/model_selection.py" , line 40, in run model selection for train_index, test_index in outer_cv.split(X selected)
File "/share/pkg.&/python3/3.10.12/install/lib/python3.10/site-packages/sklearn/model_selection/_split.py", line 1507, in split for train_index, test_index in cv.split(X, y, groups) :
File
"share/pko.8/python3/3.10.12/install/lib/python3.io/site-packages/sklearn/model_selection/_split.py",line 796, in split
y = check_array (y, input_name="y"
, ensure_2d=False, dtype=None)
File "/share/pkg.8/python373.10.12/install/Tib/python3.10/site-packages/sklearn/utils/validation.py",line 967, in check_array
n samples = num samples (array)
File "/share/pkg .87python3/3.10.12/install/lib/python3.10/site-packages/sklearn/utils/validation.py",line347,in_num_samples raise TypeError (
TypeError: Singleton array array (None, type=object) cannot be considered a valid collection.
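
For what it's worth, this toy snippet (made-up X and y, not my real data) reproduces the same error for me as soon as split is called without labels:

import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold

# Toy imbalanced binary problem (placeholder data, not my dataset)
X_toy = np.random.rand(20, 3)
y_toy = np.array([0] * 15 + [1] * 5)

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)

# Works: the stratified splitter receives the labels it needs
for train_idx, test_idx in cv.split(X_toy, y_toy):
    pass

# Raises the same TypeError: y defaults to None, so check_array(None) fails
for train_idx, test_idx in cv.split(X_toy):
    pass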

Relevant code snippets:

from itertools import chain, combinations
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, matthews_corrcoef
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from config import specificity_scorer


def all_combinations(feature_sets):
    # Every non-empty combination of the feature-set groups (their power set)
    return chain(*map(lambda x: combinations(feature_sets.values(), x), range(1, len(feature_sets) + 1)))

def run_model_selection(model_name, X, y_encoded, feature_sets, config, training_strategy, scoring):
    top_models = []
    all_models = {}
    scaler = MinMaxScaler()

    outer_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=42)

    for feature_set_combination in all_combinations(feature_sets):
        selected_features = [item for sublist in feature_set_combination for item in sublist]
        if not set(selected_features).issubset(X.columns):
            continue

        X_selected = X[selected_features]
        if not isinstance(X_selected, pd.DataFrame):
            X_selected = pd.DataFrame(X_selected, columns=selected_features)

        metrics_summary = {
            'accuracy': [],
            'f1': [],
            'precision': [],
            'recall': [],
            'roc_auc': [],
            'mcc': [],
            'specificity': []
        }

        for train_index, test_index in outer_cv.split(X_selected):
            X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
            y_train, y_test = y_encoded[train_index], y_encoded[test_index]

            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            grid_search = GridSearchCV(
                estimator=config['model'],
                param_grid=config['param_grid'],
                scoring=scoring,
                cv=training_strategy,
                n_jobs=-1,
                refit='F1',
                return_train_score=True
            )
            grid_search.fit(X_train_scaled, y_train)
            best_model = grid_search.best_estimator_
            predictions = best_model.predict(X_test_scaled)
            if 'roc_auc' in scoring and hasattr(best_model, "predict_proba"):
                probas = best_model.predict_proba(X_test_scaled)
                roc_auc_val = roc_auc_score(y_test, probas[:, 1])
                metrics_summary['roc_auc'].append(roc_auc_val)

            metrics_summary['accuracy'].append(accuracy_score(y_test, predictions))
            metrics_summary['f1'].append(f1_score(y_test, predictions, average='macro'))
            metrics_summary['precision'].append(precision_score(y_test, predictions, average='macro'))
            metrics_summary['recall'].append(recall_score(y_test, predictions, average='macro'))
            metrics_summary['mcc'].append(matthews_corrcoef(y_test, predictions))
            metrics_summary['specificity'].append(specificity_scorer(best_model, X_test_scaled, y_test))

        # After completing the outer fold loop and having the final metrics summary
        model_key = f"{model_name}_{','.join(selected_features)}"
        all_models[model_key] = {
            'model_params': grid_search.best_estimator_.get_params(),  # hyperparameters of the best model (last outer fold)
            'selected_features': ', '.join(selected_features),
            'Metrics': {  # per-metric mean/std across the outer folds
                metric: {
                    'Mean': np.mean(values),
                    'Std': np.std(values)
                } for metric, values in metrics_summary.items()
            }
        }

    return all_models

My config.py is using StratifiedKFold(n_splits=4, shuffle=True) for the inner loop.
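
For completeness, a simplified sketch of the relevant parts of config.py (the estimator, param grid, and scoring dict below are placeholders, and specificity_scorer is a paraphrase rather than my exact implementation):

# config.py (simplified sketch, not the exact file)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

def specificity_scorer(estimator, X, y):
    # Specificity = TN / (TN + FP) for the binary case
    y_pred = estimator.predict(X)
    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
    return tn / (tn + fp)

# Inner CV passed to GridSearchCV as cv=training_strategy
training_strategy = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# Placeholder estimator and grid; the real config uses my actual model
model_config = {
    'model': LogisticRegression(max_iter=1000),
    'param_grid': {'C': [0.01, 0.1, 1, 10]},
}

# Approximate scoring dict; 'F1' matches refit='F1' in GridSearchCV
scoring = {
    'F1': 'f1_macro',
    'roc_auc': 'roc_auc',
    'specificity': specificity_scorer,
}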

What might be going wrong?
