I am using GridSearchCV to find the best parameters (number of components) of a PLS-DA model (partial least squares discriminant analysis).
y_train
is a np array that looks like [1111....0000], so there are two classes (0 and 1). The classes are balanced.
However, I have this error (at line: gd_sr.fit(X_train, y_train)
) when I run this code: "Only one class present in y_true. ROC AUC score is not defined in that case." It looks like python is not seeing the two classes in y_train
.
Can anyone help me with this?
# Selecting data (training set)
X_train=data.loc[:, 'v1':'v546'].values
y_train=data['pregnancy']
y_train=np.where(y_train == 'Open',0,1)
# Parameter tuning using 10-fold cross-validation
grid_param={'n_components':[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]}
gd_sr = GridSearchCV(estimator=PLSRegression(),
param_grid=grid_param,
scoring='roc_auc',
cv=10,
n_jobs=-1)
gd_sr.fit(X_train, y_train)
best_parameters = gd_sr.best_params_ # find the best parameter (number of components)
print(best_parameters)
best_result = gd_sr.best_score_ # accuracy with the best parameter
print(best_result)
Here is the full error:
alueError Traceback (most recent call last)
<ipython-input-131-cd811da4b437> in <module>
20 n_jobs=-1) # -1 means that we use all available computing power (CPU)
21
---> 22 gd_sr.fit(X_train, y_train)
23
24 best_parameters = gd_sr.best_params_ # find the best parameter (number of components)
~/.local/lib/python3.7/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~/.local/lib/python3.7/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
734 return results
735
--> 736 self._run_search(evaluate_candidates)
737
738 # For multi-metric evaluation, store the best_index_, best_params_ and
~/.local/lib/python3.7/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1186 def _run_search(self, evaluate_candidates):
1187 """Search all candidates in param_grid"""
-> 1188 evaluate_candidates(ParameterGrid(self.param_grid))
1189
1190
~/.local/lib/python3.7/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
713 for parameters, (train, test)
714 in product(candidate_params,
--> 715 cv.split(X, y, groups)))
716
717 if len(out) < 1:
~/.local/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
1040
1041 with self._backend.retrieval_context():
-> 1042 self.retrieve()
1043 # Make sure that we get a last message telling us we are done
1044 elapsed_time = time.time() - self._start_time
~/.local/lib/python3.7/site-packages/joblib/parallel.py in retrieve(self)
919 try:
920 if getattr(self._backend, 'supports_timeout', False):
--> 921 self._output.extend(job.get(timeout=self.timeout))
922 else:
923 self._output.extend(job.get())
~/.local/lib/python3.7/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
/usr/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
430 raise CancelledError()
431 elif self._state == FINISHED:
--> 432 return self.__get_result()
433 else:
434 raise TimeoutError()
/usr/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.
EDIT: I tried GridSearchCV with Logistic Regression and it works (without changing anything except the parameters), so the problems probably comes from the PLSRegression() algorithm.
Try using a different error metric. For example, you can use rmse, mse, r2. ROC AUC metrics is not define in your code.