Error when using a custom probability metric in tidymodels


I want to tune a model using a custom class probability metric "pg", which stands for partial Gini coefficient. I use it on data that consists of numeric predictors and a binary factor as the class label (after preprocessing with the recipe). This is the tuning code:

xgb_folds <- train %>% vfold_cv(v = 5)

xgb_model <- parsnip::boost_tree(
  mode = "classification",
  trees = tune(),
  tree_depth = tune(),
  learn_rate = tune(),
  loss_reduction = tune()
) %>%
  set_engine("xgboost")

xgb_wf <- workflow() %>%
  add_recipe(TREE_recipe) %>%
  add_model(xgb_model)

xgboost_tuned <- tune::tune_grid(
  object = xgb_wf,
  resamples = xgb_folds,
  grid = hyperparameters_XGB_tidy,
  metrics = metric_set(pg),
  control = tune::control_grid(verbose = TRUE)
)

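For context, hyperparameters_XGB_tidy is a pre-built tibble of hyperparameter combinations. A sketch of how such a grid could be constructed (my actual values differ):

# Sketch of the tuning grid; the real values differ
hyperparameters_XGB_tidy <- dials::grid_regular(
  dials::trees(),
  dials::tree_depth(),
  dials::learn_rate(),
  dials::loss_reduction(),
  levels = 3
)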
The code above worked when I set the metric in tune_grid to roc_auc. When using pg, however, I get this warning:

Warning message: All models failed. Run 'show_notes(.Last.tune.result)' for more information.

The .Last.tune.result contains this error:

unique notes:
────────────────────────────────────────────────────────────────────────────────────────────────────
Error in `metric_set()`:
! Failed to compute `pg()`.
Caused by error in `UseMethod()`:
! no applicable method for 'pg' applied to an object of class "c('grouped_df', 'tbl_df', 'tbl', 'data.frame')"

This is the yardstick implementation of pg that I tried (running pg_vec directly on class-probability and label vectors worked as expected; see the example after the code):

# partial Gini metric for tidymodels
library(tidymodels)
library(rlang)

pg_impl <- function(truth, estimate, case_weights = NULL) {

  sorted_indices <- order(estimate, decreasing = TRUE)
  sorted_probs <- estimate[sorted_indices]
  sorted_actuals <- truth[sorted_indices]

  # Select subset with PD < 0.4
  subset_indices <- which(sorted_probs < 0.4)
  subset_probs <- sorted_probs[subset_indices]
  subset_actuals <- sorted_actuals[subset_indices]

  # Check if there are both positive and negative cases in the subset
  if (length(unique(subset_actuals)) > 1) {
    # Calculate ROC curve for the subset
    roc_subset <- pROC::roc(subset_actuals, subset_probs,
                            direction = "<", quiet = TRUE)
    # Calculate AUC for the subset
    partial_auc <- pROC::auc(roc_subset)
    # Partial Gini coefficient; as.numeric() drops the pROC "auc" class
    as.numeric(2 * partial_auc - 1)
  } else {
    # Undefined when the subset contains only one class
    NA_real_
  }
}
    

pg_vec <- function(truth, estimate, estimator = NULL, na_rm = TRUE, case_weights = NULL, ...) {
  abort_if_class_pred(truth)
  
  estimator <- finalize_estimator(truth, estimator)
  check_prob_metric(truth, estimate, case_weights, estimator)
  
  if (na_rm) {
    result <- yardstick_remove_missing(truth, estimate, case_weights)
    
    truth <- result$truth
    estimate <- result$estimate
    case_weights <- result$case_weights
  } else if (yardstick_any_missing(truth, estimate, case_weights)) {
    return(NA_real_)
  }
  pg_impl(truth, estimate, case_weights = case_weights)
}

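# S3 generic for the metric; new_prob_metric() below registers it as a
# class probability metric, following the yardstick custom-metric pattern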
pg <- function(data, ...) { 
  UseMethod("pg")
}

pg <- new_prob_metric(pg, direction = "maximize")

pg.data.frame <- function(data, truth, ..., na_rm = TRUE) {
  prob_metric_summarizer(
    name = "pg",
    fn = pg_vec,
    data = data,
    truth = !!enquo(truth),
    ...,
    na_rm = na_rm
  )
}
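
For reference, this is the kind of direct call that worked (made-up example data):

# Calling pg_vec directly on probability and label vectors (made-up data)
set.seed(42)
truth <- factor(sample(c("bad", "good"), 200, replace = TRUE))
estimate <- runif(200)
pg_vec(truth, estimate)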
