Problem with auto tuning and custom resampling in mlr3


I have some problems with my data. Here is my raw data again:

structure(list(PatientID = c("P1", "P1", "P1", "P1", "P1", "P1", "P2", "P2", "P3", "P4", "P5", "P5", "P5", "P5", "P5", "P6", "P6", "P6"), 
    LesionResponse = structure(c(2L, 1L, 2L, 2L, 1L, 2L, 2L, 
    2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L),.Label = c("0", 
    "1"), class = "factor"), F1 = c(1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 
    0.625, 0.625, 0.625, 0.625, 0.625, 0.625, 1.25, 0.625, 0.625, 
    1.25, 1.25, 1.25), F2 = c(1, 5, 3, 2, 1, 1, 6, 9, 0, 5, 0, 4, 4, 4, 5, 2, 1, 1), F3 = c(0, 4, 3, 1, 1, 0, 3, 8, 4, 5, 0, 4, 4, 3, 5, 2, 0, 0), F4 = c(0, 9, 0, 7, 4, 0, 3, 8, 4, 5, 9, 1, 1, 3, 5, 3, 9, 0)), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L), class = "data.frame")

I'm new to the mlr3verse and I'm trying to learn and apply the methods described in the mlr3 book and whatever I find on GitHub.

I've found a way to split my data by grouping and by stratification at the same time. It was not easy, because mlr3 doesn't provide a method to do both simultaneously, even though this is very common with medical data.
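For reference, mlr3 does expose grouping and stratification separately as task column roles, they just can't be combined in one resampling, which is why I split manually. A rough sketch, with task_demo as an illustrative name and assuming PatientID is still a column of the data:

library(mlr3)
task_demo <- as_task_classif(data, target = "LesionResponse")
# keep all rows of a patient in the same fold (PatientID is then no longer a feature)
task_demo$set_col_roles("PatientID", roles = "group")
# alternatively, stratify on the target instead of grouping:
# task_demo$set_col_roles("LesionResponse", add_to = "stratum")
rsmp("cv", folds = 3)$instantiate(task_demo)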

Here is my code, for those interested:

set.seed(1234)

split_data <- function(data, patient_col = "PatientID", target_col = "LesionResponse", train_ratio = 0.6, valid_ratio = 0.2) {
  repeat {
    # Group the rows by PatientID and shuffle the patient groups
    patients <- data[[patient_col]]
    patient_groups <- split(data, patients)
    patient_groups_shuffled <- sample(patient_groups)

    # Number of patients per set
    n_train <- round(length(patient_groups_shuffled) * train_ratio)
    n_valid <- round(length(patient_groups_shuffled) * valid_ratio)
    train_groups <- patient_groups_shuffled[1:n_train]
    valid_groups <- patient_groups_shuffled[(n_train + 1):(n_train + n_valid)]
    test_groups <- patient_groups_shuffled[(n_train + n_valid + 1):length(patient_groups_shuffled)]

    # Combine the groups into data frames
    train_set <- do.call(rbind, train_groups)
    valid_set <- do.call(rbind, valid_groups)
    test_set <- do.call(rbind, test_groups)

    # Check the proportion of "0" in each set
    proportion_train <- mean(train_set[[target_col]] == "0")
    proportion_valid <- mean(valid_set[[target_col]] == "0")
    proportion_test <- mean(test_set[[target_col]] == "0")

    if (proportion_train >= 0.70 && proportion_train <= 0.75 &&
        proportion_valid >= 0.70 && proportion_valid <= 0.75 &&
        proportion_test >= 0.70 && proportion_test <= 0.75) {
      break
    }
  }
  return(list(train = train_set, validation = valid_set, test = test_set))
}

split_data_sets <- split_data(data)
train_set <- split_data_sets$train
valid_set <- split_data_sets$validation
test_set <- split_data_sets$test
combined_data <- rbind(train_set, valid_set)

# Row indices of each set in the original data
train_rows_outer <- as.numeric(which(data$PatientID %in% combined_data$PatientID))
train_rows <- as.numeric(which(data$PatientID %in% train_set$PatientID))
valid_rows <- as.numeric(which(data$PatientID %in% valid_set$PatientID))
test_rows <- as.numeric(which(data$PatientID %in% test_set$PatientID))

Proportion check:

# Check the proportion of "0" in each set
train_proportion <- mean(train_set$LesionResponse == "0")
valid_proportion <- mean(valid_set$LesionResponse == "0")
test_proportion <- mean(test_set$LesionResponse == "0")

cat("Proportion of '0' in Train set: ", train_proportion, "\n")
cat("Proportion of '0' in Validation set: ", valid_proportion, "\n")
cat("Proportion of '0' in Test set: ", test_proportion, "\n")

# Check for common rows
common_train_valid <- intersect(train_set$PatientID, valid_set$PatientID)
common_train_test <- intersect(train_set$PatientID, test_set$PatientID)
common_valid_test <- intersect(valid_set$PatientID, test_set$PatientID)

# Print the number of rows in each set
cat("\nNumber of rows in Train set: ", nrow(train_set), "\n")
cat("Number of rows in Validation set: ", nrow(valid_set), "\n")
cat("Number of rows in Test set: ", nrow(test_set), "\n")

# Print the common rows
cat("\nCommon PatientIDs between Train and Validation sets:\n")
if (length(common_train_valid) == 0) {
  cat("Il n'y a aucun patient en commun.\n")
} else {
  cat("Ces IDs sont communs entre les deux groupes:\n")
  print(common_train_valid)
}

cat("\nCommon PatientIDs between Train and Test sets:\n")
if (length(common_train_test) == 0) {
  cat("Il n'y a aucun patient en commun.\n")
} else {
  cat("Ces IDs sont communs entre les deux groupes:\n")
  print(common_train_test)
}

cat("\nCommon PatientIDs between Validation and Test sets:\n")
if (length(common_valid_test) == 0) {
  cat("Il n'y a aucun patient en commun.\n")
} else {
  cat("Ces IDs sont communs entre les deux groupes:\n")
  print(common_valid_test)
}
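
# Drop PatientID (only needed for the split, it must not be used as a feature)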
data$PatientID <- NULL
train_set$PatientID <- NULL
valid_set$PatientID <- NULL
test_set$PatientID <- NULL

With this, I have no problem creating the three datasets. I normalized them, then I created inner and outer resamplings so that I have a test set to estimate the final performance.

data <- rbind(train_set, valid_set, test_set)
task <- as_task_classif(data, target = "LesionResponse")

# Creation of the OUTER resampling via a custom resampling
resampling_outer = rsmp("custom")
resampling_outer$instantiate(task, train = list(train_rows_outer), test = list(test_rows))

# Creation of the INNER resampling via a custom resampling
resampling_inner = rsmp("custom")
resampling_inner$instantiate(task, train = list(train_rows), test = list(valid_rows))

outer_resampling = resampling_outer
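To double-check which rows end up where, I inspect the instantiated resamplings (just a sanity-check sketch using the standard $train_set()/$test_set() methods):

# Sanity check: inspect the instantiated custom resamplings
resampling_outer$train_set(1)   # rows used for training in the outer loop
resampling_outer$test_set(1)    # rows held out for the final evaluation
resampling_inner$train_set(1)   # rows used for training during tuning
resampling_inner$test_set(1)    # rows used for validation during tuning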

Then, I'm trying to create, tune and run some learners like this one (with the help of an AutoTuner):

# Auto tuning xgboost
learner_xgboost = lrn("classif.xgboost", nrounds = to_tune(1, 500))

at_xgboost = auto_tuner(
    tuner = tnr("random_search"),
    learner = learner_xgboost,
    resampling = resampling_inner,
    measure = msr("classif.ce"),
    terminator = trm("evals", n_evals = 5),
    store_models = TRUE
)

design = benchmark_grid(task = task, learner = at_xgboost, resampling = resampling_outer)
bmr = benchmark(design, store_models = TRUE)

# Extract the learners optimized in the outer loop:
data_bmr = as.data.table(bmr)
outer_learners = map(data_bmr$learner, "learner")

# Extract the learners after the inner-loop optimization:
archives = extract_inner_tuning_archives(bmr)
inner_learners = map(archives$resample_result, "learners")

# Prediction on new data
at_xgboost$train(task, row_ids = train_rows_outer)
prediction_xgboost <- at_xgboost$predict(task, row_ids = test_rows)
prediction_xgboost$confusion
autoplot(prediction_xgboost)

The results are bad, so I tried to study the influence of the imbalanced data with this code:

# Definition
learner = at_xgboost

# 1. Oversample the minority class
graph_up <- po("classbalancing", id = "oversample", adjust = "minor",  reference = "minor", shuffle = FALSE, ratio = 2) %>>% at_xgboost

# 2. Undersample the majority class by a factor of '1/ratio'
graph_down <- po("classbalancing", id = "undersample", adjust = "major", reference = "major", shuffle = FALSE, ratio = 1/3) %>>% at_xgboost

# 3. Negative control (no particular strategy)
graph_nop <- at_xgboost

graph_nop <-  GraphLearner$new(graph_nop)
graph_nop$predict_type <- "prob"

graph_down <- GraphLearner$new(graph_down)
graph_down$predict_type <- "prob"

graph_up <- GraphLearner$new(graph_up)
graph_up$predict_type <- "prob"

#Benchmark
bmr <- benchmark(design = benchmark_grid(task, learner = list(graph_nop, graph_up, graph_down), resampling_outer), store_models = TRUE)

# Results
bmr$aggregate(msr("classif.ce"))
bmr$aggregate(msr("classif.auc"))

When I run the code, it doesn't work. I get an error of this type as soon as the code reaches the bmr part:

Error: Test set 1 of inner resampling 'custom' contains row ids not present in task 'task_test': {651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824}

When I check train_rows, valid_rows and test_rows, they are all present in the "data" set, and when I check whether these rows are in resampling_inner or resampling_outer, everything looks fine.
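For what it's worth, this is roughly how I check it (task$row_ids gives the row ids of the task):

# All custom-resampling row ids should exist in the task
all(train_rows %in% task$row_ids)
all(valid_rows %in% task$row_ids)
all(test_rows %in% task$row_ids)
all(resampling_inner$test_set(1) %in% task$row_ids)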

I don't really know where my mistake(s) is/are... Could someone help me?

EDIT: The data sample I gave contains too little information to run the set-creation part of the code correctly.


There is 1 best solution below.


Is it possible that, since the train() function of my at (AutoTuner) tunes on task_tuning, I don't need to specify the rows...

Yes. Just use at$train(task_tuning). Otherwise the test set is missing for the auto tuner.

i.e., my at trains and tunes on my task_tuning with the custom resampling (what I want), and then predicts on task_test, specifically on the test_task_rows (what I also want), right?

I don't know why you combine train_set and test_set into task_test if you only predict on test_set anyway. If task_test consists only of test_set, you don't need to pass row ids to at$predict().
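A minimal sketch of what I mean, assuming task_tuning is built from the train/validation data and task_test only from the test data:

at_xgboost$train(task_tuning)                # the inner custom resampling handles the tuning split
prediction <- at_xgboost$predict(task_test)  # no row_ids needed if task_test contains only the test set
prediction$confusion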