I have the sample code of random forest variable selection. We want to choose the combination of variables with most importance and build the random forest model with the lowest OOB. Can anyone explain the for loop part in the function for me?
clinical_variables <- c("Age","location", "smoke", "perianal_disease","upper_tract", "LnASCA
IgA","LnASCA IgG", "LnANCA", "LnCbir", "LnOMPC", "CRP", "Albumin", "African American Race")
variable_selected_progress_biomarkers <- vector("list", 50)
error_rate_min_progress_biomarkers <- rep(NA, 50)
for (j in 1:50){
risk_progress_biomarker_variables <- risk_full %>%
select(names(risk), clinical_variables) %>%
select(-c("STRICTURE", "TIM2STRICTURE", "PENETRATING", "TIM2PENETRATING","BDNF","LASTFOLLOWUPDAYSPROGRESS", "PROGRESSED")) %>% names
risk_progress_biomarker_variables_total <- vector("list",104)
names(risk_progress_biomarker_variables_total) <- 104:1
error_rate_tail_progress_biomarker <- rep(NA, 104)
for (i in 1:104){
set.seed(4182019)
risk_progress_biomarker_variables_total[[i]] <- risk_progress_biomarker_variables
rf_risk_progress_biomarker <- rfsrc(
Surv(LASTFOLLOWUPDAYSPROGRESS, PROGRESSED) ~ .,
data = risk_full %>% select(risk_progress_biomarker_variables, LASTFOLLOWUPDAYSPROGRESS, PROGRESSED)%>%
mutate_if(is.factor, as.numeric),
ntree=1000,
importance = TRUE
)
error_rate_tail_progress_biomarker[i] <- tail(rf_risk_progress_biomarker$err.rate,n =1)
rf_risk_progress_biomarker_importance <- rf_risk_progress_biomarker$importance %>%
as.data.frame() %>%
rownames_to_column() %>%
as.tibble() %>%
dplyr::rename(VIMP = ".") %>%
arrange(desc(VIMP))
risk_progress_biomarker_variables <- rf_risk_progress_biomarker_importance %>%
head((dim(rf_risk_progress_biomarker_importance)[1]-1)) %>%
# top_n((dim(rf_risk_progress_biomarker_importance)[1]-1)) %>%
pull(rowname)
print(i)
}
tibble_error_rate_tail_progress_biomarker <- tibble(n = 104:1, error_rate = error_rate_tail_progress_biomarker)
suppressMessages(n_min_progress_biomarker <- tibble_error_rate_tail_progress_biomarker %>% top_n(-1) %>% pull(n))
suppressMessages(error_rate_min_progress_biomarker <- tibble_error_rate_tail_progress_biomarker %>% top_n(-1) %>% pull(error_rate))
variable_selected_progress_biomarkers[[j]] <- str_replace_all(risk_progress_biomarker_variables_total[[105-n_min_progress_biomarker]], "_", "")
error_rate_min_progress_biomarkers[j] <- error_rate_min_progress_biomarker
print(paste("Finish", j))
}