R Explaining Random Forest Variable Selection Sample Code

119 Views Asked by At

I have some sample code for random forest variable selection. The goal is to choose the combination of variables with the highest importance and build the random forest model with the lowest OOB (out-of-bag) error rate. Can anyone explain the for-loop part of the code for me?

 # Backward variable elimination with random survival forests.
 #
 # Each of the `n_repeats` repetitions starts from the full set of candidate
 # predictors, fits a forest, records the OOB error rate of the final tree,
 # drops the predictor with the lowest variable importance (VIMP) and refits,
 # until a single predictor is left. The predictor set whose forest had the
 # lowest OOB error rate is stored for that repetition.
 #
 # Expects the data frames `risk_full` and `risk` to exist, and dplyr, tibble,
 # stringr and randomForestSRC to be attached.
 #
 # Results:
 #   variable_selected_progress_biomarkers  list, selected names per repetition
 #   error_rate_min_progress_biomarkers     numeric, matching minimum OOB error
 clinical_variables <- c(
   "Age", "location", "smoke", "perianal_disease", "upper_tract",
   "LnASCA IgA", "LnASCA IgG", "LnANCA", "LnCbir", "LnOMPC", "CRP", "Albumin",
   "African American Race"
 )

 n_repeats <- 50
 base_seed <- 4182019

 variable_selected_progress_biomarkers <- vector("list", n_repeats)
 error_rate_min_progress_biomarkers <- rep(NA_real_, n_repeats)

 # Columns that must never be used as predictors (the two survival outcome
 # columns are re-attached explicitly when the model data is built).
 excluded_columns <- c(
   "STRICTURE", "TIM2STRICTURE", "PENETRATING", "TIM2PENETRATING", "BDNF",
   "LASTFOLLOWUPDAYSPROGRESS", "PROGRESSED"
 )

 # The starting predictor set is the same for every repetition, so compute it
 # once. Its length replaces the previously hard-coded 104.
 candidate_variables <- risk_full %>%
   select(all_of(union(names(risk), clinical_variables))) %>%
   select(-all_of(excluded_columns)) %>%
   names()
 n_candidates <- length(candidate_variables)

 for (j in seq_len(n_repeats)) {
   # Restart the elimination from the full predictor set.
   risk_progress_biomarker_variables <- candidate_variables

   # Element i holds the predictors used in fit i; the names give the number
   # of predictors in that fit (n_candidates down to 1).
   risk_progress_biomarker_variables_total <- vector("list", n_candidates)
   names(risk_progress_biomarker_variables_total) <- rev(seq_len(n_candidates))
   error_rate_tail_progress_biomarker <- rep(NA_real_, n_candidates)

   for (i in seq_len(n_candidates)) {
     # The seed depends on the repetition. Re-seeding with one constant before
     # every fit made all repetitions replay the same random stream.
     set.seed(base_seed + j)
     risk_progress_biomarker_variables_total[[i]] <-
       risk_progress_biomarker_variables

     rf_risk_progress_biomarker <- rfsrc(
       Surv(LASTFOLLOWUPDAYSPROGRESS, PROGRESSED) ~ .,
       data = risk_full %>%
         select(
           all_of(risk_progress_biomarker_variables),
           LASTFOLLOWUPDAYSPROGRESS,
           PROGRESSED
         ) %>%
         mutate(across(where(is.factor), as.numeric)),
       ntree = 1000,
       importance = TRUE
     )

     # OOB error rate after the last tree of this forest.
     error_rate_tail_progress_biomarker[i] <-
       tail(rf_risk_progress_biomarker$err.rate, n = 1)

     # Rank predictors by VIMP, highest first.
     rf_risk_progress_biomarker_importance <-
       rf_risk_progress_biomarker$importance %>%
       as.data.frame() %>%
       rownames_to_column() %>%
       as_tibble() %>%
       dplyr::rename(VIMP = ".") %>%
       arrange(desc(VIMP))

     # Drop the least important predictor for the next fit.
     risk_progress_biomarker_variables <-
       rf_risk_progress_biomarker_importance %>%
       head(-1) %>%
       pull(rowname)
     print(i)
   }

   tibble_error_rate_tail_progress_biomarker <- tibble(
     n = rev(seq_len(n_candidates)),
     error_rate = error_rate_tail_progress_biomarker
   )

   # which.min() always yields a single index (the first minimum, i.e. the
   # largest tied predictor set), so ties can no longer produce a vector index.
   best_step <- which.min(error_rate_tail_progress_biomarker)
   n_min_progress_biomarker <-
     tibble_error_rate_tail_progress_biomarker$n[best_step]
   error_rate_min_progress_biomarker <-
     tibble_error_rate_tail_progress_biomarker$error_rate[best_step]

   # Underscores are stripped from the stored variable names.
   variable_selected_progress_biomarkers[[j]] <- str_replace_all(
     risk_progress_biomarker_variables_total[[best_step]], "_", ""
   )
   error_rate_min_progress_biomarkers[j] <- error_rate_min_progress_biomarker
   print(paste("Finish", j))
 }
0

There are 0 best solutions below