Recursive feature elimination (rfe) with ranger random forest in R

150 Views Asked by At

I'm trying to run recursive feature elimination with a random forest from the ranger package, using permutation importance, because I have a large dataset with a lot of correlated variables. I'm struggling to set up the control function (or at least I'm pretty sure that's the problem); I've created a reproducible example below.

I've done some research and found a good general guide here, Github RFE Guide, a stackoverflow solution for the party cforest function here, Stackoverflow RFE with Party, and a thread that discusses using ranger with RFE but I honestly can't follow it: Github Ranger discussion. I've been trying to cobble these together to figure out what I need.

This is what I have so far, can anyone offer some insight? Thanks in advance!

##########
# Set Up #
##########
# Load the iris data set
library(datasets)
data(iris)

# Fix the RNG state so the simulated noise columns (and anything sampled
# afterwards) are the same on every run of this example
set.seed(1)

# Length of iris dataset
n <- nrow(iris)

# Add some useless variables
# Correlated with another predictor
iris$Petal.Length2 <- 2.5 * iris$Petal.Length + rnorm(n, 0, 1)
iris$Petal.Length3 <- iris$Petal.Length - rnorm(n, 0, 2)
# Noisy rescaling of Sepal.Length, so the column matches its name
# (it was previously derived from Petal.Length)
iris$Sepal.Length2 <- 2.5 * iris$Sepal.Length + rnorm(n, 0, 1)
# Random variables
iris$Rand1 <- rnorm(n, 0, 1)
iris$Rand2 <- rnorm(n, 1, 2)

# Create training and test sets (80/20 split via caret)
library(caret)
train_index <- createDataPartition(iris$Species, p = 0.80, list = FALSE)[, 1]
train <- iris[train_index, ]
test <- iris[-train_index, ]

# Subset out the predictors and response.
# Base-R indexing is used here because `%>%` and `select()` come from
# dplyr/magrittr, which this script never loads.
# train_pred: data frame of every column except Species
# train_resp: one-column data frame holding Species
train_pred <- train[, setdiff(names(train), "Species"), drop = FALSE]
train_resp <- train["Species"]

library(ranger)
# Baseline ranger random forest on the full training set:
# classify Species from every other column, recording permutation importance
ranger1 <- ranger(
  factor(Species) ~ .,
  data = train,
  importance = "permutation",
  seed = 1,
  oob.error = TRUE,
  mtry = 3,
  num.trees = 100
)

# Try to define the functions needed to implement a ranger model with permutation importance
# as above with RFE
#
# `ranger_perm` is the list of helper functions handed to rfeControl(functions = ...)
# further down: summary / fit / pred / rank, plus two size/variable selectors.
ranger_perm <-  list(summary = defaultSummary, # summary for forest model
                     # fit: bind predictors `x` and outcome `y` into one data frame and
                     # fit a ranger forest on `y ~ .` with permutation importance.
                     # `first`, `last` and `...` are accepted but not used in the body.
                     fit = function(x, y, first, last, ...) {
                       loadNamespace("ranger")
                       data = data.frame(cbind(x, y))
                       model1 <- ranger::ranger(y ~ .,
                                      data = data,
                                      oob.error = TRUE,
                                      seed = 1,
                                      importance = "permutation") # output is forest model
                       model1
                     },
                     # pred: returns whatever predict() gives back for the fitted model.
                     # NOTE(review): for a ranger model this is a list-like prediction
                     # object, not a plain vector of predictions.
                     pred = function(object, x)  { #predict new variables
                       tmp <- predict(object, data = x)
                       tmp
                     },
                     # rank: build a data frame with columns `Overall` (importance) and
                     # `var` (variable name), sorted by decreasing importance.
                     rank = function(object, x, y) { # extract variable importance from model (rows are variables, columns classes)
                       vimp <- data.frame(Overall = object$variable.importance)
                       # Per-class averaging only runs when every level of `y` is a column
                       # of `vimp`; `vimp` is built with the single column `Overall` above.
                       if(is.factor(y)) {
                         if(all(levels(y) %in% colnames(vimp))) {
                           avImp <- apply(vimp[, levels(y), drop = TRUE], 1, mean)
                           vimp$Overall <- avImp
                         }
                       }
                       vimp <- vimp[order(vimp$Overall, decreasing = TRUE),, drop = FALSE]
                       # With a single predictor take the name from `x`; otherwise use
                       # the row names of the importance table.
                       if (ncol(x) == 1) {
                         vimp$var <- colnames(x)
                       } else vimp$var <- rownames(vimp)
                       vimp
                     },
                     # NOTE(review): `<-` inside list() performs an assignment and leaves
                     # the resulting list element unnamed, so this list has no elements
                     # called `selectSize` / `selectVar`; `=` was probably intended.
                     selectSize <- pickSizeBest,
                     selectVar <- pickVars
                     )
# Control function: resampling scheme and helper functions for the RFE run
control <- rfeControl(
  functions = ranger_perm,  # ranger model with permutation importance
  method = "repeatedcv",    # repeated cross-validation
  number = 2,               # number of folds
  repeats = 5,              # times to repeat cross validation procedure
  allowParallel = TRUE      # allow parallelization
)

# Run recursive feature elimination over subset sizes 1 through 9
rfe1 <- rfe(
  x = train_pred,
  y = as.factor(unlist(train_resp)),
  sizes = 1:9,
  rfeControl = control
)

I get the error: "Error in { : task 1 failed - "undefined columns selected""

1

There is 1 best solution below

0
Jade131621 On

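It looks like I solved my own problem. My issue was that `predict()` on a ranger model returns a prediction object (a list) rather than a plain vector as it does for randomForest, so the `pred` function has to return its `$predictions` element. Note that the working version below also uses `=` instead of `<-` for `selectSize` and `selectVar`, so that those list elements are actually named.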

# Helper functions for caret::rfe() that fit a ranger random forest and rank
# predictors by permutation importance.
#   summary    - resampling performance summary
#   fit        - fit the forest on predictors `x` and outcome `y`
#   pred       - predictions for new data as a plain vector
#   rank       - data frame with columns `Overall` and `var`, sorted by
#                decreasing importance
#   selectSize - chooses the subset size from the resampled performance
#   selectVar  - chooses the variables for that size
ranger_perm <- list(
  summary = defaultSummary,
  fit = function(x, y, first, last, ...) {
    loadNamespace("ranger")
    # as.data.frame() keeps the column names untouched, and adding the outcome
    # as its own column keeps a factor `y` a factor even when `x` is a matrix
    # (cbind() on a matrix would silently turn it into integer codes).
    dat <- as.data.frame(x)
    dat[[".outcome"]] <- y
    # The non-formula interface is used so predictor names do not have to be
    # syntactically valid. `first`, `last` and `...` are accepted for
    # interface compatibility but are not used.
    ranger::ranger(
      dependent.variable.name = ".outcome",
      data = dat,
      oob.error = TRUE,
      seed = 1,
      importance = "permutation"
    )
  },
  pred = function(object, x) {
    # predict() on a ranger model returns a list-like object; the predicted
    # values themselves are in `$predictions`
    predict(object, data = x)$predictions
  },
  rank = function(object, x, y) {
    # ranger stores one importance value per predictor, so there are no
    # per-class columns to average; `y` is kept only for the interface
    vimp <- data.frame(Overall = object$variable.importance)
    vimp <- vimp[order(vimp$Overall, decreasing = TRUE), , drop = FALSE]
    # With a single predictor take the name from `x`; otherwise use the row
    # names carried over from the importance vector
    vimp$var <- if (ncol(x) == 1) colnames(x) else rownames(vimp)
    vimp
  },
  selectSize = pickSizeBest,
  selectVar = pickVars
)

# Control function: resampling scheme and helper functions for the RFE run
control <- rfeControl(
  functions = ranger_perm,  # ranger model with permutation importance
  method = "repeatedcv",    # repeated cross-validation
  repeats = 5,              # times to repeat cross validation procedure
  number = 2,               # number of folds
  allowParallel = TRUE      # allow parallelization
)

# Run recursive feature elimination over subset sizes 1 through 9,
# selecting the size with the highest resampled accuracy
rfe1 <- rfe(
  x = train_pred,
  y = as.factor(unlist(train_resp)),
  sizes = 1:9,
  rfeControl = control,
  metric = "Accuracy",
  maximize = TRUE  # logical flag, not the string "TRUE"
)