Assessing variable importance of REEMforest and MERF using the "iml" package

114 Views Asked by At

I recently started working with the LongituRF package. I'm fitting it to some data, and I'd like to assess the variable importance using the iml package. I have already used iml, and I like its properties. However, I'm not able to assess the variable importance when I'm working with LongituRF.

In the following code, I created some data and fitted a REEMforest and a MERF from the LongituRF package to the data. Then I'm trying to assess the variable importance, but I get this error message:

Error in initialize(...) : Please call Predictor$new() with the y target vector.

So apparently, Predictor$new() is not correctly defined in my code.

At the end of my example code, I also fit a randomForest to the data and assess the variable importance. As you can see, it works just fine there.

Do you have any idea on how I can solve this?

Example code:

# libraries ---------------------------------------------------------------

install.packages("LongituRF")
# (S)REEMforest is an adaptation of the random forest regression method to longitudinal data introduced by Capitaine et al. (2020) <doi:10.1177/0962280220946080>
library(LongituRF)

install.packages("iml")
# for assessing variable importance
library(iml)

# -------------------------------------------------------------------------
# a function that creates some data for me

#' Simulate two-level math-score data (observations nested in clusters)
#'
#' Builds a grid of `ni` level-1 units inside each of `nj` clusters and fills
#' it with simulated predictors, residual terms and a `math_score` outcome:
#' `math_score = gamma00 + gamma10 * studying + gamma20 * motivation +
#' gamma01 * atmosphere + U0j + Rij`.
#'
#' @param ni Number of level-1 units per cluster.
#' @param nj Number of clusters (level-2 units).
#' @param RI_sd Standard deviation of the cluster-level term `U0j`.
#' @param sigma2 Variance of the level-1 term `Rij`.
#' @param gamma00 Intercept.
#' @param gamma01 Coefficient of the cluster-level predictor `atmosphere`.
#' @param gamma10 Coefficient of the level-1 predictor `studying`.
#' @param gamma02 Currently not used in the data-generating equation; kept so
#'   that existing calls which pass it keep working.
#' @param gamma20 Coefficient of the level-1 predictor `motivation`.
#' @return A data frame with `ni * nj` rows and the columns `ni`, `nj`,
#'   `studying`, `atmosphere`, `motivation`, `math_score`, `Rij` and `U0j`.
#'   A zero `ni` or `nj` yields a zero-row data frame.
dgp_math_s <- function(ni, nj, RI_sd, sigma2 = 1,
                       gamma00 = 0, gamma01 = 0, gamma10 = 0, gamma02 = 0,
                       gamma20 = 0) {
  n_obs <- ni * nj

  # seq_len() instead of 1:n so that a zero count gives an empty grid rather
  # than the sequence c(1, 0). `ni` varies fastest in expand.grid(), so every
  # cluster occupies `ni` consecutive rows.
  dgp_grid <- expand.grid(
    ni = seq_len(ni),
    nj = seq_len(nj),
    studying = NA,
    atmosphere = NA,
    motivation = NA,
    math_score = NA,
    Rij = NA,
    U0j = NA
  )

  # The random draws below stay in their original order so that results under
  # a fixed seed do not change.

  # Binary level-2 predictor: one draw per cluster, repeated for all its rows.
  dgp_grid$atmosphere <- rep(rbinom(nj, 1, 0.5), each = ni)

  # Level-2 term: one draw per cluster (note: centred at 3, not 0).
  dgp_grid$U0j <- rep(rnorm(nj, mean = 3, sd = RI_sd), each = ni)

  # Level-1 term with variance sigma2 (note: centred at 3, not 0).
  dgp_grid$Rij <- rnorm(n_obs, mean = 3, sd = sqrt(sigma2))

  # Level-1 predictors: integers drawn uniformly from 0..5 with replacement.
  dgp_grid$studying <- sample(0:5, n_obs, replace = TRUE)
  dgp_grid$motivation <- sample(0:5, n_obs, replace = TRUE)

  # Outcome: fixed part plus both random terms.
  dgp_grid$math_score <-
    gamma00 +
    gamma10 * dgp_grid$studying +
    gamma20 * dgp_grid$motivation +
    gamma01 * dgp_grid$atmosphere +
    dgp_grid$U0j +
    dgp_grid$Rij

  dgp_grid
}
# -------------------------------------------------------------------------

# Simulate the example data: 20 clusters with 20 observations each.
dgp_math <- dgp_math_s(ni = 20, nj = 20, RI_sd = 2, gamma10 = 0, gamma01 = 0)


# Fitting REEMforest ------------------------------------------------------

# Feature columns handed to the forest.
predictors <- dgp_math[, c("studying", "atmosphere", "motivation")]

# Observed outcome as a plain vector.
outcome <- as.vector(dgp_math$math_score)

# Z is a single column of ones; clusters (`nj`) serve as ids and the
# within-cluster index (`ni`) as the time variable.
SREEMF <- LongituRF::REEMforest(
  X = predictors,
  Y = dgp_math$math_score,
  Z = matrix(1, nrow = nrow(dgp_math), ncol = 1),
  id = dgp_math$nj,
  time = dgp_math$ni,
  ntree = 100,
  sto = "none",
  mtry = 2
)

#Fitting REEMforest


# Fitting MERF ------------------------------------------------------------

# Same inputs as for the REEMforest fit: Z is a single column of ones,
# clusters (`nj`) are the ids and the within-cluster index (`ni`) is the time.
MERF <- LongituRF::MERF(
  X = predictors,
  Y = dgp_math$math_score,
  Z = matrix(1, nrow = nrow(dgp_math), ncol = 1),
  id = dgp_math$nj,
  time = dgp_math$ni,
  ntree = 100,
  sto = "none",
  mtry = 2
)

#Fitting MERF


# Assessing variable importance using "iml" -------------------------------


# iml needs the features and the target separately: the predictor columns go
# in `data` and the observed outcome goes in `y`. Binding the outcome onto
# `data` with cbind() only adds one more (oddly named) feature column; the
# target is still missing, which is what triggers
# "Please call Predictor$new() with the y target vector."
# NOTE(review): `$forest` is presumably the fitted fixed-effects random forest
# only, so importance is measured without the estimated random effects --
# confirm against the LongituRF documentation.

# Variable importance of REEMforest
pred_reemforest <- iml::Predictor$new(
  SREEMF$forest,
  data = predictors,
  y = outcome
)
imp_reemforest <- iml::FeatureImp$new(
  pred_reemforest,
  loss = "mse",
  compare = "difference"
)$results


# Variable importance of MERF
pred_merf <- iml::Predictor$new(
  MERF$forest,
  data = predictors,
  y = outcome
)
imp_merf <- iml::FeatureImp$new(
  pred_merf,
  loss = "mse",
  compare = "difference"
)$results

# The original script reused `pred` and `imp` for both models, leaving them
# bound to the MERF objects; keep those names available for existing code.
pred <- pred_merf
imp <- imp_merf
# Variable importance of MERF



# example using CARTforest ------------------------------------------------
install.packages("randomForest")
library(randomForest)

# Plain random forest on the same data, fitted through the formula interface.
mybreimanforest <- randomForest::randomForest(
  math_score ~ studying + motivation + atmosphere,
  data = dgp_math,
  ntree = 500
)


## Variable importance using iml -------------------------------------------

# The predictor is built from the fitted forest and the full data frame.
pred_breimanforest <- iml::Predictor$new(mybreimanforest, data = dgp_math)

# Feature importance with MSE loss, reported as a difference.
imp_breimanforest <- iml::FeatureImp$new(
  pred_breimanforest,
  loss = "mse",
  compare = "difference"
)$results
#this works for the randomforest

0

There are 0 best solutions below