Subscript out of Bounds in tidymodels tuning

404 Views Asked by At

When using the tidymodels dataset, I run into a "subscript out of bounds error". I am using the Ames house prices dataset from Kaggle by the way.

First, I create my recipe:

data.split = initial_split(house.prices, prop=0.8)

train.data = training(data.split)
test.data = testing(data.split)

# Create recipe - OLS
sales.rec = recipe(SalePrice ~., data = train.data) %>%
  step_log(SalePrice, LotArea, GrLivArea, TotRmsAbvGrd) %>%
  update_role(Id, SalePrice, new_role = "ID") %>%
  step_num2factor(MSSubClass, levels = as.character(unique(house.prices$SalePrice))) %>%
  step_unknown(PoolQC, Fence, MiscFeature, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1,
               FireplaceQu, GarageType, GarageQual, GarageCond, new_level = "None") %>%
  step_mutate(PorchArea = OpenPorchSF+EnclosedPorch+`3SsnPorch`+ScreenPorch) %>%
  step_mutate(garage.age = YrSold - GarageYrBlt,
         house.age = YrSold - YearBuilt,
         renovation.age = YrSold - YearRemodAdd
         ) %>%
  step_mutate(has.garage = (GarageType != NA),
         has.basement = (BsmtExposure != NA),
         has.pool = (PoolQC != NA),
         is.new = house.age==0
         ) %>%
  step_cut(OverallQual, breaks = c(2.5, 6.5, 8.5)) %>%
  step_cut(OverallCond, breaks = c(2.5, 6.5, 8.5)) %>%
  step_mutate(house.age = log(1+house.age)) %>%
  step_rm(-c(SalePrice, house.age, renovation.age, 
             has.garage, has.basement, has.pool, is.new,
             MSZoning, LotArea, Alley, LotShape, Utilities, Neighborhood, OverallQual, OverallCond, 
             ExterCond, Foundation,
             BsmtFinSF1, Heating, HeatingQC, CentralAir, GrLivArea, FullBath, KitchenQual, TotRmsAbvGrd,
             PavedDrive, PorchArea, MiscVal,SaleCondition)) %>% 
  step_other(all_nominal_predictors(), all_factor(), all_string()) %>%
  step_string2factor(all_string_predictors()) %>%
  step_nzv(all_predictors()) %>%
  step_impute_median(all_numeric_predictors()) %>%
  step_unknown(all_factor_predictors()) %>%
  step_normalize(all_numeric_predictors())

Then I set up a ranger random forest model:

my.rf = rand_forest(mtry = tune(), trees = 2000, min_n=tune()) %>%
  set_engine("ranger") %>%
  set_mode("regression")

Finally, I create a workflow and try to tune the model:

tree.grid = expand.grid(min_n=c(2,14,27,40), mtry = c(4, 8, 12))
folds = rsample::vfold_cv(train.data, v = 5)
metric = metric_set(rmse)


set_dependency("rand_forest", "ranger", "ranger", mode = "regression")
# Random forest
my.rf.rec = sales.rec %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_other(all_nominal_predictors())
my.rf.wflow = workflow() %>%
  add_model(my.rf) %>%
  add_recipe(my.rf.rec)

my.rf.res = my.rf.wflow %>%
  tune_grid(
  resamples = folds,
  metrics = metric,
  grid = tree.grid
)

However, upon running that final lines, I get the following error: "x Fold1: preprocessor 1/1, model 1/12: Error in y.mat[, 2]: subscript out of bounds" (for lots of preprocessor and models), followed by "Warning: All models failed. Run show_notes(.Last.tune.result) for more information." (if I run the command indicated in the error, it just gives me "Error in y.mat[, 2]: subscript out of bounds")

I've tried googling everything, but cannot find the source of the error, any help would be appreciated :)

EDIT: Here's the head of the dataset, hopefully it helps?

structure(list(Id = c(1, 2, 3, 4, 5, 6), MSSubClass = c(60, 20, 
60, 70, 60, 50), MSZoning = c("RL", "RL", "RL", "RL", "RL", "RL"
), LotFrontage = c(65, 80, 68, 60, 84, 85), LotArea = c(8450, 
9600, 11250, 9550, 14260, 14115), Street = c("Pave", "Pave", 
"Pave", "Pave", "Pave", "Pave"), Alley = c(NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_), 
    LotShape = c("Reg", "Reg", "IR1", "IR1", "IR1", "IR1"), LandContour = c("Lvl", 
    "Lvl", "Lvl", "Lvl", "Lvl", "Lvl"), Utilities = c("AllPub", 
    "AllPub", "AllPub", "AllPub", "AllPub", "AllPub"), LotConfig = c("Inside", 
    "FR2", "Inside", "Corner", "FR2", "Inside"), LandSlope = c("Gtl", 
    "Gtl", "Gtl", "Gtl", "Gtl", "Gtl"), Neighborhood = c("CollgCr", 
    "Veenker", "CollgCr", "Crawfor", "NoRidge", "Mitchel"), Condition1 = c("Norm", 
    "Feedr", "Norm", "Norm", "Norm", "Norm"), Condition2 = c("Norm", 
    "Norm", "Norm", "Norm", "Norm", "Norm"), BldgType = c("1Fam", 
    "1Fam", "1Fam", "1Fam", "1Fam", "1Fam"), HouseStyle = c("2Story", 
    "1Story", "2Story", "2Story", "2Story", "1.5Fin"), OverallQual = c(7, 
    6, 7, 7, 8, 5), OverallCond = c(5, 8, 5, 5, 5, 5), YearBuilt = c(2003, 
    1976, 2001, 1915, 2000, 1993), YearRemodAdd = c(2003, 1976, 
    2002, 1970, 2000, 1995), RoofStyle = c("Gable", "Gable", 
    "Gable", "Gable", "Gable", "Gable"), RoofMatl = c("CompShg", 
    "CompShg", "CompShg", "CompShg", "CompShg", "CompShg"), Exterior1st = c("VinylSd", 
    "MetalSd", "VinylSd", "Wd Sdng", "VinylSd", "VinylSd"), Exterior2nd = c("VinylSd", 
    "MetalSd", "VinylSd", "Wd Shng", "VinylSd", "VinylSd"), MasVnrType = c("BrkFace", 
    "None", "BrkFace", "None", "BrkFace", "None"), MasVnrArea = c(196, 
    0, 162, 0, 350, 0), ExterQual = c("Gd", "TA", "Gd", "TA", 
    "Gd", "TA"), ExterCond = c("TA", "TA", "TA", "TA", "TA", 
    "TA"), Foundation = c("PConc", "CBlock", "PConc", "BrkTil", 
    "PConc", "Wood"), BsmtQual = c("Gd", "Gd", "Gd", "TA", "Gd", 
    "Gd"), BsmtCond = c("TA", "TA", "TA", "Gd", "TA", "TA"), 
    BsmtExposure = c("No", "Gd", "Mn", "No", "Av", "No"), BsmtFinType1 = c("GLQ", 
    "ALQ", "GLQ", "ALQ", "GLQ", "GLQ"), BsmtFinSF1 = c(706, 978, 
    486, 216, 655, 732), BsmtFinType2 = c("Unf", "Unf", "Unf", 
    "Unf", "Unf", "Unf"), BsmtFinSF2 = c(0, 0, 0, 0, 0, 0), BsmtUnfSF = c(150, 
    284, 434, 540, 490, 64), TotalBsmtSF = c(856, 1262, 920, 
    756, 1145, 796), Heating = c("GasA", "GasA", "GasA", "GasA", 
    "GasA", "GasA"), HeatingQC = c("Ex", "Ex", "Ex", "Gd", "Ex", 
    "Ex"), CentralAir = c("Y", "Y", "Y", "Y", "Y", "Y"), Electrical = c("SBrkr", 
    "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr"), `1stFlrSF` = c(856, 
    1262, 920, 961, 1145, 796), `2ndFlrSF` = c(854, 0, 866, 756, 
    1053, 566), LowQualFinSF = c(0, 0, 0, 0, 0, 0), GrLivArea = c(1710, 
    1262, 1786, 1717, 2198, 1362), BsmtFullBath = c(1, 0, 1, 
    1, 1, 1), BsmtHalfBath = c(0, 1, 0, 0, 0, 0), FullBath = c(2, 
    2, 2, 1, 2, 1), HalfBath = c(1, 0, 1, 0, 1, 1), BedroomAbvGr = c(3, 
    3, 3, 3, 4, 1), KitchenAbvGr = c(1, 1, 1, 1, 1, 1), KitchenQual = c("Gd", 
    "TA", "Gd", "Gd", "Gd", "TA"), TotRmsAbvGrd = c(8, 6, 6, 
    7, 9, 5), Functional = c("Typ", "Typ", "Typ", "Typ", "Typ", 
    "Typ"), Fireplaces = c(0, 1, 1, 1, 1, 0), FireplaceQu = c(NA, 
    "TA", "TA", "Gd", "TA", NA), GarageType = c("Attchd", "Attchd", 
    "Attchd", "Detchd", "Attchd", "Attchd"), GarageYrBlt = c(2003, 
    1976, 2001, 1998, 2000, 1993), GarageFinish = c("RFn", "RFn", 
"RFn", "Unf", "RFn", "Unf"), GarageCars = c(2, 2, 2, 3, 3, 
    2), GarageArea = c(548, 460, 608, 642, 836, 480), GarageQual = c("TA", 
    "TA", "TA", "TA", "TA", "TA"), GarageCond = c("TA", "TA", 
    "TA", "TA", "TA", "TA"), PavedDrive = c("Y", "Y", "Y", "Y", 
    "Y", "Y"), WoodDeckSF = c(0, 298, 0, 0, 192, 40), OpenPorchSF = c(61, 
    0, 42, 35, 84, 30), EnclosedPorch = c(0, 0, 0, 272, 0, 0), 
    `3SsnPorch` = c(0, 0, 0, 0, 0, 320), ScreenPorch = c(0, 0, 
    0, 0, 0, 0), PoolArea = c(0, 0, 0, 0, 0, 0), PoolQC = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_), Fence = c(NA, NA, NA, NA, NA, "MnPrv"), MiscFeature = c(NA, 
    NA, NA, NA, NA, "Shed"), MiscVal = c(0, 0, 0, 0, 0, 700), 
    MoSold = c(2, 5, 9, 2, 12, 10), YrSold = c(2008, 2007, 2008, 
    2006, 2008, 2009), SaleType = c("WD", "WD", "WD", "WD", "WD", 
    "WD"), SaleCondition = c("Normal", "Normal", "Normal", "Abnorml", 
    "Normal", "Normal"), SalePrice = c(208500, 181500, 223500, 
    140000, 250000, 143000)), row.names = c(NA, -6L), class = c("tbl_df", 
"tbl", "data.frame"))
0

There are 0 best solutions below