Extract residuals from an XGBoost regression as a single raster

192 Views Asked by At

In R, using the caret and xgboost packages and this tutorial, I am running an XGBoost regression (XGBR) and I want to extract the residuals of the XGBR. I hyper-tuned the model using the caret package and then, using the 'best' model parameters I used the xgboost package to perform the regression.

My dataset has the ntl, pop, tirs, agbh variables stored in data.frame (ntl is the dependent variable while the other three are the independent). Assuming that my XGBR model is called m and my data.frame is called block.data, I did:

library(caret)
library(terra)
library(xgboost)
library(doParallel)
library(dplyr)
library(ggplot2)
library(glue)
library(ModelMetrics)
library(readr)

wd = "path/"

block.data = read.csv(paste0(wd, "block.data.csv"))

block.data = subset(block.data, select = c(ntl, tirs, pop, agbh))

set.seed(1123)

samp <- sample(nrow(block.data), 0.80 * nrow(block.data))

train <- block.data[samp, ]
train_x <- as.matrix(select(train, -ntl))
train_y <- train$ntl

test <- block.data[-samp, ]
test_x <- select(test, -ntl)
test_y <- test$ntl

no_cores <- detectCores() - 1
cl = makePSOCKcluster(no_cores)
registerDoParallel(cl)

# default model
grid_default <- expand.grid(
  nrounds = 100,
  max_depth = 6,
  eta = 0.3,
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)

train_control <- caret::trainControl(
  method = "none",
  verboseIter = FALSE, # no training log
  allowParallel = TRUE # FALSE for reproducible results 
)

xgb_base <- caret::train(
  x = train_x,
  y = train_y,
  trControl = train_control,
  tuneGrid = grid_default,
  method = "xgbTree",
  verbose = TRUE
)

# hyperparameter tuning
# setting up the maximum number of trees
nrounds <- 1000

# note to start nrounds from 200, as smaller learning rates result in errors so
# big with lower starting points that they'll mess the scales
tune_grid <- expand.grid(
  nrounds = seq(from = 200, to = nrounds, by = 50),
  eta = c(0.025, 0.05, 0.1, 0.3),
  max_depth = c(2, 3, 4, 5, 6),
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)

tune_control <- caret::trainControl(
  method = "cv", # cross-validation
  number = 3, # with n folds 
  #index = createFolds(tr_treated$Id_clean), # fix the folds
  verboseIter = FALSE, # no training log
  allowParallel = TRUE # FALSE for reproducible results 
)

xgb_tune <- caret::train(
  x = train_x,
  y = train_y,
  trControl = tune_control,
  tuneGrid = tune_grid,
  method = "xgbTree",
  verbose = TRUE
)

tuneplot <- function(x, probs = .90) {
  ggplot(x) +
    coord_cartesian(ylim = c(quantile(x$results$RMSE, probs = probs), min(x$results$RMSE))) +
    theme_bw()
}

tuneplot(xgb_tune)

xgb_tune$bestTune

# find maximum depth
tune_grid2 <- expand.grid(
  nrounds = seq(from = 50, to = nrounds, by = 50),
  eta = xgb_tune$bestTune$eta,
  max_depth = ifelse(xgb_tune$bestTune$max_depth == 2,
                     c(xgb_tune$bestTune$max_depth:4),
                     xgb_tune$bestTune$max_depth - 1:xgb_tune$bestTune$max_depth + 1),
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = c(1, 2, 3),
  subsample = 1
)

xgb_tune2 <- caret::train(
  x = train_x,
  y = train_y,
  trControl = tune_control,
  tuneGrid = tune_grid2,
  method = "xgbTree",
  verbose = TRUE
)

tuneplot(xgb_tune2)

xgb_tune2$bestTune

# different values for row and column sampling
tune_grid3 <- expand.grid(
  nrounds = seq(from = 50, to = nrounds, by = 50),
  eta = xgb_tune$bestTune$eta,
  max_depth = xgb_tune2$bestTune$max_depth,
  gamma = 0,
  colsample_bytree = c(0.4, 0.6, 0.8, 1.0),
  min_child_weight = xgb_tune2$bestTune$min_child_weight,
  subsample = c(0.5, 0.75, 1.0)
)

xgb_tune3 <- caret::train(
  x = train_x,
  y = train_y,
  trControl = tune_control,
  tuneGrid = tune_grid3,
  method = "xgbTree",
  verbose = TRUE
)

tuneplot(xgb_tune3, probs = .95)

xgb_tune3$bestTune

set.seed(57)
omp_set_num_threads(2) # caret parallel processing threads

# gamma
tune_grid4 <- expand.grid(
  nrounds = seq(from = 50, to = nrounds, by = 50),
  eta = xgb_tune$bestTune$eta,
  max_depth = xgb_tune2$bestTune$max_depth,
  gamma = c(0, 0.05, 0.1, 0.5, 0.7, 0.9, 1.0),
  colsample_bytree = xgb_tune3$bestTune$colsample_bytree,
  min_child_weight = xgb_tune2$bestTune$min_child_weight,
  subsample = xgb_tune3$bestTune$subsample
)

xgb_tune4 <- caret::train(
  x = train_x,
  y = train_y,
  trControl = tune_control,
  tuneGrid = tune_grid4,
  method = "xgbTree",
  verbose = TRUE
)

tuneplot(xgb_tune4)

xgb_tune4$bestTune

# Reducing the Learning Rate
tune_grid5 <- expand.grid(
  nrounds = seq(from = 100, to = 10000, by = 100),
  eta = c(0.01, 0.015, 0.025, 0.05, 0.1),
  max_depth = xgb_tune2$bestTune$max_depth,
  gamma = xgb_tune4$bestTune$gamma,
  colsample_bytree = xgb_tune3$bestTune$colsample_bytree,
  min_child_weight = xgb_tune2$bestTune$min_child_weight,
  subsample = xgb_tune3$bestTune$subsample
)

xgb_tune5 <- caret::train(
  x = train_x,
  y = train_y,
  trControl = tune_control,
  tuneGrid = tune_grid5,
  method = "xgbTree",
  verbose = TRUE
)

tuneplot(xgb_tune5)

xgb_tune5$bestTune

# Fitting the Model
(final_grid <- expand.grid(
  nrounds = xgb_tune5$bestTune$nrounds,
  eta = xgb_tune5$bestTune$eta,
  max_depth = xgb_tune5$bestTune$max_depth,
  gamma = xgb_tune5$bestTune$gamma,
  colsample_bytree = xgb_tune5$bestTune$colsample_bytree,
  min_child_weight = xgb_tune5$bestTune$min_child_weight,
  subsample = xgb_tune5$bestTune$subsample
))

(xgb_model <- caret::train(
  x = train_x,
  y = train_y,
  trControl = train_control,
  tuneGrid = final_grid,
  method = "xgbTree",
  verbose = TRUE
))

stopCluster(cl)

# apply model to the whole data set using xgboost
xgb_m <- xgb.DMatrix(data = data.matrix(block.data), label = block.data$ntl)

m = xgb.train(data = xgb_m, 
            max.depth = xgb_tune5$bestTune$max_depth, 
            # watchlist = watchlist, 
            nrounds = xgb_tune5$bestTune$nrounds, 
            min_child_weight = xgb_tune5$bestTune$min_child_weight, 
            subsample = xgb_tune5$bestTune$subsample, 
            eta = xgb_tune5$bestTune$eta, 
            gamma = xgb_tune5$bestTune$gamma,
            colsample_bytree = xgb_tune5$bestTune$colsample_bytree, 
            objective = "reg:squarederror")

m

# export xgb residuals
xgb_resids = predict(m, xgb_m, na.rm = TRUE)

sb = c(ntl, pop_res, tirs_res, agbh_res)

xgb_resids = sb$ntl - xgb_resids
plot(xgb_resids)

The plot looks like this:

residuals

Obviously, I am doing something very wrong. How can I export the residuals of an XGBR as a single raster?

Here is a very small sample of my dataset:

block.data = structure(list(x = c(11880750L, 11879250L, 11879750L, 11880250L, 
11880750L, 11881250L, 11879250L, 11879750L, 11880250L, 11880750L, 
11881250L, 11878750L, 11879250L, 11879750L, 11880250L, 11880750L, 
11881250L, 11879250L, 11879750L, 11880250L, 11880750L, 11881250L, 
11881750L, 11882250L, 11879250L, 11879750L, 11880250L, 11880750L, 
11881250L, 11881750L, 11882250L, 11882750L, 11879250L, 11879750L
), y = c(1802250L, 1801750L, 1801750L, 1801750L, 1801750L, 1801750L, 
1801250L, 1801250L, 1801250L, 1801250L, 1801250L, 1800750L, 1800750L, 
1800750L, 1800750L, 1800750L, 1800750L, 1800250L, 1800250L, 1800250L, 
1800250L, 1800250L, 1800250L, 1800250L, 1799750L, 1799750L, 1799750L, 
1799750L, 1799750L, 1799750L, 1799750L, 1799750L, 1799250L, 1799250L
), ntl = c(18.7969169616699, 25.7222957611084, 23.4188251495361, 
25.4322757720947, 16.4593601226807, 12.7868213653564, 30.9337253570557, 
29.865758895874, 30.4080600738525, 29.5479888916016, 24.3493347167969, 
35.2427635192871, 38.989933013916, 34.6536979675293, 29.4607238769531, 
30.7469024658203, 34.3946380615234, 42.8660278320312, 34.7930717468262, 
30.9516315460205, 32.20654296875, 39.999755859375, 46.6002235412598, 
38.6480979919434, 60.5214920043945, 33.1799964904785, 31.8498134613037, 
30.9209423065186, 32.2269744873047, 53.7062034606934, 45.5225944519043, 
38.3570976257324, 123.040382385254, 73.0528182983398), pop = c(19.6407718658447, 
610.009216308594, 654.812622070312, 426.475830078125, 66.3839492797852, 
10.6471328735352, 443.848846435547, 602.677429199219, 488.478454589844, 
387.470947265625, 58.2341117858887, 413.888488769531, 315.057678222656, 
354.082946777344, 602.827758789062, 463.518829345703, 296.713928222656, 
923.920593261719, 434.436645507812, 799.562927246094, 404.709564208984, 
265.043304443359, 366.697235107422, 399.851684570312, 952.2314453125, 
870.356994628906, 673.406616210938, 493.521606445312, 273.841888427734, 
371.428619384766, 383.057830810547, 320.986755371094, 991.131225585938, 
1148.87768554688), tirs = c(39.7242431640625, 44.9583969116211, 
41.4048385620117, 42.6056709289551, 40.0976028442383, 38.7490005493164, 
44.2747650146484, 43.5645370483398, 41.6180191040039, 40.3799781799316, 
38.8664817810059, 44.9089202880859, 44.414306640625, 44.560977935791, 
43.1288986206055, 40.9315185546875, 38.8918418884277, 46.3063850402832, 
45.5805702209473, 44.9196586608887, 42.2495613098145, 39.3051452636719, 
38.7914810180664, 38.6069412231445, 44.6782455444336, 46.4024772644043, 
44.4720573425293, 41.7361183166504, 42.3378067016602, 41.0018348693848, 
39.3579216003418, 41.6303863525391, 43.8207550048828, 46.0460357666016
), agbh = c(3.32185006141663, 4.98925733566284, 4.35699367523193, 
4.94798421859741, 3.14325952529907, 2.93211793899536, 4.52736520767212, 
4.99723243713379, 5.13944292068481, 3.92965626716614, 3.43465113639832, 
3.55617475509644, 3.4659411907196, 5.24469566345215, 5.36995029449463, 
4.61549234390259, 4.82002925872803, 4.20452928543091, 4.71502685546875, 
5.20452785491943, 5.05676746368408, 5.9952244758606, 6.16778612136841, 
4.69053316116333, 2.62325501441956, 4.74775457382202, 4.93133020401001, 
5.02366256713867, 5.74016952514648, 6.28353786468506, 4.67424774169922, 
4.56812858581543, 1.88153350353241, 4.31531000137329)), class = "data.frame", row.names = c(NA, 
-34L))

My raster layer:

r = new("RasterBrick", file = new(".RasterFile", name = "", datanotation = "FLT4S", 
    byteorder = "little", nodatavalue = -Inf, NAchanged = FALSE, 
    nbands = 1L, bandorder = "BIL", offset = 0L, toptobottom = TRUE, 
    blockrows = 0L, blockcols = 0L, driver = "", open = FALSE), 
    data = new(".MultipleRasterData", values = structure(c(NA, 
    NA, NA, NA, 18.7969169616699, NA, NA, NA, NA, NA, 25.7222957611084, 
    23.4188251495361, 25.4322757720947, 16.4593601226807, 12.7868213653564, 
    NA, NA, NA, NA, 30.9337253570557, 29.865758895874, 30.4080600738525, 
    29.5479888916016, 24.3493347167969, NA, NA, NA, 35.2427635192871, 
    38.989933013916, 34.6536979675293, 29.4607238769531, 30.7469024658203, 
    34.3946380615234, NA, NA, NA, NA, 42.8660278320312, 34.7930717468262, 
    30.9516315460205, 32.20654296875, 39.999755859375, 46.6002235412598, 
    38.6480979919434, NA, NA, 60.5214920043945, 33.1799964904785, 
    31.8498134613037, 30.9209423065186, 32.2269744873047, 53.7062034606934, 
    45.5225944519043, 38.3570976257324, NA, 123.040382385254, 
    73.0528182983398, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    19.6407718658447, NA, NA, NA, NA, NA, 610.009216308594, 654.812622070312, 
    426.475830078125, 66.3839492797852, 10.6471328735352, NA, 
    NA, NA, NA, 443.848846435547, 602.677429199219, 488.478454589844, 
    387.470947265625, 58.2341117858887, NA, NA, NA, 413.888488769531, 
    315.057678222656, 354.082946777344, 602.827758789062, 463.518829345703, 
    296.713928222656, NA, NA, NA, NA, 923.920593261719, 434.436645507812, 
    799.562927246094, 404.709564208984, 265.043304443359, 366.697235107422, 
    399.851684570312, NA, NA, 952.2314453125, 870.356994628906, 
    673.406616210938, 493.521606445312, 273.841888427734, 371.428619384766, 
    383.057830810547, 320.986755371094, NA, 991.131225585938, 
    1148.87768554688, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    39.7242431640625, NA, NA, NA, NA, NA, 44.9583969116211, 41.4048385620117, 
    42.6056709289551, 40.0976028442383, 38.7490005493164, NA, 
    NA, NA, NA, 44.2747650146484, 43.5645370483398, 41.6180191040039, 
    40.3799781799316, 38.8664817810059, NA, NA, NA, 44.9089202880859, 
    44.414306640625, 44.560977935791, 43.1288986206055, 40.9315185546875, 
    38.8918418884277, NA, NA, NA, NA, 46.3063850402832, 45.5805702209473, 
    44.9196586608887, 42.2495613098145, 39.3051452636719, 38.7914810180664, 
    38.6069412231445, NA, NA, 44.6782455444336, 46.4024772644043, 
    44.4720573425293, 41.7361183166504, 42.3378067016602, 41.0018348693848, 
    39.3579216003418, 41.6303863525391, NA, 43.8207550048828, 
    46.0460357666016, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    3.32185006141663, NA, NA, NA, NA, NA, 4.98925733566284, 4.35699367523193, 
    4.94798421859741, 3.14325952529907, 2.93211793899536, NA, 
    NA, NA, NA, 4.52736520767212, 4.99723243713379, 5.13944292068481, 
    3.92965626716614, 3.43465113639832, NA, NA, NA, 3.55617475509644, 
    3.4659411907196, 5.24469566345215, 5.36995029449463, 4.61549234390259, 
    4.82002925872803, NA, NA, NA, NA, 4.20452928543091, 4.71502685546875, 
    5.20452785491943, 5.05676746368408, 5.9952244758606, 6.16778612136841, 
    4.69053316116333, NA, NA, 2.62325501441956, 4.74775457382202, 
    4.93133020401001, 5.02366256713867, 5.74016952514648, 6.28353786468506, 
    4.67424774169922, 4.56812858581543, NA, 1.88153350353241, 
    4.31531000137329, NA, NA, NA, NA, NA, NA), .Dim = c(63L, 
    4L)), offset = 0, gain = 1, inmemory = TRUE, fromdisk = FALSE, 
        nlayers = 4L, dropped = NULL, isfactor = c(FALSE, FALSE, 
        FALSE, FALSE), attributes = list(), haveminmax = TRUE, 
        min = c(12.7868213653564, 10.6471328735352, 38.6069412231445, 
        1.88153350353241), max = c(123.040382385254, 1148.87768554688, 
        46.4024772644043, 6.28353786468506), unit = "", names = c("ntl", 
        "pop", "tirs", "agbh")), legend = new(".RasterLegend", 
        type = character(0), values = logical(0), color = logical(0), 
        names = logical(0), colortable = logical(0)), title = character(0), 
    extent = new("Extent", xmin = 11878500, xmax = 11883000, 
        ymin = 1799000, ymax = 1802500), rotated = FALSE, rotation = new(".Rotation", 
        geotrans = numeric(0), transfun = function () 
        NULL), ncols = 9L, nrows = 7L, crs = new("CRS", projargs = NA_character_), 
    srs = character(0), history = list(), z = list())
1

There are 1 best solutions below

0
On

I found this solution:

d <- tibble(pred = predict(f, newdata = xs)
, obs = mtcars$mpg) %>%
mutate(resid = pred - obs,
resid_sq = resid^2)

and then I cbind the d with the coordinates from my raster data and I create the residuals raster.