In R
, using the caret
and xgboost
packages and this tutorial, I am running an XGBoost regression (XGBR) and I want to extract the residuals of the XGBR. I hyper-tuned the model using the caret
package and then, using the 'best' model parameters I used the xgboost
package to perform the regression.
My dataset has the ntl, pop, tirs, agbh variables stored in data.frame
(ntl is the dependent variable while the other three are the independent). Assuming that my XGBR model is called m and my data.frame
is called block.data, I did:
library(caret)
library(terra)
library(xgboost)
library(doParallel)
library(dplyr)
library(ggplot2)
library(glue)
library(ModelMetrics)
library(readr)
wd = "path/"
block.data = read.csv(paste0(wd, "block.data.csv"))
block.data = subset(block.data, select = c(ntl, tirs, pop, agbh))
set.seed(1123)
samp <- sample(nrow(block.data), 0.80 * nrow(block.data))
train <- block.data[samp, ]
train_x <- as.matrix(select(train, -ntl))
train_y <- train$ntl
test <- block.data[-samp, ]
test_x <- select(test, -ntl)
test_y <- test$ntl
no_cores <- detectCores() - 1
cl = makePSOCKcluster(no_cores)
registerDoParallel(cl)
# default model
grid_default <- expand.grid(
nrounds = 100,
max_depth = 6,
eta = 0.3,
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1
)
train_control <- caret::trainControl(
method = "none",
verboseIter = FALSE, # no training log
allowParallel = TRUE # FALSE for reproducible results
)
xgb_base <- caret::train(
x = train_x,
y = train_y,
trControl = train_control,
tuneGrid = grid_default,
method = "xgbTree",
verbose = TRUE
)
# hyperparameter tuning
# setting up the maximum number of trees
nrounds <- 1000
# note to start nrounds from 200, as smaller learning rates result in errors so
# big with lower starting points that they'll mess the scales
tune_grid <- expand.grid(
nrounds = seq(from = 200, to = nrounds, by = 50),
eta = c(0.025, 0.05, 0.1, 0.3),
max_depth = c(2, 3, 4, 5, 6),
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1
)
tune_control <- caret::trainControl(
method = "cv", # cross-validation
number = 3, # with n folds
#index = createFolds(tr_treated$Id_clean), # fix the folds
verboseIter = FALSE, # no training log
allowParallel = TRUE # FALSE for reproducible results
)
xgb_tune <- caret::train(
x = train_x,
y = train_y,
trControl = tune_control,
tuneGrid = tune_grid,
method = "xgbTree",
verbose = TRUE
)
tuneplot <- function(x, probs = .90) {
ggplot(x) +
coord_cartesian(ylim = c(quantile(x$results$RMSE, probs = probs), min(x$results$RMSE))) +
theme_bw()
}
tuneplot(xgb_tune)
xgb_tune$bestTune
# find maximum depth
tune_grid2 <- expand.grid(
nrounds = seq(from = 50, to = nrounds, by = 50),
eta = xgb_tune$bestTune$eta,
max_depth = ifelse(xgb_tune$bestTune$max_depth == 2,
c(xgb_tune$bestTune$max_depth:4),
xgb_tune$bestTune$max_depth - 1:xgb_tune$bestTune$max_depth + 1),
gamma = 0,
colsample_bytree = 1,
min_child_weight = c(1, 2, 3),
subsample = 1
)
xgb_tune2 <- caret::train(
x = train_x,
y = train_y,
trControl = tune_control,
tuneGrid = tune_grid2,
method = "xgbTree",
verbose = TRUE
)
tuneplot(xgb_tune2)
xgb_tune2$bestTune
# different values for row and column sampling
tune_grid3 <- expand.grid(
nrounds = seq(from = 50, to = nrounds, by = 50),
eta = xgb_tune$bestTune$eta,
max_depth = xgb_tune2$bestTune$max_depth,
gamma = 0,
colsample_bytree = c(0.4, 0.6, 0.8, 1.0),
min_child_weight = xgb_tune2$bestTune$min_child_weight,
subsample = c(0.5, 0.75, 1.0)
)
xgb_tune3 <- caret::train(
x = train_x,
y = train_y,
trControl = tune_control,
tuneGrid = tune_grid3,
method = "xgbTree",
verbose = TRUE
)
tuneplot(xgb_tune3, probs = .95)
xgb_tune3$bestTune
set.seed(57)
omp_set_num_threads(2) # caret parallel processing threads
# gamma
tune_grid4 <- expand.grid(
nrounds = seq(from = 50, to = nrounds, by = 50),
eta = xgb_tune$bestTune$eta,
max_depth = xgb_tune2$bestTune$max_depth,
gamma = c(0, 0.05, 0.1, 0.5, 0.7, 0.9, 1.0),
colsample_bytree = xgb_tune3$bestTune$colsample_bytree,
min_child_weight = xgb_tune2$bestTune$min_child_weight,
subsample = xgb_tune3$bestTune$subsample
)
xgb_tune4 <- caret::train(
x = train_x,
y = train_y,
trControl = tune_control,
tuneGrid = tune_grid4,
method = "xgbTree",
verbose = TRUE
)
tuneplot(xgb_tune4)
xgb_tune4$bestTune
# Reducing the Learning Rate
tune_grid5 <- expand.grid(
nrounds = seq(from = 100, to = 10000, by = 100),
eta = c(0.01, 0.015, 0.025, 0.05, 0.1),
max_depth = xgb_tune2$bestTune$max_depth,
gamma = xgb_tune4$bestTune$gamma,
colsample_bytree = xgb_tune3$bestTune$colsample_bytree,
min_child_weight = xgb_tune2$bestTune$min_child_weight,
subsample = xgb_tune3$bestTune$subsample
)
xgb_tune5 <- caret::train(
x = train_x,
y = train_y,
trControl = tune_control,
tuneGrid = tune_grid5,
method = "xgbTree",
verbose = TRUE
)
tuneplot(xgb_tune5)
xgb_tune5$bestTune
# Fitting the Model
(final_grid <- expand.grid(
nrounds = xgb_tune5$bestTune$nrounds,
eta = xgb_tune5$bestTune$eta,
max_depth = xgb_tune5$bestTune$max_depth,
gamma = xgb_tune5$bestTune$gamma,
colsample_bytree = xgb_tune5$bestTune$colsample_bytree,
min_child_weight = xgb_tune5$bestTune$min_child_weight,
subsample = xgb_tune5$bestTune$subsample
))
(xgb_model <- caret::train(
x = train_x,
y = train_y,
trControl = train_control,
tuneGrid = final_grid,
method = "xgbTree",
verbose = TRUE
))
stopCluster(cl)
# apply model to the whole data set using xgboost
xgb_m <- xgb.DMatrix(data = data.matrix(block.data), label = block.data$ntl)
m = xgb.train(data = xgb_m,
max.depth = xgb_tune5$bestTune$max_depth,
# watchlist = watchlist,
nrounds = xgb_tune5$bestTune$nrounds,
min_child_weight = xgb_tune5$bestTune$min_child_weight,
subsample = xgb_tune5$bestTune$subsample,
eta = xgb_tune5$bestTune$eta,
gamma = xgb_tune5$bestTune$gamma,
colsample_bytree = xgb_tune5$bestTune$colsample_bytree,
objective = "reg:squarederror")
m
# export xgb residuals
xgb_resids = predict(m, xgb_m, na.rm = TRUE)
sb = c(ntl, pop_res, tirs_res, agbh_res)
xgb_resids = sb$ntl - xgb_resids
plot(xgb_resids)
The plot looks like this:
Obviously, I am doing something very wrong. How can I export the residuals of an XGBR as a single raster?
Here is a very small sample of my dataset:
block.data = structure(list(x = c(11880750L, 11879250L, 11879750L, 11880250L,
11880750L, 11881250L, 11879250L, 11879750L, 11880250L, 11880750L,
11881250L, 11878750L, 11879250L, 11879750L, 11880250L, 11880750L,
11881250L, 11879250L, 11879750L, 11880250L, 11880750L, 11881250L,
11881750L, 11882250L, 11879250L, 11879750L, 11880250L, 11880750L,
11881250L, 11881750L, 11882250L, 11882750L, 11879250L, 11879750L
), y = c(1802250L, 1801750L, 1801750L, 1801750L, 1801750L, 1801750L,
1801250L, 1801250L, 1801250L, 1801250L, 1801250L, 1800750L, 1800750L,
1800750L, 1800750L, 1800750L, 1800750L, 1800250L, 1800250L, 1800250L,
1800250L, 1800250L, 1800250L, 1800250L, 1799750L, 1799750L, 1799750L,
1799750L, 1799750L, 1799750L, 1799750L, 1799750L, 1799250L, 1799250L
), ntl = c(18.7969169616699, 25.7222957611084, 23.4188251495361,
25.4322757720947, 16.4593601226807, 12.7868213653564, 30.9337253570557,
29.865758895874, 30.4080600738525, 29.5479888916016, 24.3493347167969,
35.2427635192871, 38.989933013916, 34.6536979675293, 29.4607238769531,
30.7469024658203, 34.3946380615234, 42.8660278320312, 34.7930717468262,
30.9516315460205, 32.20654296875, 39.999755859375, 46.6002235412598,
38.6480979919434, 60.5214920043945, 33.1799964904785, 31.8498134613037,
30.9209423065186, 32.2269744873047, 53.7062034606934, 45.5225944519043,
38.3570976257324, 123.040382385254, 73.0528182983398), pop = c(19.6407718658447,
610.009216308594, 654.812622070312, 426.475830078125, 66.3839492797852,
10.6471328735352, 443.848846435547, 602.677429199219, 488.478454589844,
387.470947265625, 58.2341117858887, 413.888488769531, 315.057678222656,
354.082946777344, 602.827758789062, 463.518829345703, 296.713928222656,
923.920593261719, 434.436645507812, 799.562927246094, 404.709564208984,
265.043304443359, 366.697235107422, 399.851684570312, 952.2314453125,
870.356994628906, 673.406616210938, 493.521606445312, 273.841888427734,
371.428619384766, 383.057830810547, 320.986755371094, 991.131225585938,
1148.87768554688), tirs = c(39.7242431640625, 44.9583969116211,
41.4048385620117, 42.6056709289551, 40.0976028442383, 38.7490005493164,
44.2747650146484, 43.5645370483398, 41.6180191040039, 40.3799781799316,
38.8664817810059, 44.9089202880859, 44.414306640625, 44.560977935791,
43.1288986206055, 40.9315185546875, 38.8918418884277, 46.3063850402832,
45.5805702209473, 44.9196586608887, 42.2495613098145, 39.3051452636719,
38.7914810180664, 38.6069412231445, 44.6782455444336, 46.4024772644043,
44.4720573425293, 41.7361183166504, 42.3378067016602, 41.0018348693848,
39.3579216003418, 41.6303863525391, 43.8207550048828, 46.0460357666016
), agbh = c(3.32185006141663, 4.98925733566284, 4.35699367523193,
4.94798421859741, 3.14325952529907, 2.93211793899536, 4.52736520767212,
4.99723243713379, 5.13944292068481, 3.92965626716614, 3.43465113639832,
3.55617475509644, 3.4659411907196, 5.24469566345215, 5.36995029449463,
4.61549234390259, 4.82002925872803, 4.20452928543091, 4.71502685546875,
5.20452785491943, 5.05676746368408, 5.9952244758606, 6.16778612136841,
4.69053316116333, 2.62325501441956, 4.74775457382202, 4.93133020401001,
5.02366256713867, 5.74016952514648, 6.28353786468506, 4.67424774169922,
4.56812858581543, 1.88153350353241, 4.31531000137329)), class = "data.frame", row.names = c(NA,
-34L))
My raster layer:
r = new("RasterBrick", file = new(".RasterFile", name = "", datanotation = "FLT4S",
byteorder = "little", nodatavalue = -Inf, NAchanged = FALSE,
nbands = 1L, bandorder = "BIL", offset = 0L, toptobottom = TRUE,
blockrows = 0L, blockcols = 0L, driver = "", open = FALSE),
data = new(".MultipleRasterData", values = structure(c(NA,
NA, NA, NA, 18.7969169616699, NA, NA, NA, NA, NA, 25.7222957611084,
23.4188251495361, 25.4322757720947, 16.4593601226807, 12.7868213653564,
NA, NA, NA, NA, 30.9337253570557, 29.865758895874, 30.4080600738525,
29.5479888916016, 24.3493347167969, NA, NA, NA, 35.2427635192871,
38.989933013916, 34.6536979675293, 29.4607238769531, 30.7469024658203,
34.3946380615234, NA, NA, NA, NA, 42.8660278320312, 34.7930717468262,
30.9516315460205, 32.20654296875, 39.999755859375, 46.6002235412598,
38.6480979919434, NA, NA, 60.5214920043945, 33.1799964904785,
31.8498134613037, 30.9209423065186, 32.2269744873047, 53.7062034606934,
45.5225944519043, 38.3570976257324, NA, 123.040382385254,
73.0528182983398, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
19.6407718658447, NA, NA, NA, NA, NA, 610.009216308594, 654.812622070312,
426.475830078125, 66.3839492797852, 10.6471328735352, NA,
NA, NA, NA, 443.848846435547, 602.677429199219, 488.478454589844,
387.470947265625, 58.2341117858887, NA, NA, NA, 413.888488769531,
315.057678222656, 354.082946777344, 602.827758789062, 463.518829345703,
296.713928222656, NA, NA, NA, NA, 923.920593261719, 434.436645507812,
799.562927246094, 404.709564208984, 265.043304443359, 366.697235107422,
399.851684570312, NA, NA, 952.2314453125, 870.356994628906,
673.406616210938, 493.521606445312, 273.841888427734, 371.428619384766,
383.057830810547, 320.986755371094, NA, 991.131225585938,
1148.87768554688, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
39.7242431640625, NA, NA, NA, NA, NA, 44.9583969116211, 41.4048385620117,
42.6056709289551, 40.0976028442383, 38.7490005493164, NA,
NA, NA, NA, 44.2747650146484, 43.5645370483398, 41.6180191040039,
40.3799781799316, 38.8664817810059, NA, NA, NA, 44.9089202880859,
44.414306640625, 44.560977935791, 43.1288986206055, 40.9315185546875,
38.8918418884277, NA, NA, NA, NA, 46.3063850402832, 45.5805702209473,
44.9196586608887, 42.2495613098145, 39.3051452636719, 38.7914810180664,
38.6069412231445, NA, NA, 44.6782455444336, 46.4024772644043,
44.4720573425293, 41.7361183166504, 42.3378067016602, 41.0018348693848,
39.3579216003418, 41.6303863525391, NA, 43.8207550048828,
46.0460357666016, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
3.32185006141663, NA, NA, NA, NA, NA, 4.98925733566284, 4.35699367523193,
4.94798421859741, 3.14325952529907, 2.93211793899536, NA,
NA, NA, NA, 4.52736520767212, 4.99723243713379, 5.13944292068481,
3.92965626716614, 3.43465113639832, NA, NA, NA, 3.55617475509644,
3.4659411907196, 5.24469566345215, 5.36995029449463, 4.61549234390259,
4.82002925872803, NA, NA, NA, NA, 4.20452928543091, 4.71502685546875,
5.20452785491943, 5.05676746368408, 5.9952244758606, 6.16778612136841,
4.69053316116333, NA, NA, 2.62325501441956, 4.74775457382202,
4.93133020401001, 5.02366256713867, 5.74016952514648, 6.28353786468506,
4.67424774169922, 4.56812858581543, NA, 1.88153350353241,
4.31531000137329, NA, NA, NA, NA, NA, NA), .Dim = c(63L,
4L)), offset = 0, gain = 1, inmemory = TRUE, fromdisk = FALSE,
nlayers = 4L, dropped = NULL, isfactor = c(FALSE, FALSE,
FALSE, FALSE), attributes = list(), haveminmax = TRUE,
min = c(12.7868213653564, 10.6471328735352, 38.6069412231445,
1.88153350353241), max = c(123.040382385254, 1148.87768554688,
46.4024772644043, 6.28353786468506), unit = "", names = c("ntl",
"pop", "tirs", "agbh")), legend = new(".RasterLegend",
type = character(0), values = logical(0), color = logical(0),
names = logical(0), colortable = logical(0)), title = character(0),
extent = new("Extent", xmin = 11878500, xmax = 11883000,
ymin = 1799000, ymax = 1802500), rotated = FALSE, rotation = new(".Rotation",
geotrans = numeric(0), transfun = function ()
NULL), ncols = 9L, nrows = 7L, crs = new("CRS", projargs = NA_character_),
srs = character(0), history = list(), z = list())
I found this solution:
and then I
cbind
the d with the coordinates from my raster data and I create the residuals raster.