XGBoost predict on all data

1.1k Views Asked by At

I have a dataset of predictors with 35,446 rows and 38 columns, including the target.

First I balance the classes and prepare the data:

# Build a class-balanced dataset: keep every target == 1 row plus an
# equally sized random sample of target == 0 rows, shuffle the result,
# and split the target column off into `goal`.
data_for_predict <- res
data_good <- data_for_predict[which(data_for_predict$target == 1), ]
data_bad <- data_for_predict[which(data_for_predict$target == 0), ]
set.seed(789)
# Sample exactly as many "bad" rows as there are "good" rows.
size_bad <- nrow(data_good)
data_ind <- sample(seq_len(nrow(data_bad)), size = size_bad)
data_bad <- data_bad[data_ind, ]
data_for_predict <- rbind(data_good, data_bad)
# Shuffle rows so the two classes are interleaved.
data_for_predict <- data_for_predict[sample(seq_len(nrow(data_for_predict))), ]
goal <- as.data.frame(data_for_predict$target)
# Drop the target from the predictor table.
data_for_predict <- data_for_predict[, names(data_for_predict) != "target", drop = FALSE]

After that, I want to reduce the dimensionality of the data with PCA.

 PCA <- prcomp(data_for_predict, scale. = TRUE)
 PCA <- as.data.frame(PCA$x)
 data_for_predict <- cbind(data_for_predict, PCA)
 data_for_predict <- as.data.frame(data_for_predict)
 data_for_predict$target <- target$`data_for_predict$target`

Then I split the data into training and test samples:

# Reproducible 80/20 train/test split, then reset row names and make
# the column names syntactically valid for model.matrix().
set.seed(123)
smp_size <- floor(0.8 * nrow(data_for_predict))
train_ind <- sample(seq_len(nrow(data_for_predict)), size = smp_size)

train <- data_for_predict[train_ind, ]
test <- data_for_predict[-train_ind, ]
rownames(train) <- seq_len(nrow(train))
rownames(test) <- seq_len(nrow(test))
names(train) <- make.names(names(train))
names(test) <- make.names(names(test))

Now I prepare the data for training

# Convert to data.table and build the xgboost DMatrix objects.
# model.matrix(~ . + 0, ...) one-hot encodes any factor columns and
# drops the intercept; the target column is excluded from the features.
setDT(train)
setDT(test)
labels <- train$target
ts_label <- test$target
# Use TRUE/FALSE rather than the reassignable shorthand T/F.
new_tr <- model.matrix(~ . + 0, data = train[, -c("target"), with = FALSE])
new_ts <- model.matrix(~ . + 0, data = test[, -c("target"), with = FALSE])
dtrain <- xgb.DMatrix(data = new_tr, label = labels)
dtest <- xgb.DMatrix(data = new_ts, label = ts_label)

And fit:

# Tree-booster hyperparameters for binary classification.
# NOTE(review): max_depth = 10 with eta = 0.3 is quite aggressive and
# can overfit — confirm against cross-validation results.
params <- list(
  booster = "gbtree",
  objective = "binary:logistic",
  eta = 0.3,
  gamma = 0,
  max_depth = 10,
  min_child_weight = 1,
  subsample = 1,
  colsample_bytree = 1
)

# Cross-validate to choose the number of boosting rounds. The correct
# argument name is `early_stopping_rounds` (plural); the original
# `early_stopping_round` only worked via R's partial argument matching.
xgbcv <- xgb.cv(params = params, data = dtrain, nrounds = 1000, nfold = 5,
                showsd = TRUE, stratified = TRUE, print_every_n = 10,
                early_stopping_rounds = 20, maximize = FALSE,
                eval_metric = "error")

# Train the final model with the chosen round count, monitoring both sets.
xgb1 <- xgb.train(params = params, data = dtrain, nrounds = 46,
                  watchlist = list(val = dtest, train = dtrain),
                  print_every_n = 10, maximize = FALSE,
                  eval_metric = "error")

# predict() on an xgboost booster with objective "binary:logistic"
# already returns class probabilities, so the `type = "response"`
# argument in the original was silently ignored and is dropped here.
xgbpred <- predict(xgb1, dtest)
# NOTE(review): 0.77 is an unusual cutoff — confirm it was tuned on
# validation data rather than on this test set.
xgbpred <- ifelse(xgbpred > 0.77, 1, 0)

# caret::confusionMatrix() requires factors with matching levels;
# passing numeric vectors errors in current caret versions.
confusionMatrix(factor(xgbpred, levels = c(0, 1)),
                factor(ts_label, levels = c(0, 1)))

I get a good result:

Confusion Matrix and Statistics

          Reference
Prediction    0    1
         0 1569   90
         1  102 1583

               Accuracy : 0.9426             
                 95% CI : (0.9342, 0.9502)   
    No Information Rate : 0.5003             
    P-Value [Acc > NIR] : <0.0000000000000002

                  Kappa : 0.8852             
 Mcnemar's Test P-Value : 0.4273             

            Sensitivity : 0.9390             
            Specificity : 0.9462             
         Pos Pred Value : 0.9458             
         Neg Pred Value : 0.9395             
             Prevalence : 0.4997             
         Detection Rate : 0.4692             
   Detection Prevalence : 0.4961             
      Balanced Accuracy : 0.9426             

       'Positive' Class : 0    

But when I predict on the entire dataset (35,446 rows, 38 columns) with the fitted model, I get:

Confusion Matrix and Statistics

          Reference
Prediction     0     1
         0 22386  5328
         1  4701  3031

               Accuracy : 0.7171          
                 95% CI : (0.7123, 0.7217)
    No Information Rate : 0.7642          
    P-Value [Acc > NIR] : 1               

                  Kappa : 0.1941          
 Mcnemar's Test P-Value : 0.000000000408  

            Sensitivity : 0.8264          
            Specificity : 0.3626          
         Pos Pred Value : 0.8078          
         Neg Pred Value : 0.3920          
             Prevalence : 0.7642          
         Detection Rate : 0.6316          
   Detection Prevalence : 0.7819          
      Balanced Accuracy : 0.5945          

       'Positive' Class : 0 

Why does the accuracy drop so much when the model was built on (a balanced subset of) this same data?

0

There are 0 best solutions below