In the new version 2.0 of XGBoost we have quantile regression.
I would like to implement quantile regression on the older XGBoost 1.x release using a custom objective function for alpha_list = [0.05, 0.5, 0.95], where 0.05 is the lower bound, 0.5 is the median and 0.95 is the upper bound. Unfortunately, I can't seem to get a correct implementation of this quantile function. I assume quantile regression with XGBoost 1.x is a common problem. How would you implement these quantiles correctly?
Below is my best working example, which uses a different model (scikit-learn's GradientBoostingRegressor). What I want instead is to pass a custom function as the objective to xgboost==1.6.3.
pip install xgboost==1.6.3
Example
import logging
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
def uncertainty_model(
    X_train=None,
    categorical_features=None,  # List of names of the categorical features
    numeric_features=None,  # List of names of the numeric features
    y_train=None,  # The target values
    alpha_param=0.5,  # The quantile to predict. 0.5 for median, <0.5 for lower quantiles, >0.5 for upper quantiles
):
    """Fit a preprocessing + gradient-boosting pipeline for a single quantile.

    The pipeline imputes and one-hot encodes the categorical columns, scales
    and imputes the numeric columns, and then fits a
    ``GradientBoostingRegressor`` with ``loss="quantile"``.

    Args:
        X_train: DataFrame holding at least the listed feature columns. It is
            copied before the dtype casts, so the caller's frame is untouched.
        categorical_features: Names of the categorical columns; ``None`` is
            treated as "no categorical columns".
        numeric_features: Names of the numeric columns; ``None`` is treated
            as "no numeric columns".
        y_train: Target values aligned with ``X_train``.
        alpha_param: Quantile to predict, strictly between 0 and 1.

    Returns:
        The fitted scikit-learn pipeline (preprocessor followed by the model).

    Raises:
        ValueError: If ``alpha_param`` is not strictly between 0 and 1.
    """
    # Fail fast, before copying the frame or building the pipeline; the
    # quantile loss is only defined for quantiles inside the open interval.
    if not 0.0 < alpha_param < 1.0:
        raise ValueError(
            f"alpha_param must be strictly between 0 and 1, got {alpha_param!r}"
        )

    # Treat a missing list as "no columns of this kind" instead of failing on
    # `None + None` below.
    categorical_features = (
        [] if categorical_features is None else list(categorical_features)
    )
    numeric_features = [] if numeric_features is None else list(numeric_features)

    # Log the beginning of the model fitting process, including the alpha parameter
    logging.info("Fitting quantile model alpha=%s", alpha_param)
    start_time = time.time()

    # Combine categorical and numeric features into a single list
    features = categorical_features + numeric_features

    X_train = X_train.copy()
    # Skip the cast entirely when a group is empty so no empty-selection
    # assignment is attempted on the frame.
    if categorical_features:
        X_train[categorical_features] = X_train[categorical_features].astype("category")
    if numeric_features:
        X_train[numeric_features] = X_train[numeric_features].astype("float64")

    # Categorical pipeline: impute BEFORE encoding so that a missing value is
    # replaced by the most frequent category rather than being encoded, and
    # ignore categories unseen during fit so predict() does not fail on them.
    categorical_transformer = make_pipeline(
        SimpleImputer(strategy="most_frequent"),
        OneHotEncoder(handle_unknown="ignore"),
    )

    # Numeric pipeline: order kept as in the original. StandardScaler is
    # documented to disregard NaNs when fitting and to pass them through in
    # transform, so imputing the median afterwards still works.
    numeric_transformer = make_pipeline(
        StandardScaler(), SimpleImputer(strategy="median")
    )

    # Combine the transformers for numeric and categorical features using ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )

    # GradientBoostingRegressor configured for quantile regression with the
    # requested quantile. This is the model the author wants to swap for
    # xgboost==1.6.3's xgboost.XGBRegressor with a custom objective.
    model = GradientBoostingRegressor(
        loss="quantile",  # Use quantile loss for quantile regression
        max_depth=5,
        alpha=alpha_param,  # The quantile to predict
    )

    pipeline = make_pipeline(preprocessor, model)
    pipeline.fit(X_train[features], y_train)

    end_time = time.time()
    logging.info("Time taken: %.6f seconds", end_time - start_time)

    # Return the fitted pipeline
    return pipeline
# The root logger defaults to WARNING, which would silently drop the INFO
# timing messages logged by uncertainty_model(); opt in to INFO here.
logging.basicConfig(level=logging.INFO)

# Generate synthetic data
X, y = make_regression(n_samples=1000, n_features=20, noise=0.1, random_state=42)
X_df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
y_series = pd.Series(y, name='target')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_df, y_series, test_size=0.2, random_state=42
)

# Assuming all features are numeric for this example
numeric_features = X_df.columns.tolist()
categorical_features = []

# List of alpha values to test
alpha_list = [0.05, 0.5, 0.95]
predictions = {}  # Maps each alpha to its test-set predictions

# Iterate over the list of alpha values and fit a model for each
for alpha in alpha_list:
    print(f"Testing alpha={alpha}")
    pipeline = uncertainty_model(
        X_train=X_train,
        categorical_features=categorical_features,
        numeric_features=numeric_features,
        y_train=y_train,
        alpha_param=alpha,
    )

    # Make predictions on the test set
    y_pred = pipeline.predict(X_test)

    # Calculate and print the Mean Absolute Error (MAE) for evaluation.
    # MAE measures the distance to the observed target, so it is expected to
    # be larger for the 0.05 and 0.95 models than for the median model.
    mae = mean_absolute_error(y_test, y_pred)
    print(f"Alpha: {alpha}, MAE: {mae:.4f}\n")

    predictions[alpha] = y_pred

# Plotting the predictions: one line per quantile over the test-set row index
plt.figure(figsize=(10, 6))
x_axis = np.arange(len(X_test))
for alpha, pred in predictions.items():
    plt.plot(x_axis, pred, label=f'Alpha = {alpha}')
plt.title('Quantile Regression Predictions ')
plt.xlabel('Index')
plt.ylabel('Predicted Value')
plt.legend()
plt.show()