I have a biological data set with 200 samples and 500 features. After modeling with the mlr3 framework, I selected random forest (ranger) or XGBoost as the final model. How should I calculate the SHAP values for each sample, or for the test-set samples?
library(mlr3verse)   # loads mlr3, mlr3pipelines, mlr3learners, ...

# Preprocessing (integer -> numeric), SMOTE oversampling, then ranger, as one GraphLearner
learner_ranger = as_learner(
  po("colapply",
     id = "int_to_num",
     applicator = as.numeric,
     affect_columns = selector_type("integer")) %>>%
    po("smote", dup_size = 1) %>>%
    po("learner", lrn("classif.ranger", num.trees = 1000, predict_type = "prob"))
)

# reuse the hyperparameters found during tuning
learner_ranger$param_set$values =
  learner_smotesample_randomForest$tuning_result$learner_param_vals[[1]]

split = partition(task = task)
train_set = split$train
test_set = split$test

# train the whole pipeline on the training rows
fit_ranger = learner_ranger$train(task = task, row_ids = train_set)
library(kernelshap)
s <- kernelshap(fit_ranger, X = data[, -ncol(data)], bg_X = data)
# runs very slowly!!!!!!
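I guess I could shrink the background data and tell kernelshap how to get probabilities out of the GraphLearner, something like this sketch (untested; it assumes `data` is the data.frame behind `task` with the target in the last column, and reuses `fit_ranger`, `train_set`, `test_set` from above):

library(kernelshap)

X_test = data[test_set, -ncol(data)]                 # explain only the test rows
bg     = data[sample(train_set, 100), -ncol(data)]   # small background sample (~100 rows)

# custom prediction function for an mlr3 GraphLearner: return the probability matrix
pred_fun = function(model, newdata, ...) {
  model$predict_newdata(newdata)$prob
}

s <- kernelshap(fit_ranger, X = X_test, bg_X = bg, pred_fun = pred_fun)

Would that be the right way to speed it up, or is Kernel SHAP simply too expensive with 500 features?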
library(treeshap)
s1 <- treeshap(fit_ranger, data[, -ncol(data)])
# Error: no applicable method for 'treeshap' applied to an object of class
# "c('GraphLearner', 'Learner', 'R6')"
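From the error it looks like treeshap wants the underlying ranger model rather than the GraphLearner, so maybe something along these lines (a rough sketch; the pipeop id "classif.ranger", the way I pull the model out of the GraphLearner, and whether ranger.unify handles probability forests are all my assumptions):

library(treeshap)

rf    = fit_ranger$model$classif.ranger$model                    # underlying ranger object
X_num = as.data.frame(lapply(data[, -ncol(data)], as.numeric))   # features as numeric, like the pipeline does

unified = ranger.unify(rf, X_num)           # convert to treeshap's unified representation
ts      = treeshap(unified, X_num[test_set, ])   # SHAP values per test sample
head(ts$shaps)

Is that the intended way to use treeshap with an mlr3 pipeline?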
But the same kind of analysis runs much faster for me in Python:
import pandas as pd
import shap  # package used to calculate SHAP values
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

data = pd.read_excel(r'D:\project\SHAP_test\data_final.xlsx', index_col=0)
X, y = data.iloc[:, 0:502].values, data.iloc[:, 502].values
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

my_model = RandomForestClassifier(random_state=0).fit(train_X, train_y)

explainer = shap.TreeExplainer(my_model)
shap_values = explainer.shap_values(X)   # one array of SHAP values per class
shap.summary_plot(shap_values[0], X)     # summary plot for the first class
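Ideally I would like the same kind of summary plot in R. I think shapviz can take a kernelshap (or treeshap) result directly, so perhaps something like this sketch (assuming `s` from the kernelshap call above; for a probability classifier it may hold one SHAP matrix per class):

library(shapviz)

sv = shapviz(s)                       # convert the kernelshap result
sv_importance(sv, kind = "beeswarm")  # beeswarm summary, one dot per sample and feature

So, what is the recommended way to get per-sample SHAP values for an mlr3 GraphLearner (ranger or XGBoost) without it being prohibitively slow?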