In this notebook we will explore the explanability of the binary classifier xgboost classifier.
import time,os,json
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot') # random state
SEED=100
time_start_notebook = time.time()
home = os.path.expanduser('~')
[(x.__name__,x.__version__) for x in [np,pd,sns]]
%%capture
# capture will not print in notebook
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
## model evaluation
!pip install -U watermark
!pip install -U xgboost
!pip install -U eli5
!pip install -U shap
!pip install -U pdpbox
!pip install -U yellowbrick
!pip install -U lime
#### print
print('Environment: Google Colaboratory.')
# NOTE: If we update modules in gcolab, we need to restart runtime.
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
%load_ext watermark
%watermark -a "Bhishan Poudel" -d
%watermark -v -m -p numpy,scipy,pandas,seaborn,sklearn,xgboost,eli5,shap,pdpbox,yellowbrick -g
def get_data():
df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/Prudential_Insurance/raw/train.csv.zip?raw=true',compression='zip')
df = df.copy()
columns_to_drop = ['Id', 'Medical_History_10','Medical_History_24']
df = df.drop(columns_to_drop,axis=1)
df['Product_Info_2_char'] = df.Product_Info_2.str[0]
df['Product_Info_2_num'] = df.Product_Info_2.str[1]
# factorize categorical variables
df['Product_Info_2'] = pd.factorize(df['Product_Info_2'])[0]
df['Product_Info_2_char'] = pd.factorize(df['Product_Info_2_char'])[0]
df['Product_Info_2_num'] = pd.factorize(df['Product_Info_2_num'])[0]
df['BMI_Age'] = df['BMI'] * df['Ins_Age']
med_keyword_columns = df.columns[df.columns.str.startswith('Medical_Keyword_')]
df['Med_Keywords_Count'] = df[med_keyword_columns].sum(axis=1)
df = df.fillna(-1)
return df
df = get_data()
print(df.shape)
df.isna().sum().sum(), df.sum().sum()
from sklearn.model_selection import train_test_split
target = 'Response'
df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = train_test_split(
df.drop(target,axis=1), df[target],
test_size=0.2, random_state=SEED, stratify=df[target])
ytrain = ser_ytrain.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()
features_train = df_Xtrain.columns.to_list()
features_train
import xgboost as xgb
from xgboost import XGBClassifier
xgb.__version__
clf_xgb = XGBClassifier(objective= 'multi:softprob', random_state=SEED,n_jobs=-1)
clf_xgb
%%time
clf_xgb.fit(df_Xtrain, ser_ytrain)
ypreds = clf_xgb.predict(df_Xtest)
df_ypreds = pd.DataFrame({'ytest': ytest, 'ypreds': ypreds})
df_ypreds['is_Response8'] = df_ypreds['ytest'].eq(8).astype(int)
df_ypreds.head(10)
# xgb.plot_importance?
# importance_type = 'weight', 'gain', 'cover'
# feature importance
fig,ax = plt.subplots(figsize=(12,8))
xgb.plot_importance(clf_xgb,ax=ax,max_num_features=10,importance_type='gain')
plt.show()
df_imp = pd.DataFrame({'feature': features_train,
'importance': clf_xgb.feature_importances_})
df_imp.sort_values('importance', ascending=False)\
.head(10).style.background_gradient(subset=['importance'])
df_imp.sort_values('importance', ascending=False)\
.head(10).style.bar(subset=['importance'],align='mid',color='pink')
from yellowbrick.model_selection import FeatureImportances
# FeatureImportances?
# plt.colormaps()
plt.figure(figsize=(8,24))
viz = FeatureImportances(clf_xgb,colormap='hot_r')
viz.fit(df_Xtrain, ytrain)
viz.show()
import eli5
from eli5.sklearn import PermutationImportance
perm = PermutationImportance(clf_xgb).fit(df_Xtest, ser_ytest)
eli5.show_weights(perm, feature_names = df_Xtrain.columns.tolist(), top=50)
eli5.explain_weights_df(perm, feature_names=features_train)\
.head(10).style.background_gradient(subset=['weight'])
eli5.show_prediction(clf_xgb, df_Xtest.iloc[0,:],show_feature_values=True)
pdp.pdp_isolate(model, dataset, model_features,
feature, num_grid_points=10,
grid_type='percentile', percentile_range=None,
grid_range=None, cust_grid_points=None,
memory_limit=0.5, n_jobs=1, predict_kwds=None,
data_transformer=None)
make sure n_jobs=1 when you are using XGBoost model.
import pdpbox
from pdpbox import pdp
from pdpbox import info_plots
pdpbox.__version__
feature = 'BMI'
pdp.pdp_isolate(model, dataset, model_features,
feature, num_grid_points=10,
grid_type='percentile', percentile_range=None,
grid_range=None, cust_grid_points=None,
memory_limit=0.5, n_jobs=1, predict_kwds=None,
data_transformer=None)
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999
# Create the data that we will plot
pdp_goals = pdp.pdp_isolate(model=clf_xgb,
dataset=df_Xtest,
model_features=df_Xtest.columns.tolist(),
feature=feature,
n_jobs=1) # make sure n_jobs=1 when you are using XGBoost model.
# plot it
pdp.pdp_plot(pdp_goals, feature)
plt.show()
"""
Assume Respons8 means policy accepted.
Look at class 7 (Response8)
The bmi line is below the red dotted line ==> higher bmi lower chance of policy being granted
0 to 0.4 ==> bmi increse has not much effect
0.4 to 0.6 ==> higher bmi higher rejection
0.6+ ==> the value saturates.
""";
np.unique(df[target])
"""
Our target names are 1-8, but default pdp-box class names are 0-7.
""";
df_target_encoded = pd.get_dummies(df,columns=[target],drop_first=False)
df_target_encoded.iloc[:2,-10:]
features_train = df.columns.drop(target)
features_train
target_cols = [f'Response_{i}' for i in range(1,9)]
target_cols
fig, axes, summary_df = info_plots.target_plot(
df=df_target_encoded,
feature=feature,
feature_name=feature,
target=['Response_8']
)
# we can see when bmi increases, then average response8 decreases.
# high bmi ==> low response8 ==> low acceptance.
summary_df
info_plots.actual_plot(model, X, feature,
feature_name, num_grid_points=10,
grid_type='percentile', percentile_range=None,
grid_range=None, cust_grid_points=None,
show_percentile=False, show_outliers=False,
endpoint=True, which_classes=None,
predict_kwds=None, ncols=2, figsize=None,
plot_params=None)
Parameters
----------
model: a fitted sklearn model
X: pandas DataFrame
data set on which the model is trained
which_classes: list, optional, default=None
which classes to plot, only use when it is a multi-class problem
fig, axes, summary_df = info_plots.actual_plot(
model=clf_xgb,
X=df_Xtrain,
feature=feature,
feature_name=feature,
which_classes=[0],
predict_kwds={},
)
%%time
pdp_bmi_xgboost = pdp.pdp_isolate(
model=clf_xgb,
dataset=df_target_encoded,
model_features=features_train,
feature=feature
)
fig, axes = pdp.pdp_plot(
pdp_isolate_out=pdp_bmi_xgboost,
feature_name=feature,
center=True,
x_quantile=True,
ncols=3,
plot_lines=True,
frac_to_plot=100
)
fig, axes = pdp.pdp_plot(
pdp_bmi_xgboost,
feature,
center=True,
x_quantile=True,
ncols=1,
plot_lines=True,
frac_to_plot=100,
which_classes=[7],
plot_pts_dist=True
)
"""
Important Observation:
- All the bmi bins have sufficient persons. Certain range of bmi is NOT missing.
- bmi curve is below the red dotted line ==> negative impact with prediction
- bmi increase upto 0.44 has almost no impact.
- bmi increase 0.44 - 0.55 ==> less policy accepted
- bmi increase 0.55+ ==> saturates and still less policy accepted.
""";
df.iloc[:2,-10:]
two_features = ['BMI', 'Medical_History_4']
fig, axes, summary_df = info_plots.target_plot_interact(
df=df_target_encoded,
features=two_features,
feature_names=two_features,
target='Response_8'
)
fig, axes, summary_df = info_plots.actual_plot_interact(
model=clf_xgb,
X=df_Xtrain,
features=two_features,
feature_names=two_features,
ncols=1,
which_classes=[7]
)
SHAP = SHapley Additive exPlanations
import shap
shap.__version__
explainer = shap.TreeExplainer(clf_xgb)
shap_values = explainer.shap_values(df_Xtest)
type(shap_values), type(explainer.expected_value), type(shap_values[0])
np.array(shap_values).shape
np.array(explainer.expected_value).shape
df_Xtest.shape
cmap = plt.get_cmap("tab10")
colors = cmap.colors # tuple of tuples
# colors = sns.color_palette('husl',8)
# colors = [(1, 0, 0), (0, 1, 0), (0, 0, 1)] # etc
# get class ordering from shap values
class_inds = np.argsort([-np.abs(shap_values[i]).mean() for i in range(len(shap_values))])
# create listed colormap
from matplotlib import colors as plt_colors
cmap = plt_colors.ListedColormap(np.array(colors)[class_inds])
shap.summary_plot(shap_values, df_Xtest,
class_names = [f'Response_{i+1}' for i in range(8)],
color=cmap,
plot_type="bar")
targetNum = 7
title = " "*20 + f"SHAP plot for Response {targetNum+1}"
print(title)
shap.summary_plot(shap_values[targetNum], df_Xtest,
title = title, # title dont work.
class_names = [f'Response_{targetNum+1}'],
color=colors[targetNum], # NOTE: colors[0] not colors[1] for Response_1
plot_type="bar",
plot_size = (12,8)
)
"""
Note: By default the colors in multiclass shap summary plot
are determined by mean of shap values.
They are NOT in the same order given in cmap.colors.
""";
shap.summary_plot(shap_values[targetNum], df_Xtest)
"""
Important Observations:
Lets assume that response_8 is the policy grant and other response(1-7) are reject.
1. Features are sorted in descending order of its importance.
2. BMI has High (red in colour) and negative (less than 0) effect on the target.
This means higher the BMI, higher the rejection.
3. Conversely, Med Hist 4 has High (red) and positive (greater than 0)
effect on the target.
This means that the higher the value of Med Hist 4,
the chances are higher for policy getting accepted.
""";
# shap.force_plot(explainer.expected_value[0],
# shap_values[0],
# matplotlib=False,
# text_rotation=90)
# this is too slow, we can use only first 1000 rows.
shap_values[0].shape
x = shap_values[0][:100,:]
x.shape
explainer.expected_value[7]
# target = Response_1
# row = only first few rows
targetNum = 7 # 7 is Response_8
shap.initjs()
shap.force_plot(explainer.expected_value[targetNum],
shap_values[targetNum][:100,:], # take only first N rows
feature_names = features_train,
matplotlib=False,
text_rotation=90)
# target = Response_1
# row = only one row (only one customer)
shap.initjs()
rowNum = 0
targetNum = 7 # 7 is Response_8
print(f'Average shap for the group: {explainer.expected_value[targetNum]}')
shap.force_plot(explainer.expected_value[targetNum],
shap_values[targetNum][rowNum,:],
feature_names = features_train,
matplotlib=True,
text_rotation=90 # only works when matplotlib=True
)
"""
We can visualize feature attributions such as Shapley values as "forces".
Each feature value is a force that either increases or decreases the prediction.
The prediction starts from the baseline.
The baseline for Shapley values is the average of all predictions.
In the plot, each Shapley value is an arrow that pushes to increase
(positive value) or decrease (negative value) the prediction.
These forces balance each other out at the actual prediction of the data instance.
The baseline -- the average predicted probability -- is 1.53.
The person has prediction = -1.71 < baseline
The person's application likely NOT to be accepted.
MH = medical history
MK = medical keyword
From summary_plot:
higher is better: MH_4_23_33_40
lower is better : MH_30_32 MK_13_3 Insured_info_2_7 bmi wt bmi_wt
Although medical_history_23 is high for this particular person, the features
such as wt bmi_age are high and they offset the goodness of high
medical_history_23 and overall result is bad.
The persons application is not likely to be accepted.
""";
df_ypreds.head(6)
# target = Response_1
# row = only one row (only one customer)
shap.initjs()
rowNum = 2
targetNum = 7 # 7 is Response_8
print(f'Average shap for the group: {explainer.expected_value[targetNum]}')
shap.force_plot(explainer.expected_value[targetNum],
shap_values[targetNum][rowNum,:],
feature_names = features_train,
matplotlib=True,
text_rotation=90 # only works when matplotlib=True
)
"""
The baseline -- the average predicted probability -- is 1.53.
The person has prediction = 1.86
which is much higher than the baseline 1.53.
MH = medical history
MK = medical keyword
From summary_plot:
higher is better: MH_4_23_33_40
lower is better : MH_30_32 MK_13_3 Insured_info_2_7 bmi wt bmi_wt
Effect is GOOD. The person's application is likely to be accepted.
""";
shap.dependence_plot(feature, shap_values[0], df_Xtest)
# features_train
shap.dependence_plot(feature, shap_values[0], df_Xtest,
interaction_index='Med_Keywords_Count')
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))