Description¶

In this notebook we will explore the explainability of a multiclass XGBoost classifier (eight response classes).

Imports¶

import time,os,json
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot') # use the ggplot style for all matplotlib figures
SEED=100 # random state
time_start_notebook = time.time()
home = os.path.expanduser('~')
[(x.__name__,x.__version__) for x in [np,pd,sns]]

[('numpy', '1.18.4'), ('pandas', '1.0.3'), ('seaborn', '0.10.1')]

%%capture
# capture will not print in notebook

import os
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    ## model evaluation
    !pip install -U watermark
    !pip install -U xgboost
    !pip install -U eli5
    !pip install -U shap
    !pip install -U pdpbox
    !pip install -U yellowbrick
    !pip install -U lime

    #### print
    print('Environment: Google Colaboratory.')

# NOTE: If we update modules in gcolab, we need to restart runtime.

%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;

%load_ext watermark
%watermark -a "Bhishan Poudel" -d
%watermark -v -m -p numpy,scipy,pandas,seaborn,sklearn,xgboost,eli5,shap,pdpbox,yellowbrick -g

Bhishan Poudel 2020-06-22
CPython 3.7.7
IPython 7.13.0

numpy 1.18.4
scipy 1.4.1
pandas 1.0.3
seaborn 0.10.1
sklearn 0.0
xgboost 0.90
eli5 0.10.1
shap 0.35.0
pdpbox 0.2.0
yellowbrick 1.1

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.5.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit
Git hash   : 607b934d288205dad9565d6ebad47ae369cfe158

Load the data¶

def get_data(url: str = 'https://github.com/bhishanpdl/Datasets/blob/master/Prudential_Insurance/raw/train.csv.zip?raw=true',
             compression: str = 'zip') -> pd.DataFrame:
    """Load the Prudential Insurance training data and add engineered features.

    Parameters
    ----------
    url : str, optional
        Path or URL of the training CSV. Defaults to the zipped copy on
        GitHub, so calling ``get_data()`` with no arguments still works.
    compression : str, optional
        Compression passed to ``pandas.read_csv`` (default ``'zip'``).

    Returns
    -------
    pandas.DataFrame
        The data with ``Id``, ``Medical_History_10`` and
        ``Medical_History_24`` dropped, ``Product_Info_2`` and its two
        derived character columns integer-encoded, the ``BMI_Age`` and
        ``Med_Keywords_Count`` features appended, and every remaining
        missing value replaced by -1.
    """
    # read_csv already returns a fresh frame, so no defensive copy is needed.
    df = pd.read_csv(url, compression=compression)

    columns_to_drop = ['Id', 'Medical_History_10', 'Medical_History_24']
    df = df.drop(columns_to_drop, axis=1)

    # Split the two-character product code before it is integer-encoded.
    df['Product_Info_2_char'] = df['Product_Info_2'].str[0]
    df['Product_Info_2_num'] = df['Product_Info_2'].str[1]

    # factorize categorical variables
    for col in ['Product_Info_2', 'Product_Info_2_char', 'Product_Info_2_num']:
        df[col] = pd.factorize(df[col])[0]

    df['BMI_Age'] = df['BMI'] * df['Ins_Age']

    # Row-wise sum over all Medical_Keyword_* columns.
    med_keyword_columns = df.columns[df.columns.str.startswith('Medical_Keyword_')]
    df['Med_Keywords_Count'] = df[med_keyword_columns].sum(axis=1)

    # Fill missing values last so the engineered columns are covered too.
    df = df.fillna(-1)

    return df

df = get_data()
print(df.shape)
df.isna().sum().sum(), df.sum().sum()

(59381, 129)

(0, 26897356.818315115)

Train-test Split with Stratify¶

from sklearn.model_selection import train_test_split

target = 'Response'

# Stratified 80/20 split: predictors are every column except the target,
# and the class proportions of the target are preserved in both parts.
df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = train_test_split(
    df.drop(columns=[target]),
    df[target],
    stratify=df[target],
    random_state=SEED,
    test_size=0.2,
)

# Flat numpy copies of the label series.
ytrain = np.ravel(ser_ytrain.to_numpy())
ytest = np.ravel(ser_ytest.to_numpy())

# Feature names in training-column order.
features_train = list(df_Xtrain.columns)
features_train

Modelling xgboost classifier¶

import xgboost as xgb
from xgboost import XGBClassifier

xgb.__version__

'0.90'

clf_xgb = XGBClassifier(objective= 'multi:softprob', random_state=SEED,n_jobs=-1)
clf_xgb

XGBClassifier(n_jobs=-1, objective='multi:softprob', random_state=100)

%%time
clf_xgb.fit(df_Xtrain, ser_ytrain)

CPU times: user 3min 18s, sys: 665 ms, total: 3min 19s
Wall time: 3min 36s

XGBClassifier(n_jobs=-1, objective='multi:softprob', random_state=100)

# Predicted labels for the held-out rows.
ypreds = clf_xgb.predict(df_Xtest)

# True vs predicted labels, plus a 0/1 flag marking rows whose true label is 8.
df_ypreds = pd.DataFrame({'ytest': ytest, 'ypreds': ypreds})
df_ypreds['is_Response8'] = (df_ypreds['ytest'] == 8).astype(int)
df_ypreds.head(10)

Feature Importances¶

# xgb.plot_importance?

# importance_type = 'weight', 'gain', 'cover'

# feature importance
fig,ax = plt.subplots(figsize=(12,8))
xgb.plot_importance(clf_xgb,ax=ax,max_num_features=10,importance_type='gain')
plt.show()

df_imp = pd.DataFrame({'feature': features_train,
                      'importance': clf_xgb.feature_importances_})

df_imp.sort_values('importance', ascending=False)\
  .head(10).style.background_gradient(subset=['importance'])

df_imp.sort_values('importance', ascending=False)\
  .head(10).style.bar(subset=['importance'],align='mid',color='pink')

from yellowbrick.model_selection import FeatureImportances

# FeatureImportances?

# plt.colormaps()

plt.figure(figsize=(8,24))
viz = FeatureImportances(clf_xgb,colormap='hot_r')
viz.fit(df_Xtrain, ytrain)
viz.show()

<matplotlib.axes._subplots.AxesSubplot at 0x7fc0940bb550>

Model Evaluation: using eli5¶

import eli5
from eli5.sklearn import PermutationImportance

eli5: Permutation Importance show weights¶

# Seed the column shuffling so the permutation importances are reproducible
# between runs, in line with the SEED used for the split and the model.
perm = PermutationImportance(clf_xgb, random_state=SEED).fit(df_Xtest, ser_ytest)

eli5.show_weights(perm, feature_names = df_Xtrain.columns.tolist(), top=50)

eli5: explain weights¶

eli5.explain_weights_df(perm, feature_names=features_train)\
  .head(10).style.background_gradient(subset=['weight'])

eli5: show prediction¶

eli5.show_prediction(clf_xgb, df_Xtest.iloc[0,:],show_feature_values=True)

Model Evaluation: using PDP¶

https://github.com/SauceCat/PDPbox/blob/master/tutorials/pdpbox_multiclass_classification.ipynb

pdp.pdp_isolate(model, dataset, model_features,
feature, num_grid_points=10, 
grid_type='percentile', percentile_range=None,
grid_range=None, cust_grid_points=None, 
memory_limit=0.5, n_jobs=1, predict_kwds=None, 
data_transformer=None)

make sure n_jobs=1 when you are using XGBoost model.

import pdpbox
from pdpbox import pdp
from pdpbox import info_plots

pdpbox.__version__

'0.2.0'

feature = 'BMI'

PDP: pdp isolate¶

pdp.pdp_isolate(model, dataset, model_features,
feature, num_grid_points=10,
grid_type='percentile', percentile_range=None, 
grid_range=None, cust_grid_points=None, 
memory_limit=0.5, n_jobs=1, predict_kwds=None,
data_transformer=None)

%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999

# Create the data that we will plot
pdp_goals = pdp.pdp_isolate(model=clf_xgb,
                dataset=df_Xtest,
                model_features=df_Xtest.columns.tolist(),
                feature=feature,
                n_jobs=1) # make sure n_jobs=1 when you are using XGBoost model.

# plot it
pdp.pdp_plot(pdp_goals, feature)
plt.show()

"""
Assume Response 8 means the policy was accepted.

Look at class 7 (Response 8).

The bmi line is below the red dotted line ==> higher bmi, lower chance of the policy being granted.

0 to 0.4   ==> increasing bmi has little effect
0.4 to 0.6 ==> higher bmi, higher rejection
0.6+       ==> the effect saturates.

""";

np.unique(df[target])

array([1, 2, 3, 4, 5, 6, 7, 8])

"""
Our target names are 1-8, but default pdp-box class names are 0-7.

""";

df_target_encoded = pd.get_dummies(df,columns=[target],drop_first=False)
df_target_encoded.iloc[:2,-10:]

features_train = df.columns.drop(target)
features_train

Index(['Product_Info_1', 'Product_Info_2', 'Product_Info_3', 'Product_Info_4',
       'Product_Info_5', 'Product_Info_6', 'Product_Info_7', 'Ins_Age', 'Ht',
       'Wt',
       ...
       'Medical_Keyword_43', 'Medical_Keyword_44', 'Medical_Keyword_45',
       'Medical_Keyword_46', 'Medical_Keyword_47', 'Medical_Keyword_48',
       'Product_Info_2_char', 'Product_Info_2_num', 'BMI_Age',
       'Med_Keywords_Count'],
      dtype='object', length=128)

target_cols = [f'Response_{i}' for i in range(1,9)]
target_cols

fig, axes, summary_df = info_plots.target_plot(
    df=df_target_encoded,
    feature=feature,
    feature_name=feature, 
    target=['Response_8']
)

# we can see when bmi increases, then average response8 decreases.
# high bmi ==> low response8 ==> low acceptance.

summary_df

check prediction distribution¶

info_plots.actual_plot(model, X, feature,
feature_name, num_grid_points=10,
grid_type='percentile', percentile_range=None,
grid_range=None, cust_grid_points=None,
show_percentile=False, show_outliers=False,
endpoint=True, which_classes=None,
predict_kwds=None, ncols=2, figsize=None, 
plot_params=None)

Parameters
----------

model: a fitted sklearn model
X: pandas DataFrame
    data set on which the model is trained
which_classes: list, optional, default=None
    which classes to plot, only use when it is a multi-class problem

fig, axes, summary_df = info_plots.actual_plot(
    model=clf_xgb, 
    X=df_Xtrain, 
    feature=feature, 
    feature_name=feature, 
    which_classes=[0],
    predict_kwds={},
)

partial dependence plot (pdp)¶

%%time 
pdp_bmi_xgboost = pdp.pdp_isolate(
    model=clf_xgb,
    dataset=df_target_encoded, 
    model_features=features_train, 
    feature=feature
)

CPU times: user 22.5 s, sys: 312 ms, total: 22.8 s
Wall time: 23.5 s

fig, axes = pdp.pdp_plot(
    pdp_isolate_out=pdp_bmi_xgboost, 
    feature_name=feature, 
    center=True, 
    x_quantile=True, 
    ncols=3, 
    plot_lines=True, 
    frac_to_plot=100
)

fig, axes = pdp.pdp_plot(
    pdp_bmi_xgboost, 
    feature, 
    center=True, 
    x_quantile=True, 
    ncols=1, 
    plot_lines=True, 
    frac_to_plot=100,
    which_classes=[7], 
    plot_pts_dist=True
)

"""
Important Observation:

- All the bmi bins have sufficient persons. Certain range of bmi is NOT missing.
- bmi curve is below the red dotted line ==> negative impact with prediction
- bmi increase upto 0.44 has almost no impact.
- bmi increase 0.44 - 0.55 ==> less policy accepted
- bmi increase 0.55+       ==> saturates and still less policy accepted.

""";

Interaction between two variables: bmi and Medical_History_4 with Target¶

df.iloc[:2,-10:]

two_features = ['BMI', 'Medical_History_4']

fig, axes, summary_df = info_plots.target_plot_interact(
    df=df_target_encoded,
    features=two_features,
    feature_names=two_features, 
    target='Response_8'
)

prediction distribution through feature combination of 'BMI' and 'Medical_History_4'¶

fig, axes, summary_df = info_plots.actual_plot_interact(
    model=clf_xgb, 
    X=df_Xtrain, 
    features=two_features, 
    feature_names=two_features,
    ncols=1,
    which_classes=[7]
)

Model Evaluation: plots using SHAP¶

SHAP = SHapley Additive exPlanations

import shap
shap.__version__

'0.35.0'

Get SHAP Values¶

explainer = shap.TreeExplainer(clf_xgb)
shap_values = explainer.shap_values(df_Xtest)

type(shap_values), type(explainer.expected_value), type(shap_values[0])

(list, list, numpy.ndarray)

np.array(shap_values).shape

(8, 11877, 128)

 np.array(explainer.expected_value).shape

(8,)

df_Xtest.shape

(11877, 128)

SHAP: Summary Plot¶

cmap = plt.get_cmap("tab10")
colors = cmap.colors # tuple of tuples
# colors = sns.color_palette('husl',8)
# colors = [(1, 0, 0), (0, 1, 0), (0, 0, 1)] # etc

# Order the classes by mean absolute SHAP value, largest first.
neg_mean_abs_shap = [-np.abs(class_shap).mean() for class_shap in shap_values]
class_inds = np.argsort(neg_mean_abs_shap)

# Build a listed colormap whose entries follow that class ordering.
from matplotlib import colors as plt_colors
ordered_palette = np.array(colors)[class_inds]
cmap = plt_colors.ListedColormap(ordered_palette)

shap.summary_plot(
    shap_values,
    df_Xtest,
    plot_type="bar",
    color=cmap,
    class_names=[f'Response_{i+1}' for i in range(8)],
)

targetNum = 7
title = "  "*20 + f"SHAP plot for Response {targetNum+1}"
print(title)
shap.summary_plot(shap_values[targetNum], df_Xtest,
        title = title, # title dont work.
        class_names = [f'Response_{targetNum+1}'],
        color=colors[targetNum], # NOTE: colors[0] not colors[1] for Response_1
        plot_type="bar",
        plot_size = (12,8)
        )

"""
Note: By default the colors in multiclass shap summary plot
are determined by mean of shap values.

They are NOT in the same order given in cmap.colors.

""";

                                        SHAP plot for Response 8

shap.summary_plot(shap_values[targetNum], df_Xtest)

"""
Important Observations:

Let's assume that response_8 means the policy is granted and the other responses (1-7) mean it is rejected.

1. Features are sorted in descending order of their importance.
2. BMI has High (red in colour) and negative (less than 0) effect on the target.
   This means higher the BMI, higher the rejection.
3. Conversely, Med Hist 4 has High (red) and positive (greater than 0)
   effect on the target. 
   This means that the higher the value of Med Hist 4, 
   the chances are higher for policy getting accepted.

""";

SHAP: Force Plot¶

https://christophm.github.io/interpretable-ml-book/shap.html

# shap.force_plot(explainer.expected_value[0],
#                 shap_values[0],
#                 matplotlib=False,
#                 text_rotation=90)

# this is too slow on the full test set, so below we use only the first 100 rows.

shap_values[0].shape

(11877, 128)

x = shap_values[0][:100,:]
x.shape

(100, 128)

explainer.expected_value[7]

1.5381206

# target = Response_8 (targetNum is the zero-based class index)
# rows = only the first 100 rows of the SHAP values
targetNum = 7 # 7 is Response_8
shap.initjs()
shap.force_plot(explainer.expected_value[targetNum],
                shap_values[targetNum][:100,:], # take only first N rows
                feature_names = features_train,
                matplotlib=False,
                text_rotation=90)

# target = Response_8 (targetNum is the zero-based class index)
# row = only one row (only one customer): row 0 of the SHAP values
shap.initjs()
rowNum = 0
targetNum = 7 # 7 is Response_8

# expected_value[targetNum] is the baseline the force plot starts from
print(f'Average shap for the group: {explainer.expected_value[targetNum]}')

shap.force_plot(explainer.expected_value[targetNum],
                shap_values[targetNum][rowNum,:],
                feature_names = features_train,
                matplotlib=True,
                text_rotation=90 # only works when matplotlib=True
                )

Average shap for the group: 1.5381206274032593

"""
We can visualize feature attributions such as Shapley values as "forces". 
Each feature value is a force that either increases or decreases the prediction.
The prediction starts from the baseline. 
The baseline for Shapley values is the average of all predictions. 
In the plot, each Shapley value is an arrow that pushes to increase 
(positive value) or decrease (negative value) the prediction.
These forces balance each other out at the actual prediction of the data instance.


The baseline -- the average raw model output (margin) for Response_8, not a
probability, which is why it can exceed 1 -- is 1.53.
The person has prediction = -1.71 < baseline.
The person's application is likely NOT to be accepted.

MH = medical history
MK = medical keyword

From summary_plot:
higher is better:  MH_4_23_33_40
lower is better :  MH_30_32 MK_13_3 Insured_info_2_7 bmi wt bmi_wt

Although medical_history_23 is high for this particular person, the features
such as wt bmi_age are high and they offset the goodness of high
medical_history_23 and overall result is bad.

The persons application is not likely to be accepted.
""";

df_ypreds.head(6)

# target = Response_8 (targetNum is the zero-based class index)
# row = only one row (only one customer): row 2 of the SHAP values
shap.initjs()
rowNum = 2
targetNum = 7 # 7 is Response_8

# expected_value[targetNum] is the baseline the force plot starts from
print(f'Average shap for the group: {explainer.expected_value[targetNum]}')

shap.force_plot(explainer.expected_value[targetNum],
                shap_values[targetNum][rowNum,:],
                feature_names = features_train,
                matplotlib=True,
                text_rotation=90 # only works when matplotlib=True
                )

Average shap for the group: 1.5381206274032593

"""

The baseline -- the average raw model output (margin) for Response_8, not a
probability, which is why it can exceed 1 -- is 1.53.
The person has prediction = 1.86,
which is higher than the baseline 1.53.

MH = medical history
MK = medical keyword

From summary_plot:
higher is better:  MH_4_23_33_40
lower is better :  MH_30_32 MK_13_3 Insured_info_2_7 bmi wt bmi_wt

Effect is GOOD. The person's application is likely to be accepted.

""";

SHAP: Dependence Plot¶

shap.dependence_plot(feature, shap_values[0], df_Xtest)

# features_train

shap.dependence_plot(feature, shap_values[0], df_Xtest,
                    interaction_index='Med_Keywords_Count')

Time Taken¶

# Wall-clock seconds since the notebook's first cell ran.
time_taken = time.time() - time_start_notebook

# Round to whole seconds once, before splitting. Rounding each component
# separately could print e.g. "1 min 60 secs" for 119.7 seconds.
total_secs = int(round(time_taken))
hours, rem_secs = divmod(total_secs, 60 * 60)
mins, secs = divmod(rem_secs, 60)
print('Time taken to run whole notebook: {:.0f} hr '
      '{:.0f} min {:.0f} secs'.format(hours, mins, secs))

Time taken to run whole notebook: 0 hr 15 min 8 secs

	feature	importance
58	Medical_History_23	0.122207
90	Medical_Keyword_15	0.080580
40	Medical_History_4	0.076142
10	BMI	0.050464
50	Medical_History_15	0.040969
78	Medical_Keyword_3	0.031732
127	Med_Keywords_Count	0.030037
3	Product_Info_4	0.028124
12	Employment_Info_2	0.021935
74	Medical_History_40	0.021649

	feature	importance
58	Medical_History_23	0.122207
90	Medical_Keyword_15	0.080580
40	Medical_History_4	0.076142
10	BMI	0.050464
50	Medical_History_15	0.040969
78	Medical_Keyword_3	0.031732
127	Med_Keywords_Count	0.030037
3	Product_Info_4	0.028124
12	Employment_Info_2	0.021935
74	Medical_History_40	0.021649

Weight	Feature
0.1528 ± 0.0055	BMI
0.0832 ± 0.0064	Medical_History_4
0.0692 ± 0.0034	Medical_History_15
0.0372 ± 0.0016	Product_Info_4
0.0363 ± 0.0016	Medical_Keyword_15
0.0241 ± 0.0023	Medical_Keyword_3
0.0035 ± 0.0007	Medical_History_40
0.0029 ± 0.0012	Ins_Age
0.0027 ± 0.0013	BMI_Age
0.0026 ± 0.0004	Medical_History_32
0.0025 ± 0.0012	Medical_History_30
0.0018 ± 0.0004	Medical_History_28
0.0017 ± 0.0005	Medical_History_39
0.0016 ± 0.0011	Medical_History_23
0.0014 ± 0.0013	Med_Keywords_Count
0.0013 ± 0.0009	Employment_Info_2
0.0011 ± 0.0011	Insurance_History_5
0.0011 ± 0.0004	InsuredInfo_7
0.0009 ± 0.0005	InsuredInfo_5
0.0007 ± 0.0005	Medical_Keyword_23
0.0007 ± 0.0002	Insurance_History_1
0.0006 ± 0.0010	Family_Hist_3
0.0005 ± 0.0003	Medical_Keyword_38
0.0005 ± 0.0004	InsuredInfo_2
0.0005 ± 0.0003	Employment_Info_6
0.0004 ± 0.0007	InsuredInfo_6
0.0004 ± 0.0005	Product_Info_2_num
0.0004 ± 0.0001	Medical_Keyword_41
0.0003 ± 0.0014	Family_Hist_4
0.0003 ± 0.0001	Medical_History_35
0.0003 ± 0.0001	Medical_History_11
0.0003 ± 0.0002	Medical_History_7
0.0003 ± 0.0003	Medical_History_33
0.0003 ± 0.0002	Product_Info_5
0.0002 ± 0.0007	Medical_History_18
0.0002 ± 0.0002	Medical_History_3
0.0002 ± 0.0003	Medical_History_27
0.0002 ± 0.0007	Medical_History_1
0.0002 ± 0.0009	Medical_History_13
0.0002 ± 0.0000	Medical_Keyword_25
0.0002 ± 0.0001	Medical_Keyword_37
0.0001 ± 0.0007	Medical_History_5
0.0001 ± 0.0003	Medical_History_6
0.0001 ± 0.0000	Medical_Keyword_45
0.0001 ± 0.0004	Insurance_History_2
0.0001 ± 0.0004	Product_Info_1
0.0001 ± 0.0001	Insurance_History_8
0.0000 ± 0.0001	Medical_History_14
0.0000 ± 0.0001	Medical_Keyword_13
0.0000 ± 0.0001	Medical_History_16
… 78 more …

	feature	weight	std
0	BMI	0.152766	0.002728
1	Medical_History_4	0.083220	0.003215
2	Medical_History_15	0.069193	0.001709
3	Product_Info_4	0.037248	0.000802
4	Medical_Keyword_15	0.036305	0.000815
5	Medical_Keyword_3	0.024131	0.001142
6	Medical_History_40	0.003536	0.000365
7	Ins_Age	0.002947	0.000623
8	BMI_Age	0.002661	0.000666
9	Medical_History_32	0.002560	0.000189

	BMI_Age	Med_Keywords_Count	Response_1	Response_2	Response_3	Response_4	Response_5	Response_6	Response_7	Response_8
0	0.207304	0	0	0	0	0	0	0	0	1
1	0.016256	0	0	0	0	1	0	0	0	0

	x	display_column	value_lower	value_upper	count	Response_8
0	0	[0, 0.34)	0.000000	0.336002	6597	0.632257
1	1	[0.34, 0.38)	0.336002	0.376807	6436	0.571628
2	2	[0.38, 0.41)	0.376807	0.410593	6753	0.499630
3	3	[0.41, 0.44)	0.410593	0.438952	6592	0.426729
4	4	[0.44, 0.47)	0.438952	0.466858	6488	0.376695
5	5	[0.47, 0.5)	0.466858	0.501185	6704	0.299523
6	6	[0.5, 0.55)	0.501185	0.545946	6590	0.124279
7	7	[0.55, 0.62)	0.545946	0.619419	6581	0.023553
8	8	[0.62, 1]	0.619419	1.000000	6640	0.003916

Table of Contents

Description¶

Imports¶

Load the data¶

Train-test Split with Stratify¶

Modelling xgboost classifier¶

Feature Importances¶

Model Evaluation: using eli5¶

eli5: Permutation Importance show weights¶

eli5: explain weights¶

eli5: show prediction¶

Model Evaluation: using PDP¶

PDP: pdp isolate¶

check prediction distribution¶

partial dependence plot (pdp)¶

Interaction between two variables: bmi and Medical_History_4 with Target¶

prediction distribution through feature combination of 'BMI' and 'Medical_History_4'¶

Model Evaluation: plots using SHAP¶

Get SHAP Values¶

SHAP: Summary Plot¶

SHAP: Force Plot¶

SHAP: Dependence Plot¶

Time Taken¶