Description

In this notebook we explore the explainability of an XGBoost classifier. The model is multi-class (responses 1-8), and the interpretation later focuses on Response_8 (policy accepted) versus the rest.

Imports

In [5]:
import time,os,json
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot')
SEED = 100 # random state
time_start_notebook = time.time()
home = os.path.expanduser('~')
[(x.__name__,x.__version__) for x in [np,pd,sns]]
Out[5]:
[('numpy', '1.18.4'), ('pandas', '1.0.3'), ('seaborn', '0.10.1')]
In [6]:
%%capture
# capture will not print in notebook

import os
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    ## model evaluation
    !pip install -U watermark
    !pip install -U xgboost
    !pip install -U eli5
    !pip install -U shap
    !pip install -U pdpbox
    !pip install -U yellowbrick
    !pip install -U lime

    #### print
    print('Environment: Google Colaboratory.')

# NOTE: If we update modules in Google Colab, we need to restart the runtime.
In [7]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
In [8]:
%load_ext watermark
%watermark -a "Bhishan Poudel" -d
%watermark -v -m -p numpy,scipy,pandas,seaborn,sklearn,xgboost,eli5,shap,pdpbox,yellowbrick -g
Bhishan Poudel 2020-06-22
CPython 3.7.7
IPython 7.13.0

numpy 1.18.4
scipy 1.4.1
pandas 1.0.3
seaborn 0.10.1
sklearn 0.0
xgboost 0.90
eli5 0.10.1
shap 0.35.0
pdpbox 0.2.0
yellowbrick 1.1

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.5.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit
Git hash   : 607b934d288205dad9565d6ebad47ae369cfe158

Load the data

In [9]:
def get_data():
    df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/Prudential_Insurance/raw/train.csv.zip?raw=true',compression='zip')
    df = df.copy()
    columns_to_drop = ['Id', 'Medical_History_10','Medical_History_24']
    df = df.drop(columns_to_drop,axis=1)
    df['Product_Info_2_char'] = df.Product_Info_2.str[0]
    df['Product_Info_2_num'] = df.Product_Info_2.str[1]

    # factorize categorical variables
    df['Product_Info_2'] = pd.factorize(df['Product_Info_2'])[0]
    df['Product_Info_2_char'] = pd.factorize(df['Product_Info_2_char'])[0]
    df['Product_Info_2_num'] = pd.factorize(df['Product_Info_2_num'])[0]

    df['BMI_Age'] = df['BMI'] * df['Ins_Age']

    med_keyword_columns = df.columns[df.columns.str.startswith('Medical_Keyword_')]
    df['Med_Keywords_Count'] = df[med_keyword_columns].sum(axis=1)
    df = df.fillna(-1)

    return df

df = get_data()
print(df.shape)
df.isna().sum().sum(), df.sum().sum()
(59381, 129)
Out[9]:
(0, 26897356.818315115)

Train-test Split with Stratify

In [10]:
from sklearn.model_selection import train_test_split

target = 'Response'

df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = train_test_split(
    df.drop(target,axis=1), df[target],
    test_size=0.2, random_state=SEED, stratify=df[target])

ytrain = ser_ytrain.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()

features_train = df_Xtrain.columns.to_list()
features_train
Out[10]:
['Product_Info_1', 'Product_Info_2', 'Product_Info_3', 'Product_Info_4', 'Product_Info_5', 'Product_Info_6', 'Product_Info_7', 'Ins_Age', 'Ht', 'Wt', 'BMI', 'Employment_Info_1', 'Employment_Info_2', 'Employment_Info_3', 'Employment_Info_4', 'Employment_Info_5', 'Employment_Info_6', 'InsuredInfo_1', 'InsuredInfo_2', 'InsuredInfo_3', 'InsuredInfo_4', 'InsuredInfo_5', 'InsuredInfo_6', 'InsuredInfo_7', 'Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3', 'Insurance_History_4', 'Insurance_History_5', 'Insurance_History_7', 'Insurance_History_8', 'Insurance_History_9', 'Family_Hist_1', 'Family_Hist_2', 'Family_Hist_3', 'Family_Hist_4', 'Family_Hist_5', 'Medical_History_1', 'Medical_History_2', 'Medical_History_3', 'Medical_History_4', 'Medical_History_5', 'Medical_History_6', 'Medical_History_7', 'Medical_History_8', 'Medical_History_9', 'Medical_History_11', 'Medical_History_12', 'Medical_History_13', 'Medical_History_14', 'Medical_History_15', 'Medical_History_16', 'Medical_History_17', 'Medical_History_18', 'Medical_History_19', 'Medical_History_20', 'Medical_History_21', 'Medical_History_22', 'Medical_History_23', 'Medical_History_25', 'Medical_History_26', 'Medical_History_27', 'Medical_History_28', 'Medical_History_29', 'Medical_History_30', 'Medical_History_31', 'Medical_History_32', 'Medical_History_33', 'Medical_History_34', 'Medical_History_35', 'Medical_History_36', 'Medical_History_37', 'Medical_History_38', 'Medical_History_39', 'Medical_History_40', 'Medical_History_41', 'Medical_Keyword_1', 'Medical_Keyword_2', 'Medical_Keyword_3', 'Medical_Keyword_4', 'Medical_Keyword_5', 'Medical_Keyword_6', 'Medical_Keyword_7', 'Medical_Keyword_8', 'Medical_Keyword_9', 'Medical_Keyword_10', 'Medical_Keyword_11', 'Medical_Keyword_12', 'Medical_Keyword_13', 'Medical_Keyword_14', 'Medical_Keyword_15', 'Medical_Keyword_16', 'Medical_Keyword_17', 'Medical_Keyword_18', 'Medical_Keyword_19', 'Medical_Keyword_20', 'Medical_Keyword_21', 'Medical_Keyword_22', 'Medical_Keyword_23', 'Medical_Keyword_24', 'Medical_Keyword_25', 'Medical_Keyword_26', 'Medical_Keyword_27', 'Medical_Keyword_28', 'Medical_Keyword_29', 'Medical_Keyword_30', 'Medical_Keyword_31', 'Medical_Keyword_32', 'Medical_Keyword_33', 'Medical_Keyword_34', 'Medical_Keyword_35', 'Medical_Keyword_36', 'Medical_Keyword_37', 'Medical_Keyword_38', 'Medical_Keyword_39', 'Medical_Keyword_40', 'Medical_Keyword_41', 'Medical_Keyword_42', 'Medical_Keyword_43', 'Medical_Keyword_44', 'Medical_Keyword_45', 'Medical_Keyword_46', 'Medical_Keyword_47', 'Medical_Keyword_48', 'Product_Info_2_char', 'Product_Info_2_num', 'BMI_Age', 'Med_Keywords_Count']

Modelling the XGBoost Classifier

In [11]:
import xgboost as xgb
from xgboost import XGBClassifier

xgb.__version__
Out[11]:
'0.90'
In [12]:
clf_xgb = XGBClassifier(objective= 'multi:softprob', random_state=SEED,n_jobs=-1)
clf_xgb
Out[12]:
XGBClassifier(n_jobs=-1, objective='multi:softprob', random_state=100)
In [13]:
%%time
clf_xgb.fit(df_Xtrain, ser_ytrain)
CPU times: user 3min 18s, sys: 665 ms, total: 3min 19s
Wall time: 3min 36s
Out[13]:
XGBClassifier(n_jobs=-1, objective='multi:softprob', random_state=100)
In [14]:
ypreds = clf_xgb.predict(df_Xtest)

df_ypreds = pd.DataFrame({'ytest': ytest, 'ypreds': ypreds})
df_ypreds['is_Response8'] = df_ypreds['ytest'].eq(8).astype(int)
df_ypreds.head(10)
Out[14]:
ytest ypreds is_Response8
0 1 6 0
1 7 8 0
2 8 8 1
3 6 6 0
4 7 7 0
5 8 6 1
6 6 6 0
7 8 8 1
8 6 6 0
9 4 4 0

Feature Importances

In [15]:
# xgb.plot_importance?

# importance_type = 'weight', 'gain', 'cover'
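
A minimal sketch (not part of the original run) comparing the three importance types directly from the fitted booster; get_score() returns a dict keyed by feature name and silently drops features that were never used in a split:

# Hedged sketch: compare 'weight', 'gain' and 'cover' importances side by side.
# Assumes clf_xgb has been fitted on df_Xtrain as above.
booster = clf_xgb.get_booster()
df_imp_types = pd.DataFrame({t: pd.Series(booster.get_score(importance_type=t))
                             for t in ['weight', 'gain', 'cover']})
df_imp_types.sort_values('gain', ascending=False).head(10)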
In [16]:
# feature importance
fig,ax = plt.subplots(figsize=(12,8))
xgb.plot_importance(clf_xgb,ax=ax,max_num_features=10,importance_type='gain')
plt.show()
In [17]:
df_imp = pd.DataFrame({'feature': features_train,
                      'importance': clf_xgb.feature_importances_})

df_imp.sort_values('importance', ascending=False)\
  .head(10).style.background_gradient(subset=['importance'])
Out[17]:
feature importance
58 Medical_History_23 0.122207
90 Medical_Keyword_15 0.080580
40 Medical_History_4 0.076142
10 BMI 0.050464
50 Medical_History_15 0.040969
78 Medical_Keyword_3 0.031732
127 Med_Keywords_Count 0.030037
3 Product_Info_4 0.028124
12 Employment_Info_2 0.021935
74 Medical_History_40 0.021649
In [18]:
df_imp.sort_values('importance', ascending=False)\
  .head(10).style.bar(subset=['importance'],align='mid',color='pink')
Out[18]:
feature importance
58 Medical_History_23 0.122207
90 Medical_Keyword_15 0.080580
40 Medical_History_4 0.076142
10 BMI 0.050464
50 Medical_History_15 0.040969
78 Medical_Keyword_3 0.031732
127 Med_Keywords_Count 0.030037
3 Product_Info_4 0.028124
12 Employment_Info_2 0.021935
74 Medical_History_40 0.021649
In [19]:
from yellowbrick.model_selection import FeatureImportances
In [20]:
# FeatureImportances?
In [21]:
# plt.colormaps()
In [22]:
plt.figure(figsize=(8,24))
viz = FeatureImportances(clf_xgb,colormap='hot_r')
viz.fit(df_Xtrain, ytrain)
viz.show()
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fc0940bb550>

Model Evaluation: using eli5

In [23]:
import eli5
from eli5.sklearn import PermutationImportance

eli5: Permutation Importance show weights

In [24]:
perm = PermutationImportance(clf_xgb).fit(df_Xtest, ser_ytest)
In [25]:
eli5.show_weights(perm, feature_names = df_Xtrain.columns.tolist(), top=50)
Out[25]:
Weight Feature
0.1528 ± 0.0055 BMI
0.0832 ± 0.0064 Medical_History_4
0.0692 ± 0.0034 Medical_History_15
0.0372 ± 0.0016 Product_Info_4
0.0363 ± 0.0016 Medical_Keyword_15
0.0241 ± 0.0023 Medical_Keyword_3
0.0035 ± 0.0007 Medical_History_40
0.0029 ± 0.0012 Ins_Age
0.0027 ± 0.0013 BMI_Age
0.0026 ± 0.0004 Medical_History_32
0.0025 ± 0.0012 Medical_History_30
0.0018 ± 0.0004 Medical_History_28
0.0017 ± 0.0005 Medical_History_39
0.0016 ± 0.0011 Medical_History_23
0.0014 ± 0.0013 Med_Keywords_Count
0.0013 ± 0.0009 Employment_Info_2
0.0011 ± 0.0011 Insurance_History_5
0.0011 ± 0.0004 InsuredInfo_7
0.0009 ± 0.0005 InsuredInfo_5
0.0007 ± 0.0005 Medical_Keyword_23
0.0007 ± 0.0002 Insurance_History_1
0.0006 ± 0.0010 Family_Hist_3
0.0005 ± 0.0003 Medical_Keyword_38
0.0005 ± 0.0004 InsuredInfo_2
0.0005 ± 0.0003 Employment_Info_6
0.0004 ± 0.0007 InsuredInfo_6
0.0004 ± 0.0005 Product_Info_2_num
0.0004 ± 0.0001 Medical_Keyword_41
0.0003 ± 0.0014 Family_Hist_4
0.0003 ± 0.0001 Medical_History_35
0.0003 ± 0.0001 Medical_History_11
0.0003 ± 0.0002 Medical_History_7
0.0003 ± 0.0003 Medical_History_33
0.0003 ± 0.0002 Product_Info_5
0.0002 ± 0.0007 Medical_History_18
0.0002 ± 0.0002 Medical_History_3
0.0002 ± 0.0003 Medical_History_27
0.0002 ± 0.0007 Medical_History_1
0.0002 ± 0.0009 Medical_History_13
0.0002 ± 0.0000 Medical_Keyword_25
0.0002 ± 0.0001 Medical_Keyword_37
0.0001 ± 0.0007 Medical_History_5
0.0001 ± 0.0003 Medical_History_6
0.0001 ± 0.0000 Medical_Keyword_45
0.0001 ± 0.0004 Insurance_History_2
0.0001 ± 0.0004 Product_Info_1
0.0001 ± 0.0001 Insurance_History_8
0.0000 ± 0.0001 Medical_History_14
0.0000 ± 0.0001 Medical_Keyword_13
0.0000 ± 0.0001 Medical_History_16
… 78 more …

eli5: explain weights

In [26]:
eli5.explain_weights_df(perm, feature_names=features_train)\
  .head(10).style.background_gradient(subset=['weight'])
Out[26]:
feature weight std
0 BMI 0.152766 0.002728
1 Medical_History_4 0.083220 0.003215
2 Medical_History_15 0.069193 0.001709
3 Product_Info_4 0.037248 0.000802
4 Medical_Keyword_15 0.036305 0.000815
5 Medical_Keyword_3 0.024131 0.001142
6 Medical_History_40 0.003536 0.000365
7 Ins_Age 0.002947 0.000623
8 BMI_Age 0.002661 0.000666
9 Medical_History_32 0.002560 0.000189

eli5: show prediction

In [27]:
eli5.show_prediction(clf_xgb, df_Xtest.iloc[0,:],show_feature_values=True)
Out[27]:
Top features for each class (the eight contribution tables below follow this order):
y=1 (probability 0.178, score 0.301)
y=2 (probability 0.138, score 0.048)
y=3 (probability 0.006, score -3.172)
y=4 (probability 0.002, score -4.472)
y=5 (probability 0.306, score 0.843)
y=6 (probability 0.312, score 0.863)
y=7 (probability 0.045, score -1.079)
y=8 (probability 0.014, score -2.210)
Contribution? Feature Value
+0.244 Medical_History_18 2.000
+0.191 Medical_History_28 2.000
+0.131 Insurance_History_5 -1.000
+0.111 Medical_History_1 -1.000
+0.096 Family_Hist_4 -1.000
+0.084 Medical_History_23 1.000
+0.049 BMI_Age 0.274
+0.015 Medical_Keyword_34 0.000
+0.008 Family_Hist_1 3.000
+0.007 Family_Hist_2 -1.000
-0.000 Medical_History_35 1.000
-0.000 Insurance_History_7 3.000
-0.001 Medical_History_17 3.000
-0.001 Medical_History_5 1.000
-0.002 Medical_History_20 2.000
-0.003 Medical_Keyword_48 0.000
-0.003 Medical_History_33 3.000
-0.003 Medical_History_11 3.000
-0.003 Employment_Info_6 0.280
-0.004 Medical_Keyword_38 0.000
-0.007 Medical_History_27 3.000
-0.007 BMI 0.633
-0.009 Medical_History_7 2.000
-0.010 Medical_History_19 1.000
-0.011 Product_Info_2 4.000
-0.012 InsuredInfo_7 1.000
-0.013 InsuredInfo_5 1.000
-0.015 Medical_History_40 3.000
-0.018 Ins_Age 0.433
-0.022 Insurance_History_2 1.000
-0.022 Wt 0.372
-0.022 Product_Info_4 0.245
-0.022 Medical_History_15 -1.000
-0.029 Medical_History_30 2.000
-0.043 InsuredInfo_6 2.000
-0.043 Employment_Info_2 9.000
-0.044 Employment_Info_1 0.070
-0.054 Med_Keywords_Count 1.000
-0.060 Medical_Keyword_3 0.000
-0.073 Medical_History_13 3.000
-0.076 <BIAS> 1.000
Contribution? Feature Value
+0.372 Medical_History_18 2.000
+0.108 Medical_History_23 1.000
+0.060 BMI_Age 0.274
+0.059 Medical_Keyword_35 1.000
+0.020 Insurance_History_1 2.000
+0.014 Product_Info_2_char 0.000
+0.009 Medical_History_3 2.000
+0.007 Ins_Age 0.433
+0.006 Product_Info_2_num 3.000
+0.001 Medical_Keyword_38 0.000
-0.000 Medical_History_27 3.000
-0.001 Medical_History_11 3.000
-0.001 Medical_Keyword_36 0.000
-0.002 Medical_History_31 3.000
-0.002 Medical_History_22 2.000
-0.002 Medical_History_7 2.000
-0.003 InsuredInfo_2 2.000
-0.003 Medical_History_40 3.000
-0.004 Medical_History_4 2.000
-0.004 InsuredInfo_5 1.000
-0.004 Medical_History_6 3.000
-0.010 Medical_History_20 2.000
-0.012 Product_Info_4 0.245
-0.016 Medical_History_5 1.000
-0.017 Insurance_History_2 1.000
-0.030 Medical_History_13 3.000
-0.037 Product_Info_2 4.000
-0.038 Medical_History_39 3.000
-0.040 InsuredInfo_6 2.000
-0.052 <BIAS> 1.000
-0.055 Medical_Keyword_3 0.000
-0.058 Wt 0.372
-0.075 Med_Keywords_Count 1.000
-0.143 BMI 0.633
Contribution? Feature Value
+0.475 BMI 0.633
+0.100 Medical_History_41 1.000
+0.057 Insurance_History_5 -1.000
+0.030 Wt 0.372
+0.021 Medical_History_9 2.000
+0.012 Medical_Keyword_3 0.000
-0.001 Medical_Keyword_31 0.000
-0.024 InsuredInfo_3 8.000
-0.026 Product_Info_2 4.000
-0.038 Family_Hist_5 0.536
-0.049 Medical_History_36 2.000
-0.056 Family_Hist_3 0.578
-0.073 Product_Info_4 0.245
-0.142 BMI_Age 0.274
-1.500 Medical_History_15 -1.000
-1.958 <BIAS> 1.000
Contribution? Feature Value
+0.028 Medical_History_1 -1.000
+0.016 Insurance_History_5 -1.000
+0.013 Product_Info_4 0.245
+0.003 Medical_Keyword_3 0.000
+0.001 Medical_History_30 2.000
+0.001 InsuredInfo_2 2.000
-0.001 Employment_Info_2 9.000
-0.002 InsuredInfo_1 1.000
-0.003 Product_Info_3 26.000
-0.003 Family_Hist_2 -1.000
-0.005 Wt 0.372
-0.005 Ht 0.691
-0.005 Medical_Keyword_16 0.000
-0.009 Product_Info_2_num 3.000
-0.016 Family_Hist_4 -1.000
-0.017 Medical_History_23 1.000
-0.019 Product_Info_2 4.000
-0.019 Med_Keywords_Count 1.000
-0.020 Family_Hist_3 0.578
-0.020 Employment_Info_1 0.070
-0.274 BMI_Age 0.274
-1.192 BMI 0.633
-1.271 Medical_History_15 -1.000
-1.652 <BIAS> 1.000
Contribution? Feature Value
+1.453 BMI 0.633
+0.047 Family_Hist_4 -1.000
+0.044 BMI_Age 0.274
+0.039 Medical_Keyword_3 0.000
+0.031 Medical_History_15 -1.000
+0.030 Ins_Age 0.433
+0.020 Wt 0.372
+0.011 Family_Hist_2 -1.000
+0.010 InsuredInfo_3 8.000
+0.003 Employment_Info_6 0.280
+0.002 Insurance_History_5 -1.000
+0.001 Ht 0.691
+0.001 Medical_History_27 3.000
+0.000 InsuredInfo_2 2.000
+0.000 Employment_Info_4 0.000
-0.000 Product_Info_1 1.000
-0.003 Medical_History_5 1.000
-0.003 Product_Info_5 2.000
-0.004 Medical_History_25 1.000
-0.010 Medical_History_8 2.000
-0.039 Family_Hist_1 3.000
-0.043 Product_Info_4 0.245
-0.073 Family_Hist_5 0.536
-0.103 Medical_History_23 1.000
-0.178 Family_Hist_3 0.578
-0.394 <BIAS> 1.000
Contribution? Feature Value
+0.473 <BIAS> 1.000
+0.466 BMI 0.633
+0.061 Wt 0.372
+0.054 Product_Info_4 0.245
+0.054 Family_Hist_3 0.578
+0.037 BMI_Age 0.274
+0.016 Medical_History_15 -1.000
+0.008 Medical_History_39 3.000
+0.008 Medical_Keyword_3 0.000
+0.005 Product_Info_2 4.000
+0.001 Employment_Info_6 0.280
+0.001 Medical_Keyword_43 0.000
-0.001 InsuredInfo_7 1.000
-0.002 InsuredInfo_2 2.000
-0.007 Employment_Info_4 0.000
-0.010 Medical_Keyword_23 0.000
-0.010 Medical_History_30 2.000
-0.014 Medical_History_32 -1.000
-0.025 Ins_Age 0.433
-0.049 Family_Hist_4 -1.000
-0.202 Medical_History_4 2.000
Contribution? Feature Value
+0.191 Medical_Keyword_35 1.000
+0.174 <BIAS> 1.000
+0.070 Medical_History_4 2.000
+0.030 Medical_Keyword_3 0.000
+0.028 Product_Info_4 0.245
+0.028 Wt 0.372
+0.010 Medical_History_30 2.000
+0.004 Medical_History_39 3.000
+0.003 InsuredInfo_7 1.000
+0.003 Medical_Keyword_38 0.000
+0.003 Medical_History_32 -1.000
+0.001 InsuredInfo_2 2.000
-0.001 Family_Hist_4 -1.000
-0.009 Medical_History_15 -1.000
-0.012 Product_Info_1 1.000
-0.026 Family_Hist_1 3.000
-0.327 Ins_Age 0.433
-1.249 BMI 0.633
Contribution? Feature Value
+1.038 <BIAS> 1.000
+0.087 InsuredInfo_6 2.000
+0.039 Medical_History_4 2.000
+0.034 Medical_Keyword_15 0.000
+0.018 Product_Info_2_num 3.000
+0.018 Medical_Keyword_3 0.000
+0.010 Product_Info_4 0.245
+0.006 Medical_History_30 2.000
+0.005 Family_Hist_3 0.578
+0.004 Medical_History_32 -1.000
+0.004 Medical_History_40 3.000
+0.002 InsuredInfo_2 2.000
+0.002 InsuredInfo_7 1.000
+0.001 Medical_History_20 2.000
-0.003 Medical_Keyword_6 0.000
-0.004 Ht 0.691
-0.004 Employment_Info_4 0.000
-0.016 Family_Hist_5 0.536
-0.019 Family_Hist_4 -1.000
-0.023 Medical_History_15 -1.000
-0.037 Family_Hist_2 -1.000
-0.072 Medical_History_23 1.000
-0.121 Medical_Keyword_35 1.000
-0.177 BMI_Age 0.274
-0.341 Wt 0.372
-2.661 BMI 0.633

Model Evaluation: using PDP

pdp.pdp_isolate(model, dataset, model_features,
feature, num_grid_points=10, 
grid_type='percentile', percentile_range=None,
grid_range=None, cust_grid_points=None, 
memory_limit=0.5, n_jobs=1, predict_kwds=None, 
data_transformer=None)

Make sure n_jobs=1 when using an XGBoost model.
In [28]:
import pdpbox
from pdpbox import pdp
from pdpbox import info_plots

pdpbox.__version__
Out[28]:
'0.2.0'
In [29]:
feature = 'BMI'

PDP: pdp isolate

In [30]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999
In [31]:
# Create the data that we will plot
pdp_goals = pdp.pdp_isolate(model=clf_xgb,
                dataset=df_Xtest,
                model_features=df_Xtest.columns.tolist(),
                feature=feature,
                n_jobs=1) # make sure n_jobs=1 when you are using XGBoost model.

# plot it
pdp.pdp_plot(pdp_goals, feature)
plt.show()
In [32]:
"""
Assume Respons8 means policy accepted.

Look at class 7 (Response8)

The bmi line is below the red dotted line ==> higher bmi lower chance of policy being granted

0 to 0.4   ==> bmi increse has not much effect
0.4 to 0.6 ==> higher bmi higher rejection
0.6+       ==> the value saturates.

""";
In [33]:
np.unique(df[target])
Out[33]:
array([1, 2, 3, 4, 5, 6, 7, 8])
In [34]:
"""
Our target names are 1-8, but default pdp-box class names are 0-7.

""";
In [35]:
df_target_encoded = pd.get_dummies(df,columns=[target],drop_first=False)
df_target_encoded.iloc[:2,-10:]
Out[35]:
BMI_Age Med_Keywords_Count Response_1 Response_2 Response_3 Response_4 Response_5 Response_6 Response_7 Response_8
0 0.207304 0 0 0 0 0 0 0 0 1
1 0.016256 0 0 0 0 1 0 0 0 0
In [36]:
features_train = df.columns.drop(target)
features_train
Out[36]:
Index(['Product_Info_1', 'Product_Info_2', 'Product_Info_3', 'Product_Info_4',
       'Product_Info_5', 'Product_Info_6', 'Product_Info_7', 'Ins_Age', 'Ht',
       'Wt',
       ...
       'Medical_Keyword_43', 'Medical_Keyword_44', 'Medical_Keyword_45',
       'Medical_Keyword_46', 'Medical_Keyword_47', 'Medical_Keyword_48',
       'Product_Info_2_char', 'Product_Info_2_num', 'BMI_Age',
       'Med_Keywords_Count'],
      dtype='object', length=128)
In [37]:
target_cols = [f'Response_{i}' for i in range(1,9)]
target_cols
Out[37]:
['Response_1', 'Response_2', 'Response_3', 'Response_4', 'Response_5', 'Response_6', 'Response_7', 'Response_8']
In [38]:
fig, axes, summary_df = info_plots.target_plot(
    df=df_target_encoded,
    feature=feature,
    feature_name=feature, 
    target=['Response_8']
)

# we can see that as BMI increases, the average Response_8 decreases.
# high BMI ==> low Response_8 ==> low acceptance.
In [39]:
summary_df
Out[39]:
x display_column value_lower value_upper count Response_8
0 0 [0, 0.34) 0.000000 0.336002 6597 0.632257
1 1 [0.34, 0.38) 0.336002 0.376807 6436 0.571628
2 2 [0.38, 0.41) 0.376807 0.410593 6753 0.499630
3 3 [0.41, 0.44) 0.410593 0.438952 6592 0.426729
4 4 [0.44, 0.47) 0.438952 0.466858 6488 0.376695
5 5 [0.47, 0.5) 0.466858 0.501185 6704 0.299523
6 6 [0.5, 0.55) 0.501185 0.545946 6590 0.124279
7 7 [0.55, 0.62) 0.545946 0.619419 6581 0.023553
8 8 [0.62, 1] 0.619419 1.000000 6640 0.003916

check prediction distribution

info_plots.actual_plot(model, X, feature,
feature_name, num_grid_points=10,
grid_type='percentile', percentile_range=None,
grid_range=None, cust_grid_points=None,
show_percentile=False, show_outliers=False,
endpoint=True, which_classes=None,
predict_kwds=None, ncols=2, figsize=None, 
plot_params=None)

Parameters
----------

model: a fitted sklearn model
X: pandas DataFrame
    data set on which the model is trained
which_classes: list, optional, default=None
    which classes to plot, only use when it is a multi-class problem
In [40]:
fig, axes, summary_df = info_plots.actual_plot(
    model=clf_xgb, 
    X=df_Xtrain, 
    feature=feature, 
    feature_name=feature, 
    which_classes=[0],
    predict_kwds={},
)

partial dependence plot (pdp)

In [41]:
%%time 
pdp_bmi_xgboost = pdp.pdp_isolate(
    model=clf_xgb,
    dataset=df_target_encoded, 
    model_features=features_train, 
    feature=feature
)
CPU times: user 22.5 s, sys: 312 ms, total: 22.8 s
Wall time: 23.5 s
In [42]:
fig, axes = pdp.pdp_plot(
    pdp_isolate_out=pdp_bmi_xgboost, 
    feature_name=feature, 
    center=True, 
    x_quantile=True, 
    ncols=3, 
    plot_lines=True, 
    frac_to_plot=100
)
In [43]:
fig, axes = pdp.pdp_plot(
    pdp_bmi_xgboost, 
    feature, 
    center=True, 
    x_quantile=True, 
    ncols=1, 
    plot_lines=True, 
    frac_to_plot=100,
    which_classes=[7], 
    plot_pts_dist=True
)
In [44]:
"""
Important Observation:

- All the bmi bins have sufficient persons. Certain range of bmi is NOT missing.
- bmi curve is below the red dotted line ==> negative impact with prediction
- bmi increase upto 0.44 has almost no impact.
- bmi increase 0.44 - 0.55 ==> less policy accepted
- bmi increase 0.55+       ==> saturates and still less policy accepted.

""";

Interaction between two variables: bmi and Medical_History_4 with Target

In [45]:
df.iloc[:2,-10:]
Out[45]:
Medical_Keyword_44 Medical_Keyword_45 Medical_Keyword_46 Medical_Keyword_47 Medical_Keyword_48 Response Product_Info_2_char Product_Info_2_num BMI_Age Med_Keywords_Count
0 0 0 0 0 0 8 0 0 0.207304 0
1 0 0 0 0 0 4 1 1 0.016256 0
In [49]:
two_features = ['BMI', 'Medical_History_4']
In [58]:
fig, axes, summary_df = info_plots.target_plot_interact(
    df=df_target_encoded,
    features=two_features,
    feature_names=two_features, 
    target='Response_8'
)

prediction distribution through feature combination of 'BMI' and 'Medical_History_4'

In [59]:
fig, axes, summary_df = info_plots.actual_plot_interact(
    model=clf_xgb, 
    X=df_Xtrain, 
    features=two_features, 
    feature_names=two_features,
    ncols=1,
    which_classes=[7]
)

Model Evaluation: plots using SHAP

SHAP = SHapley Additive exPlanations

In [60]:
import shap
shap.__version__
Out[60]:
'0.35.0'

Get SHAP Values

In [61]:
explainer = shap.TreeExplainer(clf_xgb)
shap_values = explainer.shap_values(df_Xtest)
In [62]:
type(shap_values), type(explainer.expected_value), type(shap_values[0])
Out[62]:
(list, list, numpy.ndarray)
In [63]:
np.array(shap_values).shape
Out[63]:
(8, 11877, 128)
In [64]:
 np.array(explainer.expected_value).shape
Out[64]:
(8,)
In [65]:
df_Xtest.shape
Out[65]:
(11877, 128)

SHAP: Summary Plot

In [66]:
cmap = plt.get_cmap("tab10")
colors = cmap.colors # tuple of tuples
# colors = sns.color_palette('husl',8)
# colors = [(1, 0, 0), (0, 1, 0), (0, 0, 1)] # etc

# get class ordering from shap values
class_inds = np.argsort([-np.abs(shap_values[i]).mean() for i in range(len(shap_values))])

# create listed colormap
from matplotlib import colors as plt_colors
cmap = plt_colors.ListedColormap(np.array(colors)[class_inds])
In [67]:
shap.summary_plot(shap_values, df_Xtest,
        class_names = [f'Response_{i+1}' for i in range(8)],
        color=cmap,
        plot_type="bar")
In [68]:
targetNum = 7
title = "  "*20 + f"SHAP plot for Response {targetNum+1}"
print(title)
shap.summary_plot(shap_values[targetNum], df_Xtest,
        title = title, # the title argument does not work here.
        class_names = [f'Response_{targetNum+1}'],
        color=colors[targetNum], # NOTE: colors[0] not colors[1] for Response_1
        plot_type="bar",
        plot_size = (12,8)
        )

"""
Note: By default the colors in multiclass shap summary plot
are determined by mean of shap values.

They are NOT in the same order given in cmap.colors.

""";
                                        SHAP plot for Response 8
In [69]:
shap.summary_plot(shap_values[targetNum], df_Xtest)
In [70]:
"""
Important Observations:

Lets assume that response_8 is the policy grant and other response(1-7) are reject.

1. Features are sorted in descending order of its importance.
2. BMI has High (red in colour) and negative (less than 0) effect on the target.
   This means higher the BMI, higher the rejection.
3. Conversely, Med Hist 4 has High (red) and positive (greater than 0)
   effect on the target. 
   This means that the higher the value of Med Hist 4, 
   the chances are higher for policy getting accepted.

""";
In [71]:
# shap.force_plot(explainer.expected_value[0],
#                 shap_values[0],
#                 matplotlib=False,
#                 text_rotation=90)

# this is too slow; below we use only the first 100 rows.
In [72]:
shap_values[0].shape
Out[72]:
(11877, 128)
In [73]:
x = shap_values[0][:100,:]
x.shape
Out[73]:
(100, 128)
In [74]:
explainer.expected_value[7]
Out[74]:
1.5381206
In [75]:
# target = Response_8
# row = only first few rows
targetNum = 7 # 7 is Response_8
shap.initjs()
shap.force_plot(explainer.expected_value[targetNum],
                shap_values[targetNum][:100,:], # take only first N rows
                feature_names = features_train,
                matplotlib=False,
                text_rotation=90)
Out[75]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [76]:
# target = Response_8
# row = only one row (only one customer)
shap.initjs()
rowNum = 0
targetNum = 7 # 7 is Response_8

print(f'Average shap for the group: {explainer.expected_value[targetNum]}')

shap.force_plot(explainer.expected_value[targetNum],
                shap_values[targetNum][rowNum,:],
                feature_names = features_train,
                matplotlib=True,
                text_rotation=90 # only works when matplotlib=True
                )
Average shap for the group: 1.5381206274032593
In [77]:
"""
We can visualize feature attributions such as Shapley values as "forces". 
Each feature value is a force that either increases or decreases the prediction.
The prediction starts from the baseline. 
The baseline for Shapley values is the average of all predictions. 
In the plot, each Shapley value is an arrow that pushes to increase 
(positive value) or decrease (negative value) the prediction.
These forces balance each other out at the actual prediction of the data instance.


The baseline -- the average predicted probability -- is 1.53.
The person has prediction =  -1.71 < baseline
The person's application likely NOT to be accepted.

MH = medical history
MK = medical keyword

From summary_plot:
higher is better:  MH_4_23_33_40
lower is better :  MH_30_32 MK_13_3 Insured_info_2_7 bmi wt bmi_wt

Although medical_history_23 is high for this particular person, the features
such as wt bmi_age are high and they offset the goodness of high
medical_history_23 and overall result is bad.

The persons application is not likely to be accepted.
""";
In [78]:
df_ypreds.head(6)
Out[78]:
ytest ypreds is_Response8
0 1 6 0
1 7 8 0
2 8 8 1
3 6 6 0
4 7 7 0
5 8 6 1
In [79]:
# target = Response_8
# row = only one row (only one customer)
shap.initjs()
rowNum = 2
targetNum = 7 # 7 is Response_8

print(f'Average shap for the group: {explainer.expected_value[targetNum]}')

shap.force_plot(explainer.expected_value[targetNum],
                shap_values[targetNum][rowNum,:],
                feature_names = features_train,
                matplotlib=True,
                text_rotation=90 # only works when matplotlib=True
                )
Average shap for the group: 1.5381206274032593
In [80]:
"""

The baseline -- the average predicted probability -- is 1.53.
The person has prediction =  1.86
which is much higher than the baseline 1.53.

MH = medical history
MK = medical keyword

From summary_plot:
higher is better:  MH_4_23_33_40
lower is better :  MH_30_32 MK_13_3 Insured_info_2_7 bmi wt bmi_wt

Effect is GOOD. The person's application is likely to be accepted.

""";

SHAP: Dependence Plot

In [81]:
shap.dependence_plot(feature, shap_values[0], df_Xtest)
In [82]:
# features_train
In [83]:
shap.dependence_plot(feature, shap_values[0], df_Xtest,
                    interaction_index='Med_Keywords_Count')
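
The two plots above use shap_values[0], i.e. class index 0 (Response_1). Since the rest of the analysis focuses on Response_8, a matching dependence-plot sketch for that class (index 7) is shown below; this is an addition, not part of the original run:

# Hedged sketch: the same dependence plot, but for the Response_8 class.
shap.dependence_plot(feature, shap_values[7], df_Xtest,
                     interaction_index='Med_Keywords_Count')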

Time Taken

In [84]:
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 15 min 8 secs
In [ ]: