The dataset contains transactions made by credit cards in September 2013 by European cardholders.
This dataset presents transactions that occurred over two days, with 492 frauds out of 284,807 transactions.
The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions.
It contains only numerical input variables which are the result of a PCA transformation.
Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data.
Features V1, V2, ... V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'.
Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction amount; this feature can be used for example-dependent cost-sensitive learning.
Feature 'Class' is the response variable; it takes value 1 in case of fraud and 0 otherwise.
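Because the cost of a missed fraud scales with the transaction value, one simple way to use 'Amount' for cost-sensitive learning is to weight each training example by it. A minimal sketch, assuming the data has already been loaded into a DataFrame df (as done later in this notebook); the weighting scheme itself is an illustrative assumption, not part of the dataset description:
import numpy as np
# hypothetical example-dependent weights: a fraud costs its transaction amount,
# a legitimate transaction gets unit weight
sample_weight = np.where(df['Class'] == 1, df['Amount'].clip(lower=1.0), 1.0)
# most sklearn-style estimators accept this via fit(X, y, sample_weight=sample_weight)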
The term Boosting refers to a family of algorithms that convert weak learners into strong learners.
There are many boosting algorithms; the common regressor classes are listed below (a usage sketch follows the list):
sklearn.ensemble.GradientBoostingRegressor
xgboost.XGBRegressor # fast and accurate
lightgbm.LGBMRegressor # extremely fast, slightly lower accuracy than xgboost
catboost.CatBoostRegressor # well suited to categorical features
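All four libraries expose a scikit-learn-style fit/predict interface, so they are largely interchangeable behind a common loop. A minimal sketch on toy data (the toy data and the n_estimators/iterations values are illustrative assumptions):
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
X = np.random.rand(100, 5)
y = np.random.rand(100)
for Model in [GradientBoostingRegressor, XGBRegressor, LGBMRegressor]:
    reg = Model(n_estimators=50).fit(X, y)
    print(type(reg).__name__, reg.predict(X[:2]))
# CatBoost follows the same interface but logs training unless silenced
reg = CatBoostRegressor(iterations=50, verbose=False).fit(X, y)
print(type(reg).__name__, reg.predict(X[:2]))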
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    #!pip install hpsklearn
    !pip install shap eli5 lime scikit-plot watermark
    !pip install optuna hyperopt
    !pip install catboost
    !pip install ipywidgets
    !pip install -U scikit-learn
    !jupyter nbextension enable --py widgetsnbextension
    # create project-like folders
    !mkdir -p ../outputs ../images ../reports ../html ../models
    print('Environment: Google Colab')
import time
notebook_start_time = time.time()
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
# random state
SEED = 0
RNG = np.random.RandomState(SEED)
# visualization
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 8,8
plt.rcParams.update({'font.size': 16})
plt.style.use('ggplot')
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)
# serialization: six, pickle, joblib
import six
import pickle
import joblib
# mixed
import copy
import pprint
pp = pprint.PrettyPrinter(indent=4)
# sklearn
import sklearn
# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# scale and split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
# sklearn scalar metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
# roc auc and curves
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
# confusion matrix and classification report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# boosting
import xgboost, lightgbm, catboost
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBClassifier, DMatrix
from lightgbm import LGBMClassifier, Dataset
from catboost import CatBoostClassifier, Pool, CatBoost
# parameters tuning
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK, STATUS_FAIL
from hyperopt.pyll import scope
from hyperopt.pyll.stochastic import sample
# model intepretation modules
import eli5
import shap
import yellowbrick
import lime
import scikitplot
# version
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark
Bhishan Poudel 2021-08-08
CPython 3.7.7
IPython 7.22.0
compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit
joblib 1.0.1
sklearn 0.23.1
seaborn 0.11.0
shap 0.39.0
xgboost 1.2.0
catboost 0.23.2
eli5 0.10.1
autopep8 1.5.2
six 1.15.0
lightgbm 2.3.1
yellowbrick 1.1
json 2.0.9
scikitplot 0.3.7
pandas 1.3.0
numpy 1.19.5
# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan")
from bhishan import bp
def model_evaluation(model_name, desc, ser_ytrain, trprobs1d,
                     df_eval=None, threshold=0.5,
                     show=True, col_sort='Recall'):
    if df_eval is None:
        df_eval = pd.DataFrame({'Model': [],
                                'Description': [],
                                'Accuracy': [],
                                'Precision': [],
                                'Recall': [],
                                'F1': [],
                                'AUC': [],
                                'AUCPR': [],
                                })
    ytr = np.array(ser_ytrain).flatten()
    # hard predictions at the given threshold
    trpreds = (trprobs1d > threshold).astype(np.int8)
    prec, rec, thr = sklearn.metrics.precision_recall_curve(ytr, trprobs1d)
    auc_pr = sklearn.metrics.auc(rec, prec)
    # model evaluation
    average = 'binary'
    row_eval = [model_name, desc,
                sklearn.metrics.accuracy_score(ytr, trpreds),
                sklearn.metrics.precision_score(ytr, trpreds, average=average),
                sklearn.metrics.recall_score(ytr, trpreds, average=average),
                sklearn.metrics.f1_score(ytr, trpreds, average=average),
                sklearn.metrics.roc_auc_score(ytr, trprobs1d),  # AUC needs probabilities
                auc_pr
                ]
    df_eval.loc[len(df_eval)] = row_eval
    df_eval = df_eval.drop_duplicates()
    df_eval = df_eval.sort_values(col_sort, ascending=False)
    if show:
        display(df_eval.style.background_gradient(subset=[col_sort]))
    return df_eval
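A quick sanity check of the helper on toy labels and probabilities (hypothetical values, not the notebook's data):
y_toy = np.array([0, 0, 1, 1])
p_toy = np.array([0.10, 0.40, 0.35, 0.80])
# at the default threshold 0.5 this yields accuracy 0.75, precision 1.0, recall 0.5
model_evaluation('toy', 'sanity check', y_toy, p_toy, show=False)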
df_eval = None
ifile = 'https://github.com/bhishanpdl/Datasets/blob/master/Projects/Fraud_detection/raw/creditcard.csv.zip?raw=true'
df = pd.read_csv(ifile,compression='zip')
print(df.shape)
df.head()
(284807, 31)
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
target = 'Class'
features = df.columns.drop(target)
df[target].value_counts(normalize=True)*100
0    99.827251
1     0.172749
Name: Class, dtype: float64
sns.countplot(x=df[target])
from sklearn.model_selection import train_test_split
df_Xtrain_orig, df_Xtest, ser_ytrain_orig, ser_ytest = train_test_split(
df.drop(target,axis=1),
df[target],
test_size=0.2,
random_state=SEED,
stratify=df[target])
ytrain_orig = ser_ytrain_orig.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()
print(df_Xtrain_orig.shape)
df_Xtrain_orig.head()
(227845, 30)
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
36001 | 38355.0 | 1.043949 | 0.318555 | 1.045810 | 2.805989 | -0.561113 | -0.367956 | 0.032736 | -0.042333 | -0.322674 | ... | -0.084556 | -0.240105 | -0.680315 | 0.085328 | 0.684812 | 0.318620 | -0.204963 | 0.001662 | 0.037894 | 49.67 |
12844 | 22555.0 | -1.665159 | 0.808440 | 1.805627 | 1.903416 | -0.821627 | 0.934790 | -0.824802 | 0.975890 | 1.747469 | ... | -0.373759 | -0.335332 | -0.510994 | 0.035839 | 0.147565 | -0.529358 | -0.566950 | -0.595998 | -0.220086 | 16.94 |
2873 | 2431.0 | -0.324096 | 0.601836 | 0.865329 | -2.138000 | 0.294663 | -1.251553 | 1.072114 | -0.334896 | 1.071268 | ... | -0.039868 | 0.012220 | 0.352856 | -0.341505 | -0.145791 | 0.094194 | -0.804026 | 0.229428 | -0.021623 | 1.00 |
145263 | 86773.0 | -0.258270 | 1.217501 | -0.585348 | -0.875347 | 1.222481 | -0.311027 | 1.073860 | -0.161408 | 0.200665 | ... | 0.382305 | -0.424626 | -0.781158 | 0.019316 | 0.178614 | -0.315616 | 0.096665 | 0.269740 | -0.020635 | 10.78 |
186658 | 127202.0 | 2.142162 | -0.494988 | -1.936511 | -0.818288 | -0.025213 | -1.027245 | -0.151627 | -0.305750 | -0.869482 | ... | 0.106592 | 0.010115 | 0.021722 | 0.079463 | -0.480899 | 0.023846 | -0.279076 | -0.030121 | -0.043888 | 39.96 |
5 rows × 30 columns
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
df_Xtrain_orig,
ser_ytrain_orig,
test_size=0.2,
random_state=SEED,
stratify=ser_ytrain_orig)
ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()
print(df_Xtrain.shape)
(182276, 30)
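Since both splits are stratified, each subset should retain the overall fraud rate of about 0.172%. A quick illustrative check (printed values should be approximately equal):
for name, y in [('train', ytrain), ('valid', yvalid), ('test', ytest)]:
    print(name, round(y.mean() * 100, 4), '% fraud')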
https://catboost.ai/docs/concepts/python-reference_catboostregressor.html
class CatBoostRegressor(
iterations=None, learning_rate=None,
depth=None, l2_leaf_reg=None,
model_size_reg=None, rsm=None,
loss_function='RMSE', border_count=None,
feature_border_type=None, per_float_feature_quantization=None,
input_borders=None, output_borders=None,
fold_permutation_block=None, od_pval=None,
od_wait=None, od_type=None,
nan_mode=None, counter_calc_method=None,
leaf_estimation_iterations=None, leaf_estimation_method=None,
thread_count=None, random_seed=None,
use_best_model=None, best_model_min_trees=None,
verbose=None, silent=None,
logging_level=None, metric_period=None,
ctr_leaf_count_limit=None, store_all_simple_ctr=None,
max_ctr_complexity=None, has_time=None,
allow_const_label=None, one_hot_max_size=None,
random_strength=None,name=None, ignored_features=None,
train_dir=None, custom_metric=None,
eval_metric=None, bagging_temperature=None,
save_snapshot=None, snapshot_file=None,
snapshot_interval=None, fold_len_multiplier=None,
used_ram_limit=None, gpu_ram_part=None,
pinned_memory_size=None, allow_writing_files=None,
final_ctr_computation_mode=None, approx_on_full_history=None,
boosting_type=None, simple_ctr=None,
combinations_ctr=None, per_feature_ctr=None,
ctr_target_border_count=None, task_type=None,
device_config=None, devices=None,
bootstrap_type=None, subsample=None,
sampling_unit=None, dev_score_calc_obj_block_size=None,
max_depth=None, n_estimators=None,
num_boost_round=None, num_trees=None,
colsample_bylevel=None, random_state=None,
reg_lambda=None, objective=None,
eta=None, max_bin=None,
gpu_cat_features_storage=None, data_partition=None,
metadata=None, early_stopping_rounds=None,
cat_features=None, grow_policy=None,
min_data_in_leaf=None, min_child_samples=None,
max_leaves=None, num_leaves=None,
score_function=None, leaf_estimation_backtracking=None,
ctr_history_unit=None, monotone_constraints=None
)
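Almost all of these arguments default to None and are resolved internally; in practice only a few are commonly set. A minimal sketch with illustrative values (X_train, y_train, X_valid, y_valid are placeholders, not variables from this notebook):
from catboost import CatBoostRegressor
reg = CatBoostRegressor(iterations=500, learning_rate=0.05, depth=6,
                        loss_function='RMSE', random_seed=0, verbose=100)
# reg.fit(X_train, y_train, eval_set=(X_valid, y_valid))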
import catboost
bp.show_methods(catboost,2)
0 | 1 | |
---|---|---|
0 | CatBoost | Pool |
1 | CatBoostClassifier | core |
2 | CatBoostError | cv |
3 | CatBoostRegressor | sum_models |
4 | CatboostError | to_classifier |
5 | EFstrType | to_regressor |
6 | FeaturesData | train |
7 | MetricVisualizer | version |
8 | MultiRegressionCustomMetric | widget |
9 | MultiRegressionCustomObjective |
from catboost import CatBoostClassifier, Pool
bp.show_methods(CatBoostClassifier,2)
0 | 1 | |
---|---|---|
0 | best_iteration_ | get_test_evals |
1 | best_score_ | get_text_feature_indices |
2 | calc_feature_statistics | get_tree_leaf_counts |
3 | calc_leaf_indexes | grid_search |
4 | classes_ | is_fitted |
5 | compare | iterate_leaf_indexes |
6 | copy | learning_rate_ |
7 | create_metric_calcer | load_model |
8 | drop_unused_features | plot_partial_dependence |
9 | eval_metrics | plot_predictions |
10 | evals_result_ | plot_tree |
11 | feature_importances_ | predict |
12 | feature_names_ | predict_log_proba |
13 | fit | predict_proba |
14 | get_all_params | random_seed_ |
15 | get_best_iteration | randomized_search |
16 | get_best_score | save_borders |
17 | get_borders | save_model |
18 | get_cat_feature_indices | score |
19 | get_evals_result | set_feature_names |
20 | get_feature_importance | set_leaf_values |
21 | get_leaf_values | set_params |
22 | get_leaf_weights | set_scale_and_bias |
23 | get_metadata | shrink |
24 | get_object_importance | staged_predict |
25 | get_param | staged_predict_log_proba |
26 | get_params | staged_predict_proba |
27 | get_scale_and_bias | tree_count_ |
28 | get_test_eval |
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score
from sklearn.metrics import confusion_matrix
from sklearn import model_selection
# time
time_start = time.time()
# current parameters
desc = 'default,random_state=100, cross_validation_ypreds'
Xtr = df_Xtrain.to_numpy()
ytr = ser_ytrain.to_numpy().ravel()
Xtx = df_Xtest.to_numpy()
ytx = ser_ytest.to_numpy().ravel()
# fit the model
model = CatBoostClassifier(verbose=100,random_state=SEED)
model.fit(Xtr, ytr)
# save/load the model if desired
# joblib.dump(model, 'model_cat.pkl')
# model = joblib.load('model_cat.pkl')
# predictions
skf = model_selection.StratifiedKFold(n_splits=2,shuffle=True,random_state=SEED)
trprobs_cv = model_selection.cross_val_predict(model, df_Xtrain, ser_ytrain,
                                               cv=skf, method='predict_proba')
trprobs1d = trprobs_cv[:,1] # 2nd column: probability of the positive class
# model evaluation
df_eval = model_evaluation('catboost', desc, ytr,trprobs1d,df_eval=df_eval)
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
display(df_eval)
Learning rate set to 0.095119
0:    learn: 0.3780861    total: 124ms    remaining: 2m 4s
100:  learn: 0.0016198    total: 5.19s    remaining: 46.2s
200:  learn: 0.0010673    total: 10.3s    remaining: 40.9s
300:  learn: 0.0008012    total: 15.5s    remaining: 35.9s
400:  learn: 0.0005683    total: 20.5s    remaining: 30.6s
500:  learn: 0.0003989    total: 25.6s    remaining: 25.5s
600:  learn: 0.0002876    total: 30.8s    remaining: 20.4s
700:  learn: 0.0002030    total: 35.8s    remaining: 15.3s
800:  learn: 0.0001605    total: 40.8s    remaining: 10.1s
900:  learn: 0.0001330    total: 46.1s    remaining: 5.06s
999:  learn: 0.0001102    total: 51s      remaining: 0us
Learning rate set to 0.07075
0:    learn: 0.4576720    total: 63.9ms   remaining: 1m 3s
100:  learn: 0.0014048    total: 3.19s    remaining: 28.4s
200:  learn: 0.0007900    total: 6.25s    remaining: 24.9s
300:  learn: 0.0004575    total: 9.32s    remaining: 21.6s
400:  learn: 0.0002501    total: 12.4s    remaining: 18.5s
500:  learn: 0.0001718    total: 15.5s    remaining: 15.4s
600:  learn: 0.0001308    total: 18.5s    remaining: 12.3s
700:  learn: 0.0001045    total: 21.7s    remaining: 9.26s
800:  learn: 0.0000867    total: 24.8s    remaining: 6.15s
900:  learn: 0.0000768    total: 27.8s    remaining: 3.05s
999:  learn: 0.0000661    total: 30.8s    remaining: 0us
Learning rate set to 0.07075
0:    learn: 0.4631036    total: 33.1ms   remaining: 33s
100:  learn: 0.0018651    total: 3.08s    remaining: 27.4s
200:  learn: 0.0011770    total: 6.14s    remaining: 24.4s
300:  learn: 0.0008368    total: 9.23s    remaining: 21.4s
400:  learn: 0.0005437    total: 13s      remaining: 19.4s
500:  learn: 0.0003737    total: 16s      remaining: 16s
600:  learn: 0.0002749    total: 19.1s    remaining: 12.7s
700:  learn: 0.0002123    total: 22.5s    remaining: 9.6s
800:  learn: 0.0001767    total: 25.6s    remaining: 6.35s
900:  learn: 0.0001466    total: 28.6s    remaining: 3.14s
999:  learn: 0.0001254    total: 31.6s    remaining: 0us
Model | Description | Accuracy | Precision | Recall | F1 | AUC | AUCPR | |
---|---|---|---|---|---|---|---|---|
0 | catboost | default,random_state=100, cross_validation_ypreds | 0.999534 | 0.945736 | 0.774603 | 0.851658 | 0.965710 | 0.827831 |
Time taken: 1 min 55 secs
Model | Description | Accuracy | Precision | Recall | F1 | AUC | AUCPR | |
---|---|---|---|---|---|---|---|---|
0 | catboost | default,random_state=100, cross_validation_ypreds | 0.999534 | 0.945736 | 0.774603 | 0.851658 | 0.96571 | 0.827831 |
%%time
model = CatBoostClassifier(verbose=100,random_state=SEED)
model.fit(Xtr, ytr)
joblib.dump(model, '../models/model_cat_default_seed100.joblib')
ypreds = model.predict(Xtx)
cm = sklearn.metrics.confusion_matrix(ytx,ypreds)
print('confusion matrix\n',cm)
Learning rate set to 0.095119
0:    learn: 0.3780861    total: 62.7ms   remaining: 1m 2s
100:  learn: 0.0016198    total: 5.1s     remaining: 45.4s
200:  learn: 0.0010673    total: 10.1s    remaining: 40.2s
300:  learn: 0.0008012    total: 15.3s    remaining: 35.5s
400:  learn: 0.0005683    total: 21.8s    remaining: 32.6s
500:  learn: 0.0003989    total: 30.9s    remaining: 30.8s
600:  learn: 0.0002876    total: 35.9s    remaining: 23.8s
700:  learn: 0.0002030    total: 40.8s    remaining: 17.4s
800:  learn: 0.0001605    total: 45.7s    remaining: 11.4s
900:  learn: 0.0001330    total: 51s      remaining: 5.6s
999:  learn: 0.0001102    total: 58s      remaining: 0us
confusion matrix
[[56859     5]
 [   24    74]]
CPU times: user 1min 56s, sys: 16 s, total: 2min 12s
Wall time: 58.7 s
yprobs = model.predict_proba(Xtx)
print(yprobs[:5])
[[9.99998192e-01 1.80796544e-06]
 [9.99932577e-01 6.74226697e-05]
 [9.99998670e-01 1.32969613e-06]
 [9.99996711e-01 3.28938804e-06]
 [9.99994773e-01 5.22686104e-06]]
from scikitplot import metrics as skpmetrics
skpmetrics.plot_confusion_matrix(ytx, ypreds)
fig, ax = plt.subplots(figsize=(12,8))
skpmetrics.plot_roc(ytx,yprobs,ax=ax)
import eli5
# eli5.explain_weights_catboost(model) # same thing
eli5.show_weights(model)
Weight | Feature |
---|---|
0.0787 | 1 |
0.0637 | 8 |
0.0607 | 14 |
0.0535 | 4 |
0.0519 | 26 |
0.0456 | 29 |
0.0428 | 0 |
0.0391 | 12 |
0.0390 | 22 |
0.0384 | 17 |
0.0382 | 3 |
0.0336 | 13 |
0.0320 | 20 |
0.0284 | 15 |
0.0283 | 25 |
0.0283 | 18 |
0.0276 | 2 |
0.0271 | 10 |
0.0258 | 24 |
0.0245 | 5 |
… 10 more … |
df_Xtrain.head(2)
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
138257 | 82565.0 | 1.118591 | 0.562709 | 0.569628 | 2.987496 | -0.365594 | -0.531789 | -0.044144 | 0.011932 | -0.129131 | ... | -0.204184 | -0.128269 | -0.218875 | -0.048816 | 0.617265 | 0.551384 | 0.06022 | 0.016136 | 0.047100 | 7.6 |
60033 | 49125.0 | 1.170686 | 0.083759 | 0.466278 | 0.913911 | -0.093123 | 0.427588 | -0.372727 | 0.312777 | 0.129610 | ... | -0.226078 | -0.176121 | -0.584726 | 0.066051 | -0.746667 | 0.232641 | -0.54774 | 0.038060 | 0.010995 | 3.9 |
2 rows × 30 columns
# # time
# time_start = time.time()
# # current parameters
# Xtr = df_Xtrain
# ytr = ser_ytrain.to_numpy().ravel()
# Xtx = df_Xtest
# ytx = ser_ytest.to_numpy().ravel()
# Xvd = df_Xvalid
# yvd = ser_yvalid.to_numpy().ravel()
# # fit the model
# model = CatBoostClassifier(random_state=0,verbose=100)
# model.fit(Xtr, ytr,
# eval_set=(Xvd, yvd))
# # ypreds
# ypreds = model.predict(Xtx)
# # r-squared values
# auc = roc_auc_score(ytx, ypreds)
# # time
# time_taken = time.time() - time_start
# print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
# print('ROC AUC Score ', auc)
CatBoost tutorials: model analysis, feature statistics tutorial
df_Xtrain.head(2)
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | V13 | V14 | V15 | V16 | V17 | V18 | V19 | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
138257 | 82565.0 | 1.118591 | 0.562709 | 0.569628 | 2.987496 | -0.365594 | -0.531789 | -0.044144 | 0.011932 | -0.129131 | 0.084908 | -0.540334 | -0.405535 | -1.124493 | -1.118716 | -0.708344 | 0.403485 | 0.890145 | -0.025385 | -0.901995 | -0.204184 | -0.128269 | -0.218875 | -0.048816 | 0.617265 | 0.551384 | 0.06022 | 0.016136 | 0.047100 | 7.6 |
60033 | 49125.0 | 1.170686 | 0.083759 | 0.466278 | 0.913911 | -0.093123 | 0.427588 | -0.372727 | 0.312777 | 0.129610 | 0.188107 | 0.707980 | 0.025212 | -1.341491 | 0.695831 | 0.890826 | 0.588590 | -0.703943 | 0.247763 | -0.221215 | -0.226078 | -0.176121 | -0.584726 | 0.066051 | -0.746667 | 0.232641 | -0.54774 | 0.038060 | 0.010995 | 3.9 |
model = CatBoostClassifier(verbose=100,random_state=SEED)
model.fit(Xtr, ytr)
# float feature
feature_name = 'Amount'
dict_stats = model.calc_feature_statistics(df_Xtrain, ser_ytrain, feature_name, plot=True)
Learning rate set to 0.095119
0:    learn: 0.3780861    total: 55.2ms   remaining: 55.1s
100:  learn: 0.0016198    total: 5.37s    remaining: 47.8s
200:  learn: 0.0010673    total: 10.5s    remaining: 41.7s
300:  learn: 0.0008012    total: 15.7s    remaining: 36.4s
400:  learn: 0.0005683    total: 20.8s    remaining: 31s
500:  learn: 0.0003989    total: 25.8s    remaining: 25.7s
600:  learn: 0.0002876    total: 35.2s    remaining: 23.4s
700:  learn: 0.0002030    total: 40.5s    remaining: 17.3s
800:  learn: 0.0001605    total: 47.2s    remaining: 11.7s
900:  learn: 0.0001330    total: 59.5s    remaining: 6.54s
999:  learn: 0.0001102    total: 1m 6s    remaining: 0us
---------------------------------------------------------------------------
CatBoostError                             Traceback (most recent call last)
<ipython-input-26-58d6b98a903e> in <module>
      4 # float feature
      5 feature_name = 'Amount'
----> 6 dict_stats = model.calc_feature_statistics(df_Xtrain, ser_ytrain, feature_name, plot=True)

~/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/catboost/core.py in calc_feature_statistics(self, data, target, feature, prediction_type, cat_feature_values, plot, max_cat_features_on_plot, thread_count, plot_file)
   3004         if not isinstance(feature, int):
   3005             if self.feature_names_ is None or feature not in self.feature_names_:
-> 3006                 raise CatBoostError('No feature named "{}" in model'.format(feature))
   3007             feature_num = self.feature_names_.index(feature)
   3008         else:

CatBoostError: No feature named "Amount" in model
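The error occurs because the model above was fit on the NumPy array Xtr, so it carries no feature names and 'Amount' cannot be resolved. A minimal sketch of the likely fix (an assumption based on the traceback, not rerun here): refit on the DataFrame so column names are preserved.
model = CatBoostClassifier(verbose=False, random_state=SEED)
model.fit(df_Xtrain, ser_ytrain)  # DataFrame input preserves column names
dict_stats = model.calc_feature_statistics(df_Xtrain, ser_ytrain,
                                           'Amount', plot=True)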
# feature importance
df_imp = pd.DataFrame({'Feature': features,
'Importance': model.feature_importances_
})
df_imp.sort_values('Importance',ascending=False).style.background_gradient()
def plot_feature_imp_catboost(model_catboost, features):
    """Plot the feature importances as a horizontal bar plot."""
    df_imp = pd.DataFrame({'Feature': features,
                           'Importance': model_catboost.feature_importances_
                           })
    df_imp = df_imp.sort_values('Importance').set_index('Feature')
    ax = df_imp.plot.barh(figsize=(12, 8))
    plt.grid(True)
    plt.title('Feature Importance', fontsize=14)
    ax.get_legend().remove()
    # annotate each bar with its importance value
    for p in ax.patches:
        x = p.get_width()
        y = p.get_y()
        text = '{:.2f}'.format(p.get_width())
        ax.text(x, y, text, fontsize=15, color='indigo')
    plt.show()
plot_feature_imp_catboost(model, features)
df_fimp = model.get_feature_importance(prettified=True)
df_fimp.head()
plt.figure(figsize=(12,8))
ax = sns.barplot(x=df_fimp.columns[1], y=df_fimp.columns[0], data=df_fimp)
# annotate bars with importance values
for p in ax.patches:
    x = p.get_width()
    y = p.get_y()
    text = '{:.2f}'.format(p.get_width())
    ax.text(x, y, text, fontsize=15, color='indigo', va='top', ha='left')
from catboost import CatBoost, Pool
# help(CatBoost)
cat_features = [] # empty for now: all features here are numeric
dtrain = Pool(df_Xtrain, ser_ytrain, cat_features=cat_features)
dvalid = Pool(df_Xvalid, ser_yvalid, cat_features=cat_features)
dtest = Pool(df_Xtest, ser_ytest, cat_features=cat_features)
params_cat = {'iterations': 100,
              'random_seed': 0,
              'eval_metric': 'AUC',
              'loss_function': 'Logloss',
              'cat_features': [],
              'ignored_features': [],
              'early_stopping_rounds': 200,
              'verbose': 200,
              }
bst_cat = CatBoost(params=params_cat)
bst_cat.fit(dtrain,
eval_set=(df_Xvalid, ser_yvalid),
use_best_model=True,
plot=True);
print(bst_cat.eval_metrics(dtest, ['AUC'])['AUC'][-1])
cv(pool=None, params=None, dtrain=None, iterations=None,
num_boost_round=None, fold_count=None, nfold=None, inverted=False,
partition_random_seed=0, seed=None, shuffle=True, logging_level=None,
stratified=None, as_pandas=True, metric_period=None, verbose=None,
verbose_eval=None, plot=False, early_stopping_rounds=None,
save_snapshot=None, snapshot_file=None,
snapshot_interval=None, folds=None, type='Classical')
params = {'iterations': 100, 'verbose': False,
'random_seed': 0,
'loss_function':'Logloss',
'eval_metric':'AUC',
}
df_scores = catboost.cv(dtrain,
                        params,
                        fold_count=2,
                        verbose=100,
                        shuffle=True,
                        stratified=True,
                        plot=True) # the interactive plot does not work in google colab
print(df_scores.columns)
df_scores.head()
fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(x='iterations', y='train-Logloss-mean', data=df_scores, ax=ax, color='r')
sns.lineplot(x='iterations', y='test-Logloss-mean', data=df_scores, ax=ax,
             color='b', alpha=0.2, linewidth=5, linestyle='--')
plt.show()
We should generally optimize model complexity first and then tune convergence.
Model complexity is governed by parameters such as depth (max_depth); convergence is governed by the learning rate and the number of iterations. A minimal two-stage sketch is given below.
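The following sketch illustrates the idea; the grids are illustrative assumptions, not settings used elsewhere in this notebook:
# stage 1: fix a moderate learning rate, search complexity (depth)
for depth in [4, 6, 8, 10]:
    m = CatBoostClassifier(depth=depth, learning_rate=0.1, iterations=300,
                           verbose=False, random_state=SEED)
    m.fit(df_Xtrain, ser_ytrain, eval_set=(df_Xvalid, ser_yvalid))
    print(depth, m.get_best_score())
# stage 2: with the best depth fixed, tune convergence
# (learning_rate and iterations, e.g. with early stopping as below)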
model = joblib.load('../models/model_cat_default_seed100.joblib')
ypreds = model.predict(df_Xtest)
cm = confusion_matrix(ytest, ypreds)
print(cm)
%%time
params = dict(verbose=500,
random_state=0,
iterations=3_000,
eval_metric='AUC',
cat_features = [],
early_stopping_rounds=200,
)
model = catboost.CatBoostClassifier(**params)
model.fit(df_Xtrain, ytrain,
eval_set=(df_Xvalid, yvalid),
use_best_model=True,
plot=False
);
# now use the best iteration
best_iter = model.get_best_iteration()
model = CatBoostClassifier(verbose=False,random_state=0,iterations=best_iter)
model.fit(df_Xtrain, ser_ytrain)
joblib.dump(model, '../models/model_cat_earlystopping.joblib')
ypreds = model.predict(df_Xtest)
cm = confusion_matrix(ytest, ypreds)
print(cm)
desc = f'early stopping, iterations={best_iter}'
df_eval = model_evaluation('catboost', desc, ytx,ypreds,df_eval=df_eval)
# using best iterations is worse, use previous 1000.
# for n in [6]: # default depth = 6
# model = CatBoostClassifier(verbose=False,random_state=0,
# iterations=1_000,
# depth=n,
# )
# model.fit(Xtr, ytr)
# ypreds = model.predict(Xtx)
# cm = confusion_matrix(ytest, ypreds)
# error = cm[0,1] + cm[1,0]
# print(f'Confusion matrix error count = {error} for n = {n}')
# for n in [0]:
# model = CatBoostClassifier(verbose=False,random_state=n,
# depth=6,
# iterations=1_000,
# )
# model.fit(Xtr, ytr)
# ypreds = model.predict(Xtx)
# cm = confusion_matrix(ytest, ypreds)
# error = cm[0,1] + cm[1,0]
# print(f'Confusion matrix error count = {error} for n = {n}')
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING) # use INFO to see progress
def objective(trial):
    params_cat_optuna = {
        'objective': trial.suggest_categorical('objective', ['Logloss', 'CrossEntropy']),
        'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 1, 12),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type',
                                                    ['Bayesian', 'Bernoulli', 'MVS']),
        'used_ram_limit': '3gb'
    }
    # conditional parameters depend on the sampled bootstrap type
    if params_cat_optuna['bootstrap_type'] == 'Bayesian':
        params_cat_optuna['bagging_temperature'] = trial.suggest_uniform('bagging_temperature', 0, 10)
    elif params_cat_optuna['bootstrap_type'] == 'Bernoulli':
        params_cat_optuna['subsample'] = trial.suggest_uniform('subsample', 0.1, 1)
    # fit the model
    model = CatBoostClassifier(random_state=SEED, **params_cat_optuna)
    model.fit(df_Xtrain, ser_ytrain,
              eval_set=[(df_Xvalid, ser_yvalid)],
              verbose=0,
              early_stopping_rounds=100)
    ypreds = model.predict(df_Xvalid)
    ypreds = np.rint(ypreds)
    score = roc_auc_score(ser_yvalid.to_numpy().ravel(), ypreds)
    return score
# NOTE: there is inherent non-determinism in optuna hyperparameter selection
# we may not get the same hyperparameters when run twice.
sampler = optuna.samplers.TPESampler(seed=SEED)
N_TRIALS = 1 # make it large
study = optuna.create_study(direction='maximize',
sampler=sampler,
study_name='cat_optuna',
storage='sqlite:///cat_optuna_fraud_detection.db',
load_if_exists=True)
study.optimize(objective, n_trials=N_TRIALS,timeout=600)
# Resume from last time
sampler = optuna.samplers.TPESampler(seed=SEED)
N_TRIALS = 1 # make it large
study = optuna.create_study(direction='maximize',
sampler=sampler,
study_name='cat_optuna',
storage='sqlite:///cat_optuna_fraud_detection.db',
load_if_exists=True)
# study.optimize(objective, n_trials=N_TRIALS)
print(f'Number of finished trials: {len(study.trials)}')
# best trial
best_trial = study.best_trial
# best params
params_best = study.best_trial.params
params_best
%%time
model_name = 'catboost'
desc = 'optuna hyperparameter search'
Xtr = df_Xtrain_orig
ytr = ser_ytrain_orig.to_numpy().ravel()
Xtx = df_Xtest
ytx = ser_ytest.to_numpy().ravel()
Xvd = df_Xvalid
yvd = ser_yvalid.to_numpy().ravel()
# use best model
params_best = study.best_trial.params
clf = CatBoostClassifier(random_state=SEED,verbose=False)
clf.set_params(**params_best)
# fit and save the model
clf.fit(Xtr, ytr)
joblib.dump(clf,'../models/clf_cat_grid_search_optuna.pkl')
# load the saved model
clf = joblib.load('../models/clf_cat_grid_search_optuna.pkl')
# predictions
ypreds = clf.predict(Xtx)
# model evaluation
cm = confusion_matrix(ytx, ypreds)
print(cm)
desc = 'optuna hyperparameter search'
df_eval = model_evaluation('catboost', desc, ytx,ypreds,df_eval=df_eval)
%%time
model = CatBoostClassifier(verbose=False,random_state=100,
depth=6,
iterations=1_000,
)
model.fit(Xtr, ytr)
joblib.dump(model, '../models/model_cat_best.joblib')
ypreds = model.predict(Xtx)
cm = confusion_matrix(ytest, ypreds)
print(cm)
df_eval = model_evaluation('catboost', 'seed=100,depth=6,iter=1k', ytest, ypreds,df_eval=df_eval)
pd.concat([df_Xtrain.head(2), df_Xtest.head(2)])
import eli5
eli5.show_weights(model)
from eli5.sklearn import PermutationImportance
feature_names = df_Xtrain.columns.tolist()
perm = PermutationImportance(model).fit(df_Xtest, ytx)
eli5.show_weights(perm, feature_names=feature_names)
import lime
import lime.lime_tabular
idx = 0
example = df_Xtest.iloc[idx]
answer = ser_ytest.iloc[idx]
feature_names = df_Xtest.columns.tolist()
prediction = model.predict(example.to_numpy().reshape(1, -1))
print(f'answer = {answer}')
print('prediction = ', prediction[0])
print()
print(example)
print(feature_names)
import lime
import lime.lime_tabular
categorical_features = []
categorical_features_idx = [df_Xtrain.columns.get_loc(col) for col in categorical_features]
explainer = lime.lime_tabular.LimeTabularExplainer(df_Xtrain.to_numpy(),
feature_names=feature_names,
class_names=['Not-fraud','Fraud'],
categorical_features=categorical_features_idx,
mode='classification')
exp = explainer.explain_instance(example.to_numpy(), model.predict_proba, num_features=8)
exp.show_in_notebook(show_table=True)
exp.as_pyplot_figure(); # semicolon suppresses the duplicate text output
import shap
shap.initjs()
# model = CatBoostClassifier(verbose=100,random_state=100)
# model.fit(df_Xtrain, ytrain)
model = joblib.load('../models/model_cat_best.joblib')
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(df_Xtest)
df_Xtest.head(1)
df_Xtest.head(1)['V15 V18 V3 V24 V1 V8 V4 V14 V2 V6 V9 V20'.split()].round(4)
# Look only first row of test data
# use matplotlib=True to avoid Javascript
idx = 0
shap.force_plot(explainer.expected_value,
shap_values[idx,:],
df_Xtest.iloc[idx,:],
matplotlib=False,
text_rotation=90)
# for this row, the model's raw output (log-odds) is -9.33
# red features push the prediction higher
# blue features push it lower
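The force plot works in the model's raw log-odds space, not probabilities. To convert a raw value such as -9.33 into a probability, apply the sigmoid; a minimal sketch:
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))
print(sigmoid(-9.33))  # ~8.9e-05: essentially zero fraud probability for this row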
NUM = 100
shap.force_plot(explainer.expected_value, shap_values[:NUM,:],
df_Xtest.iloc[:NUM,:],matplotlib=False)
shap.summary_plot(shap_values, df_Xtest)
shap.summary_plot(shap_values, df_Xtest, plot_type='bar')
shap.dependence_plot("Amount", shap_values, df_Xtest)
shap.dependence_plot(ind='Time', interaction_index='Amount',
shap_values=shap_values,
features=df_Xtest,
display_features=df_Xtest)
notebook_end_time = time.time()
time_taken = notebook_end_time - notebook_start_time
h, m = divmod(time_taken, 60*60)
print('Time taken to run whole notebook: {:.0f} hr {:.0f} min {:.0f} secs'.format(h, *divmod(m, 60)))