import time
time_start_notebook = time.time()


# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bp")
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bp/bhishan")
from bhishan import bp


%load_ext autoreload
%autoreload 2


import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (12,8)})
plt.style.use('ggplot') # better than sns styles.
matplotlib.rcParams['figure.figsize'] = 12,8

import os
import time

# random state
SEED = 0
RNG = np.random.RandomState(SEED)

# Jupyter notebook settings for pandas
#pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
pd.options.display.float_format = '{:,}'.format # df.A.value_counts().astype(float)
from pandas.api.types import CategoricalDtype
np.set_printoptions(precision=3)

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 200)

import IPython
from IPython.display import display, HTML, Image, Markdown

import scipy
from scipy import stats
from scikitplot import metrics as skmetrics

# scale and split
import sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# dimension reduction for visualization
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# hyperparameters search
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, matthews_corrcoef

# pipelines
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

# prediction
from sklearn.model_selection import cross_val_predict

# model selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# roc auc etc scores
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score

# roc auc curves
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve

# confusion matrix
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# pickle
import six
import joblib

# lightgbm
import lightgbm as lgb
from lightgbm import LGBMClassifier

# optuna
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING) # use INFO to see progress

# model evaluation
import shap
import lime
import eli5
import yellowbrick
import scikitplot
import lime.lime_tabular
from eli5.sklearn import PermutationImportance

# plotly
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff


# versions
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv

The sklearn.metrics.scorer module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.metrics. Anything that cannot be imported from sklearn.metrics is now part of the private API.
The sklearn.feature_selection.base module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.feature_selection. Anything that cannot be imported from sklearn.feature_selection is now part of the private API.
The sklearn.metrics.classification module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.metrics. Anything that cannot be imported from sklearn.metrics is now part of the private API.

Bhishan Poudel 2021-08-08 

CPython 3.7.7
IPython 7.22.0

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

pandas      1.3.0
json        2.0.9
numpy       1.19.5
lightgbm    2.3.1
eli5        0.10.1
IPython     7.22.0
matplotlib  3.2.1
shap        0.39.0
yellowbrick 1.1
joblib      1.0.1
scikitplot  0.3.7
autopep8    1.5.2
optuna      2.7.0
scipy       1.6.2
sklearn     0.23.1
six         1.15.0
seaborn     0.11.0


# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan/")
from bhishan import bp


def print_scores(ytest,ypreds):
    """Print precision, recall and F1, then the report and confusion matrix.

    Parameters
    ----------
    ytest : array-like
        True binary labels (0 = not fraud, 1 = fraud).
    ypreds : array-like
        Predicted binary labels, same length as ``ytest``.

    Returns
    -------
    None
        Everything is printed or displayed; nothing is returned.
    """
    print(f'Precision: {precision_score(ytest,ypreds): .2f}')
    print(f'Recall   : {recall_score(ytest,ypreds): .2f}')
    print(f'F1-score : {f1_score(ytest,ypreds): .2f}')

    print(classification_report(ytest, ypreds))

    # Pin the label order so the matrix is always 2x2, even when only one
    # class occurs in ytest/ypreds; otherwise the labelled DataFrame below
    # cannot be built from a 1x1 matrix.
    cm = confusion_matrix(ytest,ypreds,labels=[0,1])

    # Same class wording as the LIME explainer later in this notebook.
    names = ['Not-fraud','Fraud']
    df_cm = pd.DataFrame(cm,index=names,columns=names)
    display(df_cm.style.background_gradient())


# Empty results table: one row per evaluated model is appended by get_row_eval.
# The column order here is the order of the values written into each row.
df_eval = pd.DataFrame({
    column: []
    for column in (
        'Model',
        'Description',
        'Accuracy',
        'Precision',
        'Recall',
        'F0.5',
        'F1',
        'F2',
        'AUC',
        'AUCPR',
        'Time Taken',
        'Time Taken Sec',
    )
})


def get_row_eval(model,model_name,desc,df_eval,
                 df_Xtrain,ser_ytrain,
                 df_Xtest,ser_ytest,
                 kw_fit=None,
                 sort='F2',
                 threshold=0.5,
                 average='binary',
                 show=True
                ):
    """Fit ``model``, score it by 2-fold CV on the training data and log a row.

    The model is fitted on the full training data, then out-of-fold
    probabilities from a seeded, shuffled 2-fold stratified split are used to
    compute the metrics stored in ``df_eval``. Predictions on the test data are
    produced with the fully fitted model and returned; they are also printed
    when ``show`` is true.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict_proba``. With ``show=True``
        it must also be accepted by ``lgb.plot_importance``.
    model_name, desc : str
        Values stored in the 'Model' and 'Description' columns.
    df_eval : pandas.DataFrame
        Results table. A row is appended to it IN PLACE; the returned frame is
        a de-duplicated, sorted copy.
    df_Xtrain, ser_ytrain
        Training features and labels.
    df_Xtest, ser_ytest
        Test features and labels.
    kw_fit : dict, optional
        Extra keyword arguments for ``model.fit``. They are used for the full
        fit only and are not forwarded to the cross-validated fits.
    sort : str
        Column of ``df_eval`` to sort by, descending.
    threshold : float
        Probabilities strictly greater than this become class 1.
    average : str
        ``average`` argument passed to the precision/recall/F-beta scorers.
    show : bool
        If true, display ``df_eval``, print the test confusion matrix and
        classification report, and plot feature importance.

    Returns
    -------
    tuple
        ``(df_eval, ypreds, yprobs1d)``: the updated results frame, the int8
        test predictions and the test probabilities taken from the second
        ``predict_proba`` column.
    """
    from sklearn import metrics as skmetrics
    from sklearn import model_selection

    # A None sentinel avoids sharing one mutable default dict across calls.
    kw_fit = {} if kw_fit is None else kw_fit

    time_start = time.time()
    model.fit(df_Xtrain, ser_ytrain,**kw_fit)

    # cross-validation on train data
    skf = model_selection.StratifiedKFold(n_splits=2,shuffle=True,random_state=SEED)
    trprobs_cv = model_selection.cross_val_predict(model, df_Xtrain, ser_ytrain,
                                   cv=skf,method='predict_proba')
    trprobs1d = trprobs_cv[:,1] # take 2nd column for probability
    trpreds = (trprobs1d>threshold).astype(np.int8)

    ytr = np.array(ser_ytrain).flatten()

    # The timing covers the full fit plus the cross-validated predictions.
    time_taken_sec = time.time() - time_start
    m,s = divmod(time_taken_sec,60)
    # divmod on a float returns float minutes; print them without the ".0".
    time_taken = f"{s:.2f} sec" if not m else f"{m:.0f} min {s:.2f} sec"

    # Area under the precision-recall curve of the out-of-fold probabilities.
    prec,rec,_ = skmetrics.precision_recall_curve(ytr,trprobs1d)
    auc_pr = skmetrics.auc(rec,prec)

    # Values are listed in the column order of df_eval.
    row_eval = [model_name,desc,
                skmetrics.accuracy_score(ytr, trpreds),
                skmetrics.precision_score(ytr, trpreds, average=average,zero_division=0),
                skmetrics.recall_score(ytr, trpreds, average=average,zero_division=0),
                skmetrics.fbeta_score(ytr, trpreds, average=average,beta=0.5,zero_division=0),
                skmetrics.f1_score(ytr, trpreds, average=average,zero_division=0),
                skmetrics.fbeta_score(ytr, trpreds, average=average,beta=2,zero_division=0),
                skmetrics.roc_auc_score(ytr, trprobs1d), # for auc, we need probs
                auc_pr,
                time_taken,
                time_taken_sec
               ]

    # Append the row, then keep the first occurrence of each
    # (Model, Description) pair: re-running an already logged pair leaves the
    # earlier row in place and discards the new one.
    df_eval.loc[len(df_eval)] = row_eval
    df_eval = df_eval.drop_duplicates(subset=['Model','Description'])
    df_eval = df_eval.sort_values(sort,ascending=False)

    # predict on test data
    yprobs = model.predict_proba(df_Xtest)
    yprobs1d = yprobs[:,1] # take 2nd column element
    ypreds = (yprobs1d>threshold).astype(np.int8)
    ytx = np.array(ser_ytest).flatten()

    if show:
        # df_eval
        display(df_eval)

        # confusion matrix
        print(skmetrics.confusion_matrix(ytx, ypreds))
        print(skmetrics.classification_report(ytx,ypreds))

        # feature importance
        fig,ax = plt.subplots(figsize=(12,8))
        lgb.plot_importance(model,ax=ax)
        plt.show()

    return df_eval,ypreds,yprobs1d


df = pd.read_csv('../data/raw/creditcard.csv.zip',compression='zip')
print(df.shape)
df.head()

(284807, 31)


from sklearn.model_selection import train_test_split

# Name of the label column in `df`.
target = 'Class'
# First split: hold out 20% of the rows as the test set, stratified on the
# label column. The remaining 80% is kept as the "*_orig" training portion.
df_Xtrain_orig, df_Xtest, ser_ytrain_orig, ser_ytest = train_test_split(
    df.drop(target,axis=1), 
    df[target],
    test_size=0.2, 
    random_state=SEED, 
    stratify=df[target])


# Second split: carve 20% of the training portion off as a validation set,
# again stratified. Overall this gives 64% train / 16% validation / 20% test.
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
    df_Xtrain_orig, 
    ser_ytrain_orig,
    test_size=0.2, 
    random_state=SEED, 
    stratify=ser_ytrain_orig)

print(df_Xtrain.shape)
df_Xtrain.head()

(182276, 30)


# Short aliases used by the evaluation cells: the full training portion
# (train + validation rows) and the held-out test set, with the labels
# flattened to 1-D NumPy arrays.
Xtr = df_Xtrain_orig
ytr = ser_ytrain_orig.to_numpy().ravel()
Xtx = df_Xtest
ytx = ser_ytest.to_numpy().ravel()


# model
model = lgb.LGBMClassifier(random_state=SEED)

model_name = 'lightgbm'
desc = 'default'
df_eval,ypreds,yprobs = get_row_eval(model,model_name,desc,df_eval,Xtr,ytr,Xtx,ytx)

[[56758   106]
 [   36    62]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.37      0.63      0.47        98

    accuracy                           1.00     56962
   macro avg       0.68      0.82      0.73     56962
weighted avg       1.00      1.00      1.00     56962


print_scores(ytx,ypreds)

Precision:  0.37
Recall   :  0.63
F1-score :  0.47
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.37      0.63      0.47        98

    accuracy                           1.00     56962
   macro avg       0.68      0.82      0.73     56962
weighted avg       1.00      1.00      1.00     56962


import lightgbm as lgb
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING) # use INFO to see progress

dtrain = lgb.Dataset(df_Xtrain, label= ser_ytrain)

def objective(trial):
    """Optuna objective: train LightGBM with sampled params, return validation AUC.

    A booster is trained on the module-level ``dtrain`` dataset with the
    hyperparameters sampled from ``trial`` and scored on the validation split
    (``df_Xvalid`` / ``ser_yvalid``).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC of the predictions on the validation set (to be maximized).
    """
    params_lgb_optuna = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': 0,
        'boosting_type': 'gbdt',
        # lambda
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        # leaves
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        # fraction
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        # NOTE(review): LightGBM documents `subsample` as an alias of
        # `bagging_fraction`; sampling both independently presumably leaves
        # one of the two values ignored -- confirm and drop one of them.
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        # child
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-5, 1e4, log=True),
    }

    booster_gbm = lgb.train(params_lgb_optuna, dtrain)

    # ROC AUC is computed from the raw predicted scores, so no 0/1
    # thresholding of the predictions is needed here.
    vdprobs1d = booster_gbm.predict(df_Xvalid)
    return roc_auc_score(ser_yvalid.to_numpy().ravel(),vdprobs1d)


booster_gbm = lgb.train({'objective': 'binary','metric': 'binary_logloss'}, dtrain)


bp.show_methods(booster_gbm)


vdprobs1d = booster_gbm.predict(df_Xvalid)
threshold = 0.5
ypreds = (vdprobs1d > threshold).astype(np.int8)
sum(ypreds)

185


# NOTE: there is inherent non-determinism in optuna hyperparameter selection
#       we may not get the same hyperparameters when run twice.

# Seeded TPE sampler; the same sampler object is reused by the resume cell below.
sampler = optuna.samplers.TPESampler(seed=SEED)
N_TRIALS = 10 # make it large

# The study is stored in a local SQLite file. With load_if_exists=True an
# existing study named 'lgb_optuna' in that file is loaded and continued,
# so trials accumulate across runs of this notebook.
study = optuna.create_study(direction='maximize',
                            sampler=sampler,
                            study_name='lgb_optuna',
                            storage='sqlite:///lgb_optuna_fraud_detection.db',
                            load_if_exists=True)

# Run N_TRIALS more trials of `objective`.
study.optimize(objective, n_trials=N_TRIALS)


# Resume from last study
N_TRIALS = 10 # make it large

# Same study name and storage as above, so this loads the stored study and
# adds another N_TRIALS trials to it.
study = optuna.create_study(direction='maximize',
                            sampler=sampler,
                            study_name='lgb_optuna',
                            storage='sqlite:///lgb_optuna_fraud_detection.db',
                            load_if_exists=True)

study.optimize(objective, n_trials=N_TRIALS)


print(f'Number of finished trials: {len(study.trials)}')

# best trial
best_trial = study.best_trial

# best params
params_best = study.best_trial.params
params_best

Number of finished trials: 40

{'bagging_fraction': 0.6266899190322015,
 'bagging_freq': 3,
 'feature_fraction': 0.5548995142094996,
 'lambda_l1': 1.8249263636220374e-08,
 'lambda_l2': 0.0005661533776262685,
 'min_child_samples': 31,
 'min_child_weight': 5.529010386703527,
 'num_leaves': 147,
 'subsample': 0.7093293548765993}


# time
time_start = time.time()

# Labels under which this run is recorded in df_eval.
model_name = 'lightgbm'
desc = 'grid search optuna'

# use best model
params_best =  study.best_trial.params

# Seeded default classifier with the best Optuna parameters applied on top.
model = lgb.LGBMClassifier(random_state=SEED)
model.set_params(**params_best)

# Fit on the train+validation rows (Xtr, ytr) and report on the test set.
df_eval,ypreds,yprobs = get_row_eval(model,model_name,desc,df_eval,Xtr,ytr,Xtx,ytx)

[[56857     7]
 [   24    74]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.91      0.76      0.83        98

    accuracy                           1.00     56962
   macro avg       0.96      0.88      0.91     56962
weighted avg       1.00      1.00      1.00     56962


%%time

# Full feature matrix and label vector: every row of `df`, as NumPy arrays.
X = df.drop('Class',axis=1).to_numpy()
y = df['Class'].to_numpy()

# Seeded default classifier with the best Optuna parameters applied on top.
model = lgb.LGBMClassifier(random_state=SEED)
model.set_params(**params_best)

# Fit on the train+validation rows; this fitted `model` is the one the
# interpretation cells further down work with.
model.fit(Xtr,ytr)

# 5-fold cross-validated F1 over the whole dataset (X, y include the test rows).
# NOTE(review): cross_val_score presumably fits a fresh clone per fold and
# leaves `model` as fitted above -- confirm.
scores = cross_val_score(model,
                         X,y,
                         scoring ='f1',
                         cv=5,
                         n_jobs=-1,
                         verbose=2)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.

CPU times: user 7.38 s, sys: 452 ms, total: 7.83 s
Wall time: 15.8 s

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.0s finished


# Two-column plotly table with the mean and standard deviation of the
# cross-validated F1 scores in `scores`, each rounded to 6 decimals.
trace = go.Table(
    # The bold tags must be closed with </b>; the original '<b>...<b>' left
    # the tag open.
    header=dict(values=['<b>F1 score mean</b>', '<b>F1 score std</b>'],
                line = dict(color='#7D7F80'),
                fill = dict(color='#a1c3d1'),
                align = ['center'],
                font = dict(size = 15)),
    cells=dict(values=[np.round(scores.mean(),6),
                       np.round(scores.std(),6)],
               line = dict(color='#7D7F80'),
               fill = dict(color='#EDFAFF'),
               align = ['center'], font = dict(size = 15)))

layout = dict(width=800, height=500,
              title = 'Cross validation - 5 folds [F1 score]',
              font = dict(size = 15))
fig = dict(data=[trace], layout=layout)
py.iplot(fig, filename = '../reports/figures/lightgbm_cross_validation.html')


df.head(2)


import eli5

eli5.show_weights(model)


from eli5.sklearn import PermutationImportance

feature_names = df_Xtrain.columns.tolist()
perm = PermutationImportance(model).fit(df_Xtest, ytx)
eli5.show_weights(perm, feature_names=feature_names)


df.head(2)


# Take one row of the test set, predict its class with the fitted model and
# print it next to the true label.
idx = 0
feature_names = df_Xtest.columns.tolist()
example = df_Xtest.iloc[idx]
answer = ser_ytest.iloc[idx]

# The model expects a 2-D (n_samples, n_features) input, so the single row
# is presented as a 1 x n_features array.
prediction = model.predict(example.to_numpy().reshape(1, -1))

print(f'answer     = {answer}')
print('prediction = ', prediction[0])
print()
print(example)
print(feature_names)

answer     = 0
prediction =  0

Time               113,050.0
V1         0.114697289499555
V2         0.796303316904746
V3        -0.149552970484065
V4        -0.823010911929308
V5         0.878762929901124
V6        -0.553151668076057
V7         0.939258608931089
V8        -0.108502082160203
V9          0.11113713552423
V10       -0.390521089642943
V11        -1.94954563276141
V12       -0.494436160725985
V13       -0.353696342510452
V14        0.158729133570328
V15       -0.267238817560267
V16        0.234802455099954
V17        -0.75493606804519
V18       -0.343012138433526
V19        0.312174599857807
V20      -0.0427111647779897
V21       -0.335776322866296
V22       -0.807853023303226
V23      -0.0559403173868963
V24        -1.02528148104425
V25        -0.36955723229602
V26        0.204653108226638
V27        0.242724346894465
V28       0.0857125564092851
Amount                  0.89
Name: 159949, dtype: float64
['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']

Usage of np.ndarray subset (sliced data) is not recommended due to it will double the peak memory cost in LightGBM.


import lime
import lime.lime_tabular

# categorical_features = []
# categorical_features_idx = [df_Xtrain.columns.get_loc(col) 
#                               for col in categorical_features]

# Ask LIME for as many features as there are columns.
NUM_FEATURES = len(feature_names)
# Tabular explainer built on the training split (df_Xtrain) as NumPy data;
# class_names gives the display names for class 0 and class 1.
explainer = lime.lime_tabular.LimeTabularExplainer(df_Xtrain.to_numpy(), 
               feature_names=feature_names, 
               class_names=['Not-fraud','Fraud'], 
               mode='classification')

# Explain the single test row `example` selected in an earlier cell, using
# the fitted model's predict_proba as the prediction function.
exp = explainer.explain_instance(example, model.predict_proba,
                                 num_features=NUM_FEATURES)
exp.show_in_notebook(show_table=True)


ax = exp.as_pyplot_figure(); # use semicolon
ax.set_figheight(12);


# show_method_attributes(ax,start='set')


import shap

shap.initjs()


# NOTE(review): `show_method_attributes` is not defined or imported in the
# visible part of this notebook; an earlier cell uses `bp.show_methods(...)`
# for the same kind of inspection -- confirm where this name comes from.
show_method_attributes(shap)

Object Type: <class 'module'>


%%time

# Tree explainer for the fitted LightGBM classifier. Note that this rebinds
# `explainer`, which held the LIME explainer in an earlier cell.
explainer = shap.TreeExplainer(model)
# SHAP values for every test row, stacked into one NumPy array.
# NOTE(review): the first axis presumably indexes the class (later cells use
# shap_values[1] together with expected_value[1]) -- confirm.
shap_values = np.array(explainer.shap_values(df_Xtest))

CPU times: user 14.2 s, sys: 91.7 ms, total: 14.3 s
Wall time: 4.68 s

LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


# shap.force_plot?


df_Xtest.shape, explainer.expected_value, type(explainer.expected_value), len(explainer.expected_value)

((56962, 30), [9.904950042902943, -9.904950042902943], list, 2)


# Force plot for a single test row. Index 1 selects the second entry of
# expected_value and of shap_values.
# NOTE(review): presumably index 1 is the positive (fraud) class -- confirm.
idx = 5
shap.force_plot(explainer.expected_value[1],
                shap_values[1][idx,:],
                df_Xtest.iloc[idx,:] # this is just for giving feature names
               )


# many points
# Same plot for the first NUM test rows at once.
NUM = 1000
shap.force_plot(explainer.expected_value[1],
                shap_values[1][:NUM,:],
                df_Xtest.iloc[:NUM,:] # this is just for giving feature names
               )


# Report the wall-clock time since the first cell of the notebook ran,
# broken down into hours, minutes and seconds.
time_taken = time.time() - time_start_notebook
hours, remainder = divmod(time_taken, 60*60)
minutes, seconds = divmod(remainder, 60)
message = ('Time taken to run whole notebook: {:.0f} hr '
           '{:.0f} min {:.0f} secs')
print(message.format(hours, minutes, seconds))

Time taken to run whole notebook: 0 hr 2 min 30 secs

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	V11	V12	V13	V14	V15	V16	V17	V18	V19	V20	V21	V22	V23	V24	V25	V26	V27	V28	Amount
0	0.0	-1.3598071336738	-0.0727811733098497	2.53634673796914	1.37815522427443	-0.338320769942518	0.462387777762292	0.239598554061257	0.0986979012610507	0.363786969611213	0.0907941719789316	-0.551599533260813	-0.617800855762348	-0.991389847235408	-0.311169353699879	1.46817697209427	-0.470400525259478	0.207971241929242	0.0257905801985591	0.403992960255733	0.251412098239705	-0.018306777944153	0.277837575558899	-0.110473910188767	0.0669280749146731	0.128539358273528	-0.189114843888824	0.133558376740387	-0.0210530534538215	149.62
1	0.0	1.19185711131486	0.26615071205963	0.16648011335321	0.448154078460911	0.0600176492822243	-0.0823608088155687	-0.0788029833323113	0.0851016549148104	-0.255425128109186	-0.166974414004614	1.61272666105479	1.06523531137287	0.48909501589608	-0.143772296441519	0.635558093258208	0.463917041022171	-0.114804663102346	-0.183361270123994	-0.145783041325259	-0.0690831352230203	-0.225775248033138	-0.638671952771851	0.101288021253234	-0.339846475529127	0.167170404418143	0.125894532368176	-0.0089830991432281	0.0147241691924927	2.69
2	1.0	-1.35835406159823	-1.34016307473609	1.77320934263119	0.379779593034328	-0.503198133318193	1.80049938079263	0.791460956450422	0.247675786588991	-1.51465432260583	0.207642865216696	0.624501459424895	0.066083685268831	0.717292731410831	-0.165945922763554	2.34586494901581	-2.89008319444231	1.10996937869599	-0.121359313195888	-2.26185709530414	0.524979725224404	0.247998153469754	0.771679401917229	0.909412262347719	-0.689280956490685	-0.327641833735251	-0.139096571514147	-0.0553527940384261	-0.0597518405929204	378.66
3	1.0	-0.966271711572087	-0.185226008082898	1.79299333957872	-0.863291275036453	-0.0103088796030823	1.24720316752486	0.23760893977178	0.377435874652262	-1.38702406270197	-0.0549519224713749	-0.226487263835401	0.178228225877303	0.507756869957169	-0.28792374549456	-0.631418117709045	-1.0596472454325	-0.684092786345479	1.96577500349538	-1.2326219700892	-0.208037781160366	-0.108300452035545	0.0052735967825345	-0.190320518742841	-1.17557533186321	0.647376034602038	-0.221928844458407	0.0627228487293033	0.0614576285006353	123.5
4	2.0	-1.15823309349523	0.877736754848451	1.548717846511	0.403033933955121	-0.407193377311653	0.0959214624684256	0.592940745385545	-0.270532677192282	0.817739308235294	0.753074431976354	-0.822842877946363	0.53819555014995	1.3458515932154	-1.11966983471731	0.175121130008994	-0.451449182813529	-0.237033239362776	-0.0381947870352842	0.803486924960175	0.408542360392758	-0.0094306971323291	0.79827849458971	-0.137458079619063	0.141266983824769	-0.206009587619756	0.502292224181569	0.219422229513348	0.215153147499206	69.99

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	V11	V12	V13	V14	V15	V16	V17	V18	V19	V20	V21	V22	V23	V24	V25	V26	V27	V28	Amount
138257	82,565.0	1.11859127825998	0.562708615614581	0.569628132172694	2.98749567105308	-0.365594117191008	-0.53178891607535	-0.0441441249518224	0.0119318115692382	-0.129131427848726	0.0849081565743621	-0.540333553184882	-0.405535179178872	-1.12449319662053	-1.11871638400226	-0.708344083228945	0.403484521999855	0.89014513354152	-0.0253850527776957	-0.901995325192936	-0.20418403655852	-0.128269305733937	-0.218874505626759	-0.0488163716577826	0.617265327928864	0.551383852992399	0.0602202164704386	0.0161364657843415	0.0471004238374625	7.6
60033	49,125.0	1.17068622428586	0.0837586462868538	0.466278464687971	0.913911105187985	-0.0931233280534145	0.427588359510079	-0.372726903094763	0.312776807324368	0.129610439285404	0.188107405142176	0.707980369024214	0.0252122485980969	-1.34149123316402	0.695830794891532	0.890825865939713	0.588590292849305	-0.703942987135041	0.247762788136479	-0.221214803414507	-0.226078189691347	-0.176120706455713	-0.584725605022912	0.0660507633953362	-0.746666958902139	0.232641206443977	-0.547740351492877	0.0380595170058819	0.0109949773040646	3.9
31064	36,195.0	1.07290244801688	-0.0151657747657395	0.942250546807478	1.33063117887976	-0.580474206203662	0.206234895815589	-0.402120891949741	0.31313297852691	0.410088272599882	0.0282224315408947	1.18250930627367	0.70922235548697	-1.40543311438519	0.361666199556206	-0.448324601820023	-0.221590911055599	0.0443126891037833	-0.438542146969058	-0.139385167074192	-0.251464432725128	-0.261719672080143	-0.66572466761628	0.16753475404378	0.16381522977874	0.192246899637426	-0.620974380092519	0.0506094503879474	0.0191811160182732	9.9
245706	152,869.0	2.13690895693366	0.0886462179481961	-2.49091417482061	0.098321044575947	0.789008357338609	-1.39958230925586	0.854902018700341	-0.492911714525658	-0.254999494950959	0.423266663587826	0.537589579310604	0.359201388101551	-0.881515243202571	1.13338807834543	-0.852639540534027	-0.472625411623177	-0.395754373135845	-0.0803761218444183	0.509355447065211	-0.266382528240024	0.278034370432688	0.934892042875776	-0.211838843107761	-0.234265525688845	0.609699115232944	1.0208984195809	-0.154426894059047	-0.112532032632011	2.0
25871	33,805.0	-2.44837759416049	-1.33550776962431	1.24043086970596	1.80006772150734	0.383084074600485	-0.501159877615605	1.08040956757211	-0.60409287158821	-0.31945786831444	0.284933549468543	0.22803039313114	-0.109462552980001	0.0449303122994679	0.0313506514415897	1.81120324676505	-0.586386626989747	0.126218727182111	-0.420907177328929	0.315894920729687	-0.72057196499151	-0.121319219568584	0.625540529504405	-0.639100480803456	0.522531746694956	-0.0738009695488949	-0.16278758737067	0.294911791393001	-0.21122240075315	411.1

	0	1	2
0	add_valid	free_network	params
1	attr	get_leaf_output	predict
2	best_iteration	get_split_value_histogram	refit
3	best_score	handle	reset_parameter
4	current_iteration	model_from_string	rollback_one_iter
5	dump_model	model_to_string	save_model
6	eval	name_valid_sets	set_attr
7	eval_train	network	set_network
8	eval_valid	num_feature	set_train_data_name
9	feature_importance	num_model_per_iteration	shuffle_models
10	feature_name	num_trees	update
11	free_dataset	pandas_categorical

	Model	Description	Accuracy	Precision	Recall	F0.5	F1	F2	AUC	AUCPR	Time Taken	Time Taken Sec
1	lightgbm	grid search optuna	0.9994513814215804	0.8944281524926686	0.7741116751269036	0.8674630261660977	0.8299319727891157	0.7955138236828377	0.9747168057416373	0.8329147728927615	5.85 sec	5.851884126663208
0	lightgbm	default	0.9957778314204833	0.19851380042462846	0.4746192893401015	0.22465160980297938	0.27994011976047906	0.37132644956314537	0.6490154615105698	0.2980270352246768	9.46 sec	9.460867166519165

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	V11	V12	V13	V14	V15	V16	V17	V18	V19	V20	V21	V22	V23	V24	V25	V26	V27	V28	Amount	Class
0	0.0	-1.3598071336738	-0.0727811733098497	2.53634673796914	1.37815522427443	-0.338320769942518	0.462387777762292	0.239598554061257	0.0986979012610507	0.363786969611213	0.0907941719789316	-0.551599533260813	-0.617800855762348	-0.991389847235408	-0.311169353699879	1.46817697209427	-0.470400525259478	0.207971241929242	0.0257905801985591	0.403992960255733	0.251412098239705	-0.018306777944153	0.277837575558899	-0.110473910188767	0.0669280749146731	0.128539358273528	-0.189114843888824	0.133558376740387	-0.0210530534538215	149.62	0
1	0.0	1.19185711131486	0.26615071205963	0.16648011335321	0.448154078460911	0.0600176492822243	-0.0823608088155687	-0.0788029833323113	0.0851016549148104	-0.255425128109186	-0.166974414004614	1.61272666105479	1.06523531137287	0.48909501589608	-0.143772296441519	0.635558093258208	0.463917041022171	-0.114804663102346	-0.183361270123994	-0.145783041325259	-0.0690831352230203	-0.225775248033138	-0.638671952771851	0.101288021253234	-0.339846475529127	0.167170404418143	0.125894532368176	-0.0089830991432281	0.0147241691924927	2.69	0

Table of Contents

Data Description¶

Imports¶

Useful Scripts¶

Load the data¶

Train Validation Test Split with Stratify¶

Model: LightGBM¶

HPO Hyper Parameter Optimization with Optuna¶

LightGBM cross validation¶

Model Interpretation¶

Model interpretation using eli5¶

Model interpretation using lime¶

Model interpretation using shap¶

Time taken¶

Weight	Feature
0.4097	V14
0.2378	V10
0.0928	V12
0.0359	V4
0.0328	V17
0.0154	V26
0.0139	Amount
0.0134	V7
0.0131	V8
0.0111	V9
0.0098	V21
0.0094	V27
0.0083	V28
0.0076	V16
0.0076	V13
0.0074	V6
0.0072	V25
0.0071	V5
0.0064	V20
0.0059	V3
… 10 more …

Weight	Feature
0.0009 ± 0.0000	V14
0.0003 ± 0.0000	V10
0.0001 ± 0.0001	V26
0.0001 ± 0.0001	Amount
0.0001 ± 0.0001	V4
0.0001 ± 0.0000	V17
0.0000 ± 0.0000	V28
0.0000 ± 0.0000	V27
0.0000 ± 0.0000	V8
0.0000 ± 0.0000	V13
0.0000 ± 0.0000	V11
0.0000 ± 0.0000	V1
0.0000 ± 0.0000	V16
0.0000 ± 0.0000	V6
0.0000 ± 0.0000	V5
0.0000 ± 0.0000	V2
0.0000 ± 0.0000	V20
0.0000 ± 0.0000	V24
0.0000 ± 0.0000	V9
0.0000 ± 0.0000	V22
… 10 more …

	0	1	2	3	4	5	6
0	ActionOptimizer	GradientExplainer	actions	explainers	kmeans	multioutput_decision_plot	summary_plot
1	AdditiveExplainer	KernelExplainer	approximate_interactions	force_plot	links	other	text_plot
2	Cohorts	LinearExplainer	bar_plot	getjs	maskers	partial_dependence_plot	unsupported
3	DeepExplainer	PartitionExplainer	datasets	group_difference_plot	matplotlib	plots	utils
4	Explainer	PermutationExplainer	decision_plot	have_matplotlib	models	sample	warnings
5	Explanation	SamplingExplainer	dependence_plot	image_plot	monitoring_plot	save_html	waterfall_plot
6	GPUTreeExplainer	TreeExplainer	embedding_plot	initjs

	Not-converted	Converted
Not-converted	56758	106
Converted	36	62