%%capture
# capture will not print in notebook

import os
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    ## install modules
    !pip install scikit-plot
    !pip install lrcurve
    !pip install watermark

    ## print
    print('Environment: Google Colaboratory.')


import time
time_start_notebook = time.time()


import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)


import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

import os
import sys

# random state
SEED = 0
RNG = np.random.RandomState(SEED)

# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 50)

import scipy
from scipy import stats

# scale and split
import sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

# deep learning
import tensorflow as tf
import keras.utils.vis_utils

# model evaluation
import scikitplot
from scikitplot import metrics as skmetrics
import lrcurve
from lrcurve import KerasLearningCurve

# versions
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv

Bhishan Poudel 2021-08-08 

CPython 3.7.7
IPython 7.19.0

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

tensorflow 2.5.0
pandas     1.1.1
keras      2.5.0
scikitplot 0.3.7
scipy      1.4.1
sklearn    0.23.2
seaborn    0.10.1
matplotlib 3.2.1
numpy      1.19.5


def show_methods(method, ncols=3):
    """Tabulate the public attribute names of an object.

    Names whose first character is an underscore are skipped. The
    remaining names (in ``dir`` order) are split into ``ncols`` chunks,
    one chunk per column, and missing cells are filled with ''.

    Example:
    ========
    show_methods(list)
    """
    public_names = []
    for attr_name in dir(method):
        if attr_name[0] == '_':
            continue
        public_names.append(attr_name)

    # one chunk per output column; the chunks become rows first,
    # so the frame is transposed afterwards
    chunks = np.array_split(public_names, ncols)
    table = pd.DataFrame(chunks).transpose()
    return table.fillna('')


def set_random_seed(seed):
    """Apply one seed to every random source used in this notebook.

    Stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable,
    then seeds TensorFlow, NumPy's global generator and Python's
    ``random`` module, in that order. Returns nothing.
    """
    import os
    import random
    import numpy as np
    import tensorflow as tf

    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (tf.random.set_seed, np.random.seed, random.seed)
    for apply_seed in seeders:
        apply_seed(seed)


def model_evaluation(model_name, desc, ytest, yprobs1d,df_eval=None,
                     show=True,sort_col='Recall',threshold=0.5):
    """Score predicted probabilities and add the result to an evaluation table.

    Parameters
    ----------
    model_name : str
        Value stored in the 'Model' column.
    desc : str
        Value stored in the 'Description' column.
    ytest : array-like
        True binary labels.
    yprobs1d : array-like
        Predicted probabilities of the positive class. The values are
        flattened before use, so an (n, 1) array is accepted as well.
    df_eval : pandas.DataFrame or None
        Table to extend. When None, an empty table with the columns
        Model, Description, Accuracy, Precision, Recall, F1, AUC and
        AUCPR is created. When a table is passed in, the new row is
        also written into that object.
    show : bool
        If True, display the table with a background gradient on
        ``sort_col`` (uses the notebook's ``display``).
    sort_col : str
        Column used to sort the table (ascending) and to highlight.
    threshold : float
        Probabilities strictly greater than this value are predicted as 1.

    Returns
    -------
    pandas.DataFrame
        The table with the new row added, duplicates dropped and rows
        sorted by ``sort_col``.
    """
    if df_eval is None:
        df_eval = pd.DataFrame({'Model': [],
                        'Description':[],
                        'Accuracy':[],
                        'Precision':[],
                        'Recall':[],
                        'F1':[],
                        'AUC':[],
                        'AUCPR':[],
                    })

    # Use the argument itself (the earlier version thresholded the
    # notebook-global ``yprobs``, so the function silently scored
    # whatever that variable happened to hold).
    probs = np.asarray(yprobs1d).ravel()

    # dtype-based check: ``isinstance(x, float)`` is False for np.float32,
    # which made the warning fire for genuine Keras probabilities.
    if not np.issubdtype(probs.dtype, np.floating):
        print("Make sure to use probability values.")

    # prediction from probs
    ypreds = (probs > threshold).astype(np.int8)

    # model evaluation
    average = 'binary'
    prec, rec, _ = sklearn.metrics.precision_recall_curve(ytest, probs)
    auc_pr = sklearn.metrics.auc(rec, prec)
    row_eval = [model_name, desc,
                sklearn.metrics.accuracy_score(ytest, ypreds),
                sklearn.metrics.precision_score(ytest, ypreds, average=average),
                sklearn.metrics.recall_score(ytest, ypreds, average=average),
                sklearn.metrics.f1_score(ytest, ypreds, average=average),
                sklearn.metrics.roc_auc_score(ytest, probs),
                auc_pr
                ]

    df_eval.loc[len(df_eval)] = row_eval
    df_eval = df_eval.drop_duplicates()
    df_eval = df_eval.sort_values(sort_col)

    if show:
        display(df_eval.style.background_gradient(subset=[sort_col]))

    return df_eval


# Read the zipped credit-card CSV directly from GitHub; the commented-out
# path is the local-file alternative.
ifile = "https://github.com/bhishanpdl/Datasets/blob/master/Projects/Fraud_detection/raw/creditcard.csv.zip?raw=true"
# ifile = '../data/raw/creditcard.csv.zip'

# compression is passed explicitly: the URL ends in "?raw=true", not ".zip"
df_raw = pd.read_csv(ifile,compression='zip')
print(df_raw.shape)
df_raw.head()

(284807, 31)


target = 'Class'
display(df_raw[target].value_counts())
sns.countplot(x=df_raw[target])

0    284315
1       492
Name: Class, dtype: int64

<matplotlib.axes._subplots.AxesSubplot at 0x7fa17e77b190>


neg, pos = np.bincount(df_raw['Class'])
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))

Examples:
    Total: 284807
    Positive: 492 (0.17% of total)


cols_drop = ['Time']

df = df_raw.drop(cols_drop,axis=1)
df.shape

(284807, 30)


# Replace the raw 'Amount' column by log(Amount + eps); eps keeps the
# logarithm finite when the amount is 0.
# NOTE(review): the new column is spelled 'Ammount' (double "m"). Renaming it
# changes a feature name, so confirm nothing downstream relies on this spelling.
eps=0.001 # 0 => 0.1¢
df['Ammount'] = np.log(df.pop('Amount')+eps)


from sklearn.model_selection import train_test_split

target = 'Class'

# First split: hold out 20% of all rows as the test set, stratified on the
# target so the class ratio is kept in both parts.
df_Xtrain_orig,df_Xtest,ser_ytrain_orig,ser_ytest = train_test_split(df.drop([target],axis=1),
                                             df[target],
                                             test_size=0.2,
                                             stratify=df[target],
                                             random_state=SEED)

# Second split: take 20% of the remaining rows as the validation set
# (again stratified), leaving 64% / 16% / 20% for train / valid / test.
df_Xtrain,df_Xvalid,ser_ytrain,ser_yvalid = train_test_split(df_Xtrain_orig,
                                             ser_ytrain_orig,
                                             test_size=0.2,
                                             stratify=ser_ytrain_orig,
                                             random_state=SEED)

# plain NumPy label arrays, used later for fitting and evaluation
ytrain = np.array(ser_ytrain)
yvalid = np.array(ser_yvalid)
ytest = np.array(ser_ytest)

# last expression of the cell: shown as the cell output
df.shape, df_Xtrain.shape, ser_ytrain.shape

((284807, 30), (182276, 29), (182276,))


from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training features only; the same fitted
# transform is then applied to the validation and test features.
scaler = StandardScaler()
scaler.fit(df_Xtrain)

Xtrain = scaler.transform(df_Xtrain)
Xvalid = scaler.transform(df_Xvalid)
Xtest  = scaler.transform(df_Xtest)

# clip the values
# (every scaled feature is limited to the range [-5, 5])
Xtrain = np.clip(Xtrain, -5, 5)
Xvalid = np.clip(Xvalid, -5, 5)
Xtest = np.clip(Xtest, -5, 5)


# Count the rows of class 0 (neg) and class 1 (pos).
# NOTE(review): the counts come from df_raw, i.e. the full dataset including
# the validation and test rows, not from the training split — confirm this is
# intended.
neg, pos = np.bincount(df_raw['Class'])
total = neg + pos

# Initial output bias log(pos/neg), later passed to make_model(bias_initializer=...)
bias_initializer = np.log([pos/neg])
bias_initializer

array([-6.35935934])


# Per-class weight = total / (2 * class_count): the rarer class gets the
# larger weight. neg, pos and total are the counts computed in the previous cell.
weight_for_0 = (1 / neg)*(total)/2.0 
weight_for_1 = (1 / pos)*(total)/2.0

# mapping handed to model.fit(class_weight=...)
class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

Weight for class 0: 0.50
Weight for class 1: 289.44


# Number of input features = size of the last axis of the scaled training matrix.
n_feats = Xtrain.shape[-1]
# Same mapping as built in the class-weights cell, re-created here from the
# same two weights.
class_weight = {0: weight_for_0, 1: weight_for_1}

#============================================================
# Hyper-parameters read by make_model().
PARAMS_MODEL = {
    # layer 1
    'L1_units': 16,
    'L1_act': 'relu',
    'L1_dropout': 0.5,

    # optimizer
    'adam_lr': 1e-3,
}

#============================================================
# Metrics tracked during training; the `name` of each metric is the key
# under which it is looked up later (e.g. 'prc' -> monitor='val_prc').
METRICS = [
      tf.keras.metrics.TruePositives(name='tp'),
      tf.keras.metrics.FalsePositives(name='fp'),
      tf.keras.metrics.TrueNegatives(name='tn'),
      tf.keras.metrics.FalseNegatives(name='fn'), 
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall'),
      tf.keras.metrics.AUC(name='auc'),
      tf.keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]

#============================================================
# Settings for model.fit(). In this notebook only 'epochs', 'batch_size' and
# 'patience' are read back; the class-weight entries and 'shuffle' are
# recorded here but not passed to fit() from this dict.
PARAMS_FIT = {'epochs': 100,
          'batch_size': 2048,
          'class_weight0': class_weight[0],
          'class_weight1': class_weight[1],
          'patience': 10,
          'shuffle': True,
          }

#============================================================
# callbacks
# Early stopping watches the validation 'prc' metric (higher is better),
# waits PARAMS_FIT['patience'] epochs and restores the best weights.
cb_early = tf.keras.callbacks.EarlyStopping(
    monitor='val_prc', 
    verbose=1,
    patience=PARAMS_FIT['patience'],
    mode='max',
    restore_best_weights=True)

#cb_checkpt = keras.callbacks.ModelCheckpoint("fraud_model_at_epoch_{epoch}.h5")
# learning-curve callback from the lrcurve package
cb_lr = lrcurve.KerasLearningCurve()
callbacks = [cb_early, cb_lr]


def make_model(metrics=METRICS, bias_initializer=None,n_feats=n_feats):
    """Build and compile the one-hidden-layer Keras binary classifier.

    Architecture: Dense(PARAMS_MODEL['L1_units']) with the configured
    activation -> Dropout(PARAMS_MODEL['L1_dropout']) -> Dense(1, sigmoid).
    The model is compiled with Adam (learning rate PARAMS_MODEL['adam_lr'])
    and binary cross-entropy loss.

    Parameters
    ----------
    metrics : list
        Keras metrics handed to ``compile``.
    bias_initializer : value or None
        If given, it is wrapped in a Constant initializer and used for the
        bias of the output layer only; the hidden layer keeps its default.
    n_feats : int
        Number of input features.

    Returns
    -------
    The compiled ``tf.keras.Sequential`` model.
    """
    output_bias = bias_initializer
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    # layer 1
    hidden_layer = tf.keras.layers.Dense(
        PARAMS_MODEL['L1_units'],
        activation=PARAMS_MODEL['L1_act'],
        input_shape=(n_feats,),
    )
    dropout_layer = tf.keras.layers.Dropout(PARAMS_MODEL['L1_dropout'])

    # output layer: a single sigmoid unit; only this layer receives
    # the bias initializer
    output_layer = tf.keras.layers.Dense(
        1,
        activation='sigmoid',
        bias_initializer=output_bias,
    )

    model = tf.keras.Sequential([hidden_layer, dropout_layer, output_layer])

    adam = tf.keras.optimizers.Adam(learning_rate=PARAMS_MODEL['adam_lr'])
    bce_loss = tf.keras.losses.BinaryCrossentropy()
    model.compile(optimizer=adam, loss=bce_loss, metrics=metrics)

    return model

model = make_model()
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense (Dense)                (None, 16)                480       
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
=================================================================
Total params: 497
Trainable params: 497
Non-trainable params: 0
_________________________________________________________________


from tensorflow.keras.utils import plot_model

if not os.path.isdir('images'):
    os.makedirs('images')

# save keras sequential model
plot_model(model,'images/keras_model_simple.png')

# For functional model
# plot_model(model, 'keras_model_simple.png')


%%time
# Re-seed all random sources, then rebuild the model — this time with the
# output bias initialised from bias_initializer.
set_random_seed(SEED)
model = make_model(bias_initializer=bias_initializer)

# Train with class weighting and the early-stopping / learning-curve
# callbacks; verbose=0 turns off the per-epoch log.
# Note: PARAMS_FIT['shuffle'] is not forwarded here, so fit() uses its own
# default for shuffling.
history = model.fit(
    Xtrain,
    ytrain,
    batch_size=PARAMS_FIT['batch_size'],
    epochs=PARAMS_FIT['epochs'],
    callbacks=callbacks,
    validation_data=(Xvalid, yvalid),
    verbose=0,
    class_weight=class_weight
    )

WARNING:tensorflow:From /Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/tensorflow/python/ops/array_ops.py:5049: calling gather (from tensorflow.python.ops.array_ops) with validate_indices is deprecated and will be removed in a future version.
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.

Restoring model weights from the end of the best epoch.
Epoch 00012: early stopping
CPU times: user 12.1 s, sys: 1.39 s, total: 13.5 s
Wall time: 16.7 s


from sklearn import metrics as skmetrics


yprobs = model.predict(Xtest)
yprobs[:5]

array([[0.00412297],
       [0.00278735],
       [0.01056832],
       [0.00396076],
       [0.0061807 ]], dtype=float32)


yprobs1d = yprobs.flatten()


ypreds = (yprobs1d>0.5).astype(np.int8)
ypreds[:5]

array([0, 0, 0, 0, 0], dtype=int8)


skmetrics.confusion_matrix(ytest, ypreds)

array([[56793,    71],
       [   22,    76]])


from scikitplot import metrics as skpmetrics


skpmetrics.plot_confusion_matrix(ytest,ypreds)

<matplotlib.axes._subplots.AxesSubplot at 0x7fa18fa87ed0>


def plot_cm(labels, predictions, p=0.5):
    """Draw the confusion matrix as a heatmap and print its four cells.

    ``predictions`` is compared with ``p`` (strictly greater counts as the
    positive class) before the matrix is computed. The matrix is drawn on
    a new 5x5 figure, and the counts are printed together with the total
    of the second matrix row.
    """
    above_threshold = predictions > p
    cm = sklearn.metrics.confusion_matrix(labels, above_threshold)

    plt.figure(figsize=(5,5))
    sns.heatmap(cm, annot=True, fmt="d")
    plt.title('Confusion matrix @{:.2f}'.format(p))
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')

    # (caption, row index, column index) for each cell of the matrix
    cell_captions = (
        ('Legitimate Transactions Detected (True Negatives): ', 0, 0),
        ('Legitimate Transactions Incorrectly Detected (False Positives): ', 0, 1),
        ('Fraudulent Transactions Missed (False Negatives): ', 1, 0),
        ('Fraudulent Transactions Detected (True Positives): ', 1, 1),
    )
    for caption, row, col in cell_captions:
        print(caption, cm[row][col])

    print('Total Fraudulent Transactions: ', np.sum(cm[1]))


plot_cm(ytest,ypreds)

Legitimate Transactions Detected (True Negatives):  56793
Legitimate Transactions Incorrectly Detected (False Positives):  71
Fraudulent Transactions Missed (False Negatives):  22
Fraudulent Transactions Detected (True Positives):  76
Total Fraudulent Transactions:  98


# Score the trained model on the test set and start the evaluation table.
desc = "simple model"
# predict() returns one column of probabilities; flatten it to 1-D
yprobs = model.predict(Xtest)
yprobs1d = yprobs.flatten()

# one probability per test label is required by the metrics below
assert len(ytest) == len(yprobs1d)

# hard 0/1 predictions at the 0.5 threshold
ypreds = (yprobs1d>0.5).astype(np.int8)

# df_eval=None makes model_evaluation create a fresh table
df_eval = model_evaluation("keras", desc, ytest, yprobs1d,df_eval=None)

Make sure to use probability values.
Make sure to use probability values.


matplotlib.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']


def plot_metrics(history):
    """Plot training and validation curves for four tracked metrics.

    For each of 'loss', 'auc', 'precision' and 'recall' a panel of a 2x2
    grid shows the training curve (solid) and the matching 'val_' curve
    (dashed) against the epoch number. The y-axis runs from 0 to the
    automatic upper limit for loss, from 0.8 to 1 for auc, and from 0 to 1
    for the others.
    """
    tracked = ('loss', 'auc', 'precision', 'recall')
    epochs = history.epoch
    curves = history.history

    for panel, key in enumerate(tracked, start=1):
        plt.subplot(2, 2, panel)
        plt.plot(epochs, curves[key], color=colors[0], label='Train')
        plt.plot(epochs, curves['val_' + key],
                 color=colors[0], linestyle="--", label='Val')
        plt.xlabel('Epoch')
        plt.ylabel(key.replace("_", " ").capitalize())

        # choose the y-range for this panel
        if key == 'loss':
            y_range = [0, plt.ylim()[1]]
        elif key == 'auc':
            y_range = [0.8, 1]
        else:
            y_range = [0, 1]
        plt.ylim(y_range)

        plt.legend()


plot_metrics(history)


# Seconds elapsed since time_start_notebook was recorded in the first cell.
time_taken = time.time() - time_start_notebook
# h = whole hours; m = the remaining seconds, split into minutes and
# seconds by the second divmod below
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))

Time taken to run whole notebook: 0 hr 0 min 49 secs

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	V11	V12	V13	V14	V15	V16	V17	V18	V19	V20	V21	V22	V23	V24	V25	V26	V27	V28	Amount
0	0.0	-1.359807	-0.072781	2.536347	1.378155	-0.338321	0.462388	0.239599	0.098698	0.363787	0.090794	-0.551600	-0.617801	-0.991390	-0.311169	1.468177	-0.470401	0.207971	0.025791	0.403993	0.251412	-0.018307	0.277838	-0.110474	0.066928	0.128539	-0.189115	0.133558	-0.021053	149.62
1	0.0	1.191857	0.266151	0.166480	0.448154	0.060018	-0.082361	-0.078803	0.085102	-0.255425	-0.166974	1.612727	1.065235	0.489095	-0.143772	0.635558	0.463917	-0.114805	-0.183361	-0.145783	-0.069083	-0.225775	-0.638672	0.101288	-0.339846	0.167170	0.125895	-0.008983	0.014724	2.69
2	1.0	-1.358354	-1.340163	1.773209	0.379780	-0.503198	1.800499	0.791461	0.247676	-1.514654	0.207643	0.624501	0.066084	0.717293	-0.165946	2.345865	-2.890083	1.109969	-0.121359	-2.261857	0.524980	0.247998	0.771679	0.909412	-0.689281	-0.327642	-0.139097	-0.055353	-0.059752	378.66
3	1.0	-0.966272	-0.185226	1.792993	-0.863291	-0.010309	1.247203	0.237609	0.377436	-1.387024	-0.054952	-0.226487	0.178228	0.507757	-0.287924	-0.631418	-1.059647	-0.684093	1.965775	-1.232622	-0.208038	-0.108300	0.005274	-0.190321	-1.175575	0.647376	-0.221929	0.062723	0.061458	123.50
4	2.0	-1.158233	0.877737	1.548718	0.403034	-0.407193	0.095921	0.592941	-0.270533	0.817739	0.753074	-0.822843	0.538196	1.345852	-1.119670	0.175121	-0.451449	-0.237033	-0.038195	0.803487	0.408542	-0.009431	0.798278	-0.137458	0.141267	-0.206010	0.502292	0.219422	0.215153	69.99

Table of Contents

Introduction

Colab

Imports

Useful Scripts

Load the data

Data Processing

Class balance

Feature Selection

Log transform

Train-validation-test split with stratify

Normalize the data

Modelling: Keras Sequential

Bias Initializer

Class weights

Params and Metrics

Build the Model

Fit the model

Model Evaluation

Confusion Matrix

Accuracy Recall Scores

Training History Plots

Time Taken