%%capture
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install scikit-plot
    !pip install lrcurve
    !pip install watermark
    !pip install -U scikit-learn

    ## print
    print('Environment: Google Colaboratory.')
import time
time_start_notebook = time.time()
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (6, 4)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
# random state
SEED = 0
RNG = np.random.RandomState(SEED)
# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 50)
import scipy
from scipy import stats
# scale and split
import sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
# deep learning
import tensorflow as tf
import keras
from keras import backend as K
from keras.models import Sequential
from keras.layers import Activation
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy
# model evaluation
import scikitplot
from scikitplot import metrics as skmetrics
import lrcurve
from lrcurve import KerasLearningCurve
# versions
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2020-10-05

CPython 3.6.9
IPython 5.5.0

compiler   : GCC 8.4.0
system     : Linux
release    : 4.19.112+
machine    : x86_64
processor  : x86_64
CPU cores  : 2
interpreter: 64bit

pandas     1.1.2
keras      2.4.3
scipy      1.4.1
matplotlib 3.2.2
tensorflow 2.3.0
scikitplot 0.3.7
seaborn    0.11.0
sklearn    0.23.2
numpy      1.18.5
def show_methods(obj, ncols=3):
    """Show the public attributes of a given object as a dataframe.

    Example:
    ========
    show_methods(list)
    """
    x = [i for i in dir(obj) if i[0] != '_']
    x = [i for i in x
         if i not in 'os np pd sys time psycopg2'.split()]
    return pd.DataFrame(np.array_split(x, ncols)).T.fillna('')
def set_random_seed(seed):
    """Seed the python, numpy, and tensorflow RNGs for reproducibility."""
    import os
    import random
    import numpy as np
    import tensorflow as tf

    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
def model_evaluation(model_name, desc, ytest, ypreds, df_eval=None,
                     show=True, sort_col='Recall'):
    """Append binary-classification metrics for one model to df_eval."""
    if df_eval is None:
        df_eval = pd.DataFrame({'Model': [],
                                'Description': [],
                                'Accuracy': [],
                                'Precision': [],
                                'Recall': [],
                                'F1': [],
                                'AUC': [],
                                })

    # model evaluation
    average = 'binary'
    row_eval = [model_name, desc,
                sklearn.metrics.accuracy_score(ytest, ypreds),
                sklearn.metrics.precision_score(ytest, ypreds, average=average),
                sklearn.metrics.recall_score(ytest, ypreds, average=average),
                sklearn.metrics.f1_score(ytest, ypreds, average=average),
                sklearn.metrics.roc_auc_score(ytest, ypreds),
                ]

    df_eval.loc[len(df_eval)] = row_eval
    df_eval = df_eval.drop_duplicates()
    df_eval = df_eval.sort_values(sort_col)

    if show:
        display(df_eval.style.background_gradient(subset=[sort_col]))
    return df_eval
ifile = "https://github.com/bhishanpdl/Datasets/blob/master/Projects/Fraud_detection/raw/creditcard.csv.zip?raw=true"
# ifile = '../data/raw/creditcard.csv.zip'
df_raw = pd.read_csv(ifile,compression='zip')
print(df_raw.shape)
df_raw.head()
(284807, 31)
 | Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | V13 | V14 | V15 | V16 | V17 | V18 | V19 | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | 0.090794 | -0.551600 | -0.617801 | -0.991390 | -0.311169 | 1.468177 | -0.470401 | 0.207971 | 0.025791 | 0.403993 | 0.251412 | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | -0.166974 | 1.612727 | 1.065235 | 0.489095 | -0.143772 | 0.635558 | 0.463917 | -0.114805 | -0.183361 | -0.145783 | -0.069083 | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | 0.207643 | 0.624501 | 0.066084 | 0.717293 | -0.165946 | 2.345865 | -2.890083 | 1.109969 | -0.121359 | -2.261857 | 0.524980 | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | -0.054952 | -0.226487 | 0.178228 | 0.507757 | -0.287924 | -0.631418 | -1.059647 | -0.684093 | 1.965775 | -1.232622 | -0.208038 | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | 0.753074 | -0.822843 | 0.538196 | 1.345852 | -1.119670 | 0.175121 | -0.451449 | -0.237033 | -0.038195 | 0.803487 | 0.408542 | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
target = 'Class'
display(df_raw[target].value_counts())
sns.countplot(x=df_raw[target])
0    284315
1       492
Name: Class, dtype: int64
neg, pos = np.bincount(df_raw['Class'])
total = neg + pos
print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))
Examples:
 Total: 284807
 Positive: 492 (0.17% of total)
cols_drop = ['Time']
df = df_raw.drop(cols_drop,axis=1)
df.shape
(284807, 30)
eps = 0.001  # 0 => 0.1¢ (a zero amount becomes about 0.1 cents before the log)
df['Amount'] = np.log(df.pop('Amount') + eps)
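The raw Amount spans several orders of magnitude, so it is log-transformed; the small eps keeps zero-valued transactions finite. A quick sketch of the effect (numpy only; 149.62 is the first Amount in the table above):

print(np.log(0 + eps))       # ~ -6.91 : a zero amount stays finite instead of -inf
print(np.log(149.62 + eps))  # ~  5.01 : large amounts are compressed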
from sklearn.model_selection import train_test_split
target = 'Class'
df_Xtrain_orig, df_Xtest, ser_ytrain_orig, ser_ytest = train_test_split(
    df.drop([target], axis=1),
    df[target],
    test_size=0.2,
    stratify=df[target],
    random_state=SEED)

df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
    df_Xtrain_orig,
    ser_ytrain_orig,
    test_size=0.2,
    stratify=ser_ytrain_orig,
    random_state=SEED)
ytrain = np.array(ser_ytrain)
yvalid = np.array(ser_yvalid)
ytest = np.array(ser_ytest)
df.shape, df_Xtrain.shape, ser_ytrain.shape
((284807, 30), (182276, 29), (182276,))
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df_Xtrain)
Xtrain = scaler.transform(df_Xtrain)
Xvalid = scaler.transform(df_Xvalid)
Xtest = scaler.transform(df_Xtest)
# clip the values
Xtrain = np.clip(Xtrain, -5, 5)
Xvalid = np.clip(Xvalid, -5, 5)
Xtest = np.clip(Xtest, -5, 5)
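Clipping after standardization caps the influence of the long-tailed PCA features. A quick sanity check, a sketch using the arrays above:

print(Xtrain.min(), Xtrain.max())  # both bounded: no value lies outside [-5, 5]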
neg, pos = np.bincount(df_raw['Class'])
total = neg + pos
initial_bias = np.log([pos/neg])
initial_bias
array([-6.35935934])
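Why log(pos/neg)? For a sigmoid output unit with bias b = log(pos/neg), sigmoid(b) = 1/(1 + neg/pos) = pos/total, so the untrained network already predicts the base fraud rate. A quick check with the variables above:

p0 = 1 / (1 + np.exp(-initial_bias))  # sigmoid of the initial bias
print(p0, pos / total)                # both ~ 0.00173, the fraud base rate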
weight_for_0 = (1 / neg)*(total)/2.0
weight_for_1 = (1 / pos)*(total)/2.0
class_weight = {0: weight_for_0, 1: weight_for_1}
print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))
Weight for class 0: 0.50
Weight for class 1: 289.44
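These weights make the two classes contribute equally to the loss, since neg * weight_for_0 = pos * weight_for_1 = total / 2. A quick check:

print(neg * weight_for_0, pos * weight_for_1)  # both 142403.5 == total / 2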
n_feats = Xtrain.shape[-1]
class_weight = {0: weight_for_0, 1: weight_for_1}
#============================================================
PARAMS_MODEL = {
    # layer 1
    'L1_units': 16,
    'L1_act': 'relu',
    'L1_dropout': 0.5,
    # optimizer
    'adam_lr': 1e-3,
}
#============================================================
METRICS = [
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
]
#============================================================
PARAMS_FIT = {'epochs': 100,
              'batch_size': 2048,
              'class_weight0': class_weight[0],
              'class_weight1': class_weight[1],
              'patience': 10,
              'shuffle': True,
              }
#============================================================
PARAMS_CLF = {'class_weight': class_weight}
#============================================================
# callbacks
cb_early = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    verbose=1,
    patience=PARAMS_FIT['patience'],
    mode='max',
    restore_best_weights=True)
# cb_checkpt = keras.callbacks.ModelCheckpoint("fraud_model_at_epoch_{epoch}.h5")
cb_lr = lrcurve.KerasLearningCurve()
callbacks = [cb_early, cb_lr]
def one_layer(metrics=METRICS, output_bias=None, n_feats=n_feats):
    """Build and compile a one-hidden-layer binary classifier."""
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    model = keras.Sequential([
        # layer 1
        keras.layers.Dense(PARAMS_MODEL['L1_units'],
                           activation=PARAMS_MODEL['L1_act'],
                           input_shape=(n_feats,)),
        keras.layers.Dropout(PARAMS_MODEL['L1_dropout']),
        # the last layer is a single unit with sigmoid activation
        keras.layers.Dense(1, activation='sigmoid',
                           bias_initializer=output_bias),
    ])

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=PARAMS_MODEL['adam_lr']),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics)
    return model
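Note that the model below is built with the default output bias; the initial_bias computed earlier could be passed in so training starts from the class prior instead of p = 0.5 (a sketch with a hypothetical name, not used in the runs below):

# model_biased = one_layer(output_bias=initial_bias)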
model = one_layer()
model.summary()
Model: "sequential_11" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense_22 (Dense) (None, 16) 480 _________________________________________________________________ dropout_11 (Dropout) (None, 16) 0 _________________________________________________________________ dense_23 (Dense) (None, 1) 17 ================================================================= Total params: 497 Trainable params: 497 Non-trainable params: 0 _________________________________________________________________
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
clf = KerasClassifier(one_layer,
                      batch_size=PARAMS_FIT['batch_size'],
                      epochs=PARAMS_FIT['epochs'],
                      class_weight=class_weight,
                      verbose=0)
# clf.fit accepts all the usual Sequential model.fit parameters
%%time
set_random_seed(SEED)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
scores = []
for idx_tr, idx_vd in skf.split(Xtrain, ytrain):
    X_cv, y_cv = Xtrain[idx_tr], ytrain[idx_tr]
    X_vd, y_vd = Xtrain[idx_vd], ytrain[idx_vd]
    clf.fit(X_cv, y_cv)
    scores.append(clf.score(X_vd, y_vd))  # accuracy on the held-out fold
CPU times: user 7min 23s, sys: 27.8 s, total: 7min 51s
Wall time: 4min 26s
print(scores)
[0.9883418083190918, 0.9879372715950012, 0.9879372715950012, 0.9881910085678101, 0.9894047975540161]
mean_acc = np.mean(scores)
mean_acc
0.9883624315261841
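The manual loop above mirrors cross_val_score, which was imported but not used. An equivalent sketch (cv_scores is a hypothetical name; each fold refits the network, so it is just as slow):

# cv_scores = cross_val_score(clf, Xtrain, ytrain, cv=skf)
# print(cv_scores.mean())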
# show_methods(model)
# clf.get_params()
# clf.sk_params
# # last validation scores
# clf.fit(X_cv, y_cv)
# ypreds = clf.predict(X_cv).flatten()
# df_tmp = model_evaluation("keras", '', y_cv, ypreds,df_eval=None)
# score = clf.score(X_cv,y_cv)
# # I found that score is accuracy score
%%time
history = clf.fit(Xtrain, ytrain,
                  validation_data=(Xvalid, yvalid),
                  callbacks=[cb_early])
Restoring model weights from the end of the best epoch.
Epoch 00023: early stopping
CPU times: user 33.4 s, sys: 2.04 s, total: 35.4 s
Wall time: 21.1 s
show_methods(history)
 | 0 | 1 | 2
---|---|---|---
0 | epoch | on_predict_batch_end | on_train_batch_end |
1 | history | on_predict_begin | on_train_begin |
2 | model | on_predict_end | on_train_end |
3 | on_batch_begin | on_test_batch_begin | params |
4 | on_batch_end | on_test_batch_end | set_model |
5 | on_epoch_begin | on_test_begin | set_params |
6 | on_epoch_end | on_test_end | validation_data |
7 | on_predict_batch_begin | on_train_batch_begin |
df_history = pd.DataFrame(history.history)
df_history.head()
 | loss | tp | fp | tn | fn | accuracy | precision | recall | auc | val_loss | val_tp | val_fp | val_tn | val_fn | val_accuracy | val_precision | val_recall | val_auc
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.683066 | 229.0 | 43589.0 | 183862.0 | 165.0 | 0.807966 | 0.005226 | 0.581218 | 0.750562 | 0.478051 | 63.0 | 5305.0 | 40185.0 | 16.0 | 0.883232 | 0.011736 | 0.797468 | 0.867813 |
1 | 0.490947 | 216.0 | 28510.0 | 153451.0 | 99.0 | 0.843046 | 0.007519 | 0.685714 | 0.815893 | 0.396563 | 64.0 | 2119.0 | 43371.0 | 15.0 | 0.953170 | 0.029317 | 0.810127 | 0.921812 |
2 | 0.390479 | 236.0 | 19219.0 | 162742.0 | 79.0 | 0.894128 | 0.012131 | 0.749206 | 0.886122 | 0.333194 | 67.0 | 1204.0 | 44286.0 | 12.0 | 0.973315 | 0.052714 | 0.848101 | 0.937796 |
3 | 0.316840 | 258.0 | 13002.0 | 168959.0 | 57.0 | 0.928356 | 0.019457 | 0.819048 | 0.933932 | 0.280624 | 68.0 | 913.0 | 44577.0 | 11.0 | 0.979723 | 0.069317 | 0.860759 | 0.949478 |
4 | 0.289923 | 265.0 | 9150.0 | 172811.0 | 50.0 | 0.949527 | 0.028147 | 0.841270 | 0.942312 | 0.239485 | 68.0 | 736.0 | 44754.0 | 11.0 | 0.983607 | 0.084577 | 0.860759 | 0.956797 |
from sklearn import metrics as skmetrics  # note: rebinds the earlier scikitplot alias to sklearn.metrics
# KerasClassifier has predict_proba, but a plain Sequential model does not
try:
    yprobs = clf.predict_proba(Xtest)
except AttributeError:
    yprobs = clf.predict(Xtest)
yprobs[:5]
array([[4.8686218e-01, 5.1313782e-01],
       [9.9999642e-01, 3.5917092e-06],
       [9.9693179e-01, 3.0682087e-03],
       [9.9992031e-01, 7.9715690e-05],
       [9.9840224e-01, 1.5977919e-03]], dtype=float32)
if yprobs.shape[1] == 2:
    yprobs = yprobs[:, 1]
ypreds = (yprobs.flatten()>0.5).astype(np.int8)
ypreds[:5]
array([1, 0, 0, 0, 0], dtype=int8)
Xtest.shape, ytest.shape, ypreds.shape
((56962, 29), (56962,), (56962,))
skmetrics.confusion_matrix(ytest, ypreds)
array([[56190,   674],
       [   13,    85]])
from scikitplot import metrics as skpmetrics
skpmetrics.plot_confusion_matrix(ytest,ypreds)
def plot_cm(labels, predictions, p=0.5):
    cm = sklearn.metrics.confusion_matrix(labels, predictions > p)
    plt.figure(figsize=(5, 5))
    sns.heatmap(cm, annot=True, fmt="d")
    plt.title('Confusion matrix @{:.2f}'.format(p))
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')

    print('Legitimate Transactions Detected (True Negatives): ', cm[0][0])
    print('Legitimate Transactions Incorrectly Detected (False Positives): ', cm[0][1])
    print('Fraudulent Transactions Missed (False Negatives): ', cm[1][0])
    print('Fraudulent Transactions Detected (True Positives): ', cm[1][1])
    print('Total Fraudulent Transactions: ', np.sum(cm[1]))
plot_cm(ytest,ypreds)
Legitimate Transactions Detected (True Negatives):  56190
Legitimate Transactions Incorrectly Detected (False Positives):  674
Fraudulent Transactions Missed (False Negatives):  13
Fraudulent Transactions Detected (True Positives):  85
Total Fraudulent Transactions:  98
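Since plot_cm thresholds its second argument at p, passing the probabilities instead of the hard 0/1 labels makes other operating points inspectable (a sketch; the 0.9 threshold is only illustrative):

# plot_cm(ytest, yprobs.flatten(), p=0.9)  # stricter threshold: fewer false alarms, more missed fraud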
desc = "simple model"
yprobs = model.predict(Xtest)
ypreds = ypreds.ravel()
ypreds = (yprobs.flatten()>0.5).astype(np.int8)
df_eval = model_evaluation("keras", desc, ytest, ypreds,df_eval=None)
 | Model | Description | Accuracy | Precision | Recall | F1 | AUC
---|---|---|---|---|---|---|---
0 | keras | simple model | 0.987939 | 0.111989 | 0.867347 | 0.198366 | 0.927747 |
df_history.head(2)
 | loss | tp | fp | tn | fn | accuracy | precision | recall | auc | val_loss | val_tp | val_fp | val_tn | val_fn | val_accuracy | val_precision | val_recall | val_auc
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.683066 | 229.0 | 43589.0 | 183862.0 | 165.0 | 0.807966 | 0.005226 | 0.581218 | 0.750562 | 0.478051 | 63.0 | 5305.0 | 40185.0 | 16.0 | 0.883232 | 0.011736 | 0.797468 | 0.867813 |
1 | 0.490947 | 216.0 | 28510.0 | 153451.0 | 99.0 | 0.843046 | 0.007519 | 0.685714 | 0.815893 | 0.396563 | 64.0 | 2119.0 | 43371.0 | 15.0 | 0.953170 | 0.029317 | 0.810127 | 0.921812 |
def plot_metrics(history):
    metrics = ['loss', 'auc', 'precision', 'recall']
    for n, metric in enumerate(metrics):
        name = metric.replace("_", " ").capitalize()
        plt.subplot(2, 2, n+1)
        plt.plot(history.epoch, history.history[metric],
                 color=colors[0], label='Train')
        plt.plot(history.epoch, history.history['val_'+metric],
                 color=colors[0], linestyle="--", label='Val')
        plt.xlabel('Epoch')
        plt.ylabel(name)
        if metric == 'loss':
            plt.ylim([0, plt.ylim()[1]])
        elif metric == 'auc':
            plt.ylim([0.8, 1])
        else:
            plt.ylim([0, 1])
        plt.legend()
plot_metrics(history)
time_taken = time.time() - time_start_notebook
h, m = divmod(time_taken, 60*60)
print('Time taken to run whole notebook: {:.0f} hr '
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m, 60)))