%%capture
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install scikit-plot
    !pip install lrcurve
    !pip install watermark
    !pip install -U scikit-learn

    ## print
    print('Environment: Google Colaboratory.')
import time
time_start_notebook = time.time()
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (6, 4)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
# random state
SEED = 0
RNG = np.random.RandomState(SEED)
# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 50)
import scipy
from scipy import stats
# scale and split
import sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
# deep learning
import tensorflow as tf
import keras
from keras import backend as K
from keras.models import Sequential
from keras.layers import Activation
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy
# model evaluation
import scikitplot
from scikitplot import metrics as skmetrics
import lrcurve
from lrcurve import KerasLearningCurve
# versions
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2020-10-05

CPython 3.6.9
IPython 5.5.0

compiler   : GCC 8.4.0
system     : Linux
release    : 4.19.112+
machine    : x86_64
processor  : x86_64
CPU cores  : 2
interpreter: 64bit

pandas     1.1.2
keras      2.4.3
scipy      1.4.1
matplotlib 3.2.2
tensorflow 2.3.0
scikitplot 0.3.7
seaborn    0.11.0
sklearn    0.23.2
numpy      1.18.5
def show_methods(obj, ncols=3):
    """Show the public attributes of a given object as a dataframe.

    Example:
    ========
    show_methods(list)
    """
    x = [i for i in dir(obj) if i[0] != '_']
    x = [i for i in x
         if i not in 'os np pd sys time psycopg2'.split()]
    return pd.DataFrame(np.array_split(x, ncols)).T.fillna('')
def set_random_seed(seed):
    """Seed the python, numpy, and tensorflow RNGs for reproducibility."""
    import os
    import random
    import numpy as np
    import tensorflow as tf

    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
def model_evaluation(model_name, desc, ytest, ypreds, df_eval=None,
                     show=True, sort_col='Recall'):
    """Append binary-classification metrics for one model to df_eval."""
    if df_eval is None:
        df_eval = pd.DataFrame({'Model': [],
                                'Description': [],
                                'Accuracy': [],
                                'Precision': [],
                                'Recall': [],
                                'F1': [],
                                'AUC': [],
                                })

    # model evaluation
    average = 'binary'
    row_eval = [model_name, desc,
                sklearn.metrics.accuracy_score(ytest, ypreds),
                sklearn.metrics.precision_score(ytest, ypreds, average=average),
                sklearn.metrics.recall_score(ytest, ypreds, average=average),
                sklearn.metrics.f1_score(ytest, ypreds, average=average),
                sklearn.metrics.roc_auc_score(ytest, ypreds),
                ]

    df_eval.loc[len(df_eval)] = row_eval
    df_eval = df_eval.drop_duplicates()
    df_eval = df_eval.sort_values(sort_col)

    if show:
        display(df_eval.style.background_gradient(subset=[sort_col]))
    return df_eval
ifile = "https://github.com/bhishanpdl/Datasets/blob/master/Projects/Fraud_detection/raw/creditcard.csv.zip?raw=true"
# ifile = '../data/raw/creditcard.csv.zip'
df_raw = pd.read_csv(ifile,compression='zip')
print(df_raw.shape)
df_raw.head()
(284807, 31)
 | Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | V13 | V14 | V15 | V16 | V17 | V18 | V19 | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | 0.090794 | -0.551600 | -0.617801 | -0.991390 | -0.311169 | 1.468177 | -0.470401 | 0.207971 | 0.025791 | 0.403993 | 0.251412 | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | -0.166974 | 1.612727 | 1.065235 | 0.489095 | -0.143772 | 0.635558 | 0.463917 | -0.114805 | -0.183361 | -0.145783 | -0.069083 | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | 0.207643 | 0.624501 | 0.066084 | 0.717293 | -0.165946 | 2.345865 | -2.890083 | 1.109969 | -0.121359 | -2.261857 | 0.524980 | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | -0.054952 | -0.226487 | 0.178228 | 0.507757 | -0.287924 | -0.631418 | -1.059647 | -0.684093 | 1.965775 | -1.232622 | -0.208038 | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | 0.753074 | -0.822843 | 0.538196 | 1.345852 | -1.119670 | 0.175121 | -0.451449 | -0.237033 | -0.038195 | 0.803487 | 0.408542 | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
target = 'Class'
display(df_raw[target].value_counts())
sns.countplot(x=df_raw[target])
0    284315
1       492
Name: Class, dtype: int64
neg, pos = np.bincount(df_raw['Class'])
total = neg + pos
print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))
Examples:
 Total: 284807
 Positive: 492 (0.17% of total)
cols_drop = ['Time']
df = df_raw.drop(cols_drop,axis=1)
df.shape
(284807, 30)
eps = 0.001  # 0 => 0.1¢ (a zero amount becomes about 0.1 cents before the log)
df['Amount'] = np.log(df.pop('Amount') + eps)
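The raw Amount spans several orders of magnitude, so it is log-transformed; the small eps keeps zero-valued transactions finite. A quick sketch of the effect (numpy only; 149.62 is the first Amount in the table above):

print(np.log(0 + eps))       # ~ -6.91 : a zero amount stays finite instead of -inf
print(np.log(149.62 + eps))  # ~  5.01 : large amounts are compressed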
from sklearn.model_selection import train_test_split
target = 'Class'
df_Xtrain_orig, df_Xtest, ser_ytrain_orig, ser_ytest = train_test_split(
    df.drop([target], axis=1),
    df[target],
    test_size=0.2,
    stratify=df[target],
    random_state=SEED)

df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
    df_Xtrain_orig,
    ser_ytrain_orig,
    test_size=0.2,
    stratify=ser_ytrain_orig,
    random_state=SEED)
ytrain = np.array(ser_ytrain)
yvalid = np.array(ser_yvalid)
ytest = np.array(ser_ytest)
df.shape, df_Xtrain.shape, ser_ytrain.shape
((284807, 30), (182276, 29), (182276,))
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df_Xtrain)
Xtrain = scaler.transform(df_Xtrain)
Xvalid = scaler.transform(df_Xvalid)
Xtest = scaler.transform(df_Xtest)
# clip the values
Xtrain = np.clip(Xtrain, -5, 5)
Xvalid = np.clip(Xvalid, -5, 5)
Xtest = np.clip(Xtest, -5, 5)
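Clipping after standardization caps the influence of the long-tailed PCA features. A quick sanity check, a sketch using the arrays above:

print(Xtrain.min(), Xtrain.max())  # both bounded: no value lies outside [-5, 5]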
neg, pos = np.bincount(df_raw['Class'])
total = neg + pos
initial_bias = np.log([pos/neg])
initial_bias
array([-6.35935934])
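Why log(pos/neg)? For a sigmoid output unit with bias b = log(pos/neg), sigmoid(b) = 1/(1 + neg/pos) = pos/total, so the untrained network already predicts the base fraud rate. A quick check with the variables above:

p0 = 1 / (1 + np.exp(-initial_bias))  # sigmoid of the initial bias
print(p0, pos / total)                # both ~ 0.00173, the fraud base rate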
weight_for_0 = (1 / neg)*(total)/2.0
weight_for_1 = (1 / pos)*(total)/2.0
class_weight = {0: weight_for_0, 1: weight_for_1}
print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))
Weight for class 0: 0.50
Weight for class 1: 289.44
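These weights make the two classes contribute equally to the loss, since neg * weight_for_0 = pos * weight_for_1 = total / 2. A quick check:

print(neg * weight_for_0, pos * weight_for_1)  # both 142403.5 == total / 2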
n_feats = Xtrain.shape[-1]
class_weight = {0: weight_for_0, 1: weight_for_1}
#============================================================
PARAMS_MODEL = {
    # layer 1
    'L1_units': 16,
    'L1_act': 'relu',
    'L1_dropout': 0.5,
    # optimizer
    'adam_lr': 1e-3,
}
#============================================================
METRICS = [
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
]
#============================================================
PARAMS_FIT = {'epochs': 100,
              'batch_size': 2048,
              'class_weight0': class_weight[0],
              'class_weight1': class_weight[1],
              'patience': 10,
              'shuffle': True,
              }
#============================================================
PARAMS_CLF = {'class_weight': class_weight}
#============================================================
# callbacks
cb_early = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    verbose=1,
    patience=PARAMS_FIT['patience'],
    mode='max',
    restore_best_weights=True)
# cb_checkpt = keras.callbacks.ModelCheckpoint("fraud_model_at_epoch_{epoch}.h5")
cb_lr = lrcurve.KerasLearningCurve()
callbacks = [cb_early, cb_lr]
def one_layer(metrics=METRICS, output_bias=None, n_feats=n_feats):
    """Build and compile a one-hidden-layer binary classifier."""
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    model = keras.Sequential([
        # layer 1
        keras.layers.Dense(PARAMS_MODEL['L1_units'],
                           activation=PARAMS_MODEL['L1_act'],
                           input_shape=(n_feats,)),
        keras.layers.Dropout(PARAMS_MODEL['L1_dropout']),
        # the last layer is a single unit with sigmoid activation
        keras.layers.Dense(1, activation='sigmoid',
                           bias_initializer=output_bias),
    ])

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=PARAMS_MODEL['adam_lr']),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics)
    return model
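Note that the model below is built with the default output bias; the initial_bias computed earlier could be passed in so training starts from the class prior instead of p = 0.5 (a sketch with a hypothetical name, not used in the runs below):

# model_biased = one_layer(output_bias=initial_bias)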
model = one_layer()
model.summary()
Model: "sequential_11" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense_22 (Dense) (None, 16) 480 _________________________________________________________________ dropout_11 (Dropout) (None, 16) 0 _________________________________________________________________ dense_23 (Dense) (None, 1) 17 ================================================================= Total params: 497 Trainable params: 497 Non-trainable params: 0 _________________________________________________________________
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
clf = KerasClassifier(one_layer,
                      batch_size=PARAMS_FIT['batch_size'],
                      epochs=PARAMS_FIT['epochs'],
                      class_weight=class_weight,
                      verbose=0)
# clf.fit accepts all the usual Sequential model.fit parameters
%%time
set_random_seed(SEED)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
scores = []
for idx_tr, idx_vd in skf.split(Xtrain, ytrain):
    X_cv, y_cv = Xtrain[idx_tr], ytrain[idx_tr]
    X_vd, y_vd = Xtrain[idx_vd], ytrain[idx_vd]
    clf.fit(X_cv, y_cv)
    scores.append(clf.score(X_vd, y_vd))  # accuracy on the held-out fold
CPU times: user 7min 23s, sys: 27.8 s, total: 7min 51s
Wall time: 4min 26s
print(scores)
[0.9883418083190918, 0.9879372715950012, 0.9879372715950012, 0.9881910085678101, 0.9894047975540161]
mean_acc = np.mean(scores)
mean_acc
0.9883624315261841
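The manual loop above mirrors cross_val_score, which was imported but not used. An equivalent sketch (cv_scores is a hypothetical name; each fold refits the network, so it is just as slow):

# cv_scores = cross_val_score(clf, Xtrain, ytrain, cv=skf)
# print(cv_scores.mean())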
# show_methods(model)
# clf.get_params()
# clf.sk_params
# # last validation scores
# clf.fit(X_cv, y_cv)
# ypreds = clf.predict(X_cv).flatten()
# df_tmp = model_evaluation("keras", '', y_cv, ypreds,df_eval=None)
# score = clf.score(X_cv,y_cv)
# # I found that score is accuracy score
%%time
history = clf.fit(Xtrain, ytrain,
                  validation_data=(Xvalid, yvalid),
                  callbacks=[cb_early])
Restoring model weights from the end of the best epoch.
Epoch 00023: early stopping
CPU times: user 33.4 s, sys: 2.04 s, total: 35.4 s
Wall time: 21.1 s
show_methods(history)
 | 0 | 1 | 2
---|---|---|---
0 | epoch | on_predict_batch_end | on_train_batch_end |
1 | history | on_predict_begin | on_train_begin |
2 | model | on_predict_end | on_train_end |
3 | on_batch_begin | on_test_batch_begin | params |
4 | on_batch_end | on_test_batch_end | set_model |
5 | on_epoch_begin | on_test_begin | set_params |
6 | on_epoch_end | on_test_end | validation_data |
7 | on_predict_batch_begin | on_train_batch_begin |
df_history = pd.DataFrame(history.history)
df_history.head()
 | loss | tp | fp | tn | fn | accuracy | precision | recall | auc | val_loss | val_tp | val_fp | val_tn | val_fn | val_accuracy | val_precision | val_recall | val_auc
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.683066 | 229.0 | 43589.0 | 183862.0 | 165.0 | 0.807966 | 0.005226 | 0.581218 | 0.750562 | 0.478051 | 63.0 | 5305.0 | 40185.0 | 16.0 | 0.883232 | 0.011736 | 0.797468 | 0.867813 |
1 | 0.490947 | 216.0 | 28510.0 | 153451.0 | 99.0 | 0.843046 | 0.007519 | 0.685714 | 0.815893 | 0.396563 | 64.0 | 2119.0 | 43371.0 | 15.0 | 0.953170 | 0.029317 | 0.810127 | 0.921812 |
2 | 0.390479 | 236.0 | 19219.0 | 162742.0 | 79.0 | 0.894128 | 0.012131 | 0.749206 | 0.886122 | 0.333194 | 67.0 | 1204.0 | 44286.0 | 12.0 | 0.973315 | 0.052714 | 0.848101 | 0.937796 |
3 | 0.316840 | 258.0 | 13002.0 | 168959.0 | 57.0 | 0.928356 | 0.019457 | 0.819048 | 0.933932 | 0.280624 | 68.0 | 913.0 | 44577.0 | 11.0 | 0.979723 | 0.069317 | 0.860759 | 0.949478 |
4 | 0.289923 | 265.0 | 9150.0 | 172811.0 | 50.0 | 0.949527 | 0.028147 | 0.841270 | 0.942312 | 0.239485 | 68.0 | 736.0 | 44754.0 | 11.0 | 0.983607 | 0.084577 | 0.860759 | 0.956797 |
from sklearn import metrics as skmetrics  # note: rebinds the earlier scikitplot alias to sklearn.metrics
# KerasClassifier has predict_proba, but a plain Sequential model does not
try:
    yprobs = clf.predict_proba(Xtest)
except AttributeError:
    yprobs = clf.predict(Xtest)
yprobs[:5]
array([[4.8686218e-01, 5.1313782e-01],
       [9.9999642e-01, 3.5917092e-06],
       [9.9693179e-01, 3.0682087e-03],
       [9.9992031e-01, 7.9715690e-05],
       [9.9840224e-01, 1.5977919e-03]], dtype=float32)
if yprobs.shape[1] == 2:
    yprobs = yprobs[:, 1]
ypreds = (yprobs.flatten()>0.5).astype(np.int8)
ypreds[:5]
array([1, 0, 0, 0, 0], dtype=int8)
Xtest.shape, ytest.shape, ypreds.shape
((56962, 29), (56962,), (56962,))
skmetrics.confusion_matrix(ytest, ypreds)
array([[56190,   674],
       [   13,    85]])
from scikitplot import metrics as skpmetrics
skpmetrics.plot_confusion_matrix(ytest,ypreds)
def plot_cm(labels, predictions, p=0.5):
    cm = sklearn.metrics.confusion_matrix(labels, predictions > p)
    plt.figure(figsize=(5, 5))
    sns.heatmap(cm, annot=True, fmt="d")
    plt.title('Confusion matrix @{:.2f}'.format(p))
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')

    print('Legitimate Transactions Detected (True Negatives): ', cm[0][0])
    print('Legitimate Transactions Incorrectly Detected (False Positives): ', cm[0][1])
    print('Fraudulent Transactions Missed (False Negatives): ', cm[1][0])
    print('Fraudulent Transactions Detected (True Positives): ', cm[1][1])
    print('Total Fraudulent Transactions: ', np.sum(cm[1]))
plot_cm(ytest,ypreds)
Legitimate Transactions Detected (True Negatives):  56190
Legitimate Transactions Incorrectly Detected (False Positives):  674
Fraudulent Transactions Missed (False Negatives):  13
Fraudulent Transactions Detected (True Positives):  85
Total Fraudulent Transactions:  98
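Since plot_cm thresholds its second argument at p, passing the probabilities instead of the hard 0/1 labels makes other operating points inspectable (a sketch; the 0.9 threshold is only illustrative):

# plot_cm(ytest, yprobs.flatten(), p=0.9)  # stricter threshold: fewer false alarms, more missed fraud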
desc = "simple model"
yprobs = model.predict(Xtest)
ypreds = ypreds.ravel()
ypreds = (yprobs.flatten()>0.5).astype(np.int8)
df_eval = model_evaluation("keras", desc, ytest, ypreds,df_eval=None)
 | Model | Description | Accuracy | Precision | Recall | F1 | AUC
---|---|---|---|---|---|---|---
0 | keras | simple model | 0.987939 | 0.111989 | 0.867347 | 0.198366 | 0.927747 |
df_history.head(2)
 | loss | tp | fp | tn | fn | accuracy | precision | recall | auc | val_loss | val_tp | val_fp | val_tn | val_fn | val_accuracy | val_precision | val_recall | val_auc
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.683066 | 229.0 | 43589.0 | 183862.0 | 165.0 | 0.807966 | 0.005226 | 0.581218 | 0.750562 | 0.478051 | 63.0 | 5305.0 | 40185.0 | 16.0 | 0.883232 | 0.011736 | 0.797468 | 0.867813 |
1 | 0.490947 | 216.0 | 28510.0 | 153451.0 | 99.0 | 0.843046 | 0.007519 | 0.685714 | 0.815893 | 0.396563 | 64.0 | 2119.0 | 43371.0 | 15.0 | 0.953170 | 0.029317 | 0.810127 | 0.921812 |
def plot_metrics(history):
    metrics = ['loss', 'auc', 'precision', 'recall']
    for n, metric in enumerate(metrics):
        name = metric.replace("_", " ").capitalize()
        plt.subplot(2, 2, n+1)
        plt.plot(history.epoch, history.history[metric],
                 color=colors[0], label='Train')
        plt.plot(history.epoch, history.history['val_'+metric],
                 color=colors[0], linestyle="--", label='Val')
        plt.xlabel('Epoch')
        plt.ylabel(name)
        if metric == 'loss':
            plt.ylim([0, plt.ylim()[1]])
        elif metric == 'auc':
            plt.ylim([0.8, 1])
        else:
            plt.ylim([0, 1])
        plt.legend()
plot_metrics(history)
time_taken = time.time() - time_start_notebook
h, m = divmod(time_taken, 60*60)
print('Time taken to run whole notebook: {:.0f} hr '
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m, 60)))