%%capture
# %%capture suppresses this cell's output in the notebook
import os
import sys

ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install scikit-plot
    !pip install lrcurve
    !pip install watermark
    ## print
    print('Environment: Google Colaboratory.')
    # NOTE: if we update modules in Google Colab, we need to restart the runtime.
TREE_METHOD = 'auto'
try:
    import tensorflow as tf
    has_gpu = tf.test.gpu_device_name()  # returns '' when no GPU is available
    TREE_METHOD = 'gpu_hist' if has_gpu else 'auto'
except Exception:
    TREE_METHOD = 'auto'
print(TREE_METHOD)
if ENV_COLAB:
    !nvidia-smi
import time
time_start_notebook = time.time()
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
from pprint import pprint
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import os
import sys
# random state
SEED = 0
RNG = np.random.RandomState(SEED)
# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 50)
import scipy
from scipy import stats
# scale and split
import sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
# deep learning
import tensorflow as tf
# model evaluation
import scikitplot
from scikitplot import metrics as skmetrics
import lrcurve
from lrcurve import KerasLearningCurve
# versions
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2021-08-08

CPython 3.7.7
IPython 7.19.0

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

numpy      1.19.5
seaborn    0.10.1
matplotlib 3.2.1
sklearn    0.23.2
tensorflow 2.5.0
scipy      1.4.1
pandas     1.1.1
scikitplot 0.3.7
def show_methods(method, ncols=3):
    """Show all the public attributes of a given object.

    Example:
    ========
    show_methods(list)
    """
    x = [i for i in dir(method) if i[0] != '_']
    return pd.DataFrame(np.array_split(x, ncols)).T.fillna('')
def set_random_seed(seed):
    import os
    import random
    import numpy as np
    import tensorflow as tf

    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
def model_evaluation(model_name, desc, ytest, yprobs1d, df_eval=None,
                     show=True, sort_col='Recall', threshold=0.5):
    if df_eval is None:
        df_eval = pd.DataFrame({'Model': [],
                                'Description': [],
                                'Accuracy': [],
                                'Precision': [],
                                'Recall': [],
                                'F1': [],
                                'AUC': [],
                                'AUCPR': [],
                                })
    # make sure yprobs1d holds probabilities (floats), not hard labels
    if not np.issubdtype(np.asarray(yprobs1d).dtype, np.floating):
        print("Make sure to use probability values.")
    # assert length
    assert len(ytest) == len(yprobs1d), "ytest and yprobs1d must be of the same length."
    # prediction from probabilities
    ypreds = (np.asarray(yprobs1d).flatten() > threshold).astype(np.int8)
    # model evaluation
    average = 'binary'
    prec, rec, thr = sklearn.metrics.precision_recall_curve(ytest, yprobs1d)
    auc_pr = sklearn.metrics.auc(rec, prec)
    row_eval = [model_name, desc,
                sklearn.metrics.accuracy_score(ytest, ypreds),
                sklearn.metrics.precision_score(ytest, ypreds, average=average),
                sklearn.metrics.recall_score(ytest, ypreds, average=average),
                sklearn.metrics.f1_score(ytest, ypreds, average=average),
                sklearn.metrics.roc_auc_score(ytest, yprobs1d),
                auc_pr
                ]
    df_eval.loc[len(df_eval)] = row_eval
    df_eval = df_eval.drop_duplicates()
    df_eval = df_eval.sort_values(sort_col)
    if show:
        display(df_eval.style.background_gradient(subset=[sort_col]))
    return df_eval
ifile = "https://github.com/bhishanpdl/Datasets/blob/master/Projects/Fraud_detection/raw/creditcard.csv.zip?raw=true"
# ifile = '../data/raw/creditcard.csv.zip'
df = pd.read_csv(ifile,compression='zip')
print(df.shape)
df.head()
(284807, 31)
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | V13 | V14 | V15 | V16 | V17 | V18 | V19 | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | 0.090794 | -0.551600 | -0.617801 | -0.991390 | -0.311169 | 1.468177 | -0.470401 | 0.207971 | 0.025791 | 0.403993 | 0.251412 | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | -0.166974 | 1.612727 | 1.065235 | 0.489095 | -0.143772 | 0.635558 | 0.463917 | -0.114805 | -0.183361 | -0.145783 | -0.069083 | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | 0.207643 | 0.624501 | 0.066084 | 0.717293 | -0.165946 | 2.345865 | -2.890083 | 1.109969 | -0.121359 | -2.261857 | 0.524980 | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | -0.054952 | -0.226487 | 0.178228 | 0.507757 | -0.287924 | -0.631418 | -1.059647 | -0.684093 | 1.965775 | -1.232622 | -0.208038 | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | 0.753074 | -0.822843 | 0.538196 | 1.345852 | -1.119670 | 0.175121 | -0.451449 | -0.237033 | -0.038195 | 0.803487 | 0.408542 | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
target = 'Class'
display(df[target].value_counts())
sns.countplot(x=df[target])
0    284315
1       492
Name: Class, dtype: int64
[Figure: countplot of the target 'Class' showing the heavy class imbalance (284,315 non-fraud vs 492 fraud)]
from sklearn.model_selection import train_test_split
target = 'Class'
Xtrain, Xtest, ytrain, ytest = train_test_split(df.drop([target], axis=1),
                                                df[target],
                                                test_size=0.2,
                                                stratify=df[target],
                                                random_state=SEED)
df.shape, Xtrain.shape, ytrain.shape
((284807, 31), (227845, 30), (227845,))
mean = np.mean(Xtrain, axis=0)
std = np.std(Xtrain, axis=0)
Xtrain -= mean
Xtest -= mean
Xtrain /= std
Xtest /= std
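Here the mean and standard deviation are computed on the training split only and reused for the test split, which avoids leaking test statistics into training. A minimal equivalent sketch using the already-imported StandardScaler (shown commented out so the data is not scaled twice):

# equivalent alternative: fit the scaler on the training split only
# scaler = StandardScaler().fit(Xtrain)
# Xtrain = scaler.transform(Xtrain)
# Xtest  = scaler.transform(Xtest)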
# for imbalanced data we can use class_weight
neg,pos = np.bincount(ytrain)
weight_for_0 = 1.0 / neg
weight_for_1 = 1.0 / pos
class_weight = {0: weight_for_0, 1: weight_for_1}
print(class_weight)
{0: 4.396551345124884e-06, 1: 0.0025380710659898475}
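These raw 1/count weights make the two classes contribute equally to the loss in expectation. A common variant (used in the TensorFlow imbalanced-data tutorial) rescales them by total/2 so the loss magnitude stays comparable to the unweighted case; a minimal sketch:

total = neg + pos
class_weight_scaled = {0: (1.0 / neg) * (total / 2.0),
                       1: (1.0 / pos) * (total / 2.0)}
print(class_weight_scaled)  # per-sample weights now average to ~1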
# for imbalanced data, a bias_initializer on the output layer helps the model converge faster
bias_init = np.log([pos/neg])
print(bias_init)
[-6.3583392]
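This works because a sigmoid output unit with bias b0 = log(pos/neg) and zero weights predicts p = sigmoid(b0) = pos/(pos+neg), i.e. the base rate of the positive class. A quick sanity check:

# sigmoid(log(pos/neg)) equals the positive-class base rate
p0 = 1.0 / (1.0 + np.exp(-bias_init))
print(p0, pos / (pos + neg))  # both are roughly 0.0017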
show_methods(tf.keras.metrics)
0 | 1 | 2 | |
---|---|---|---|
0 | AUC | MeanSquaredLogarithmicError | deserialize |
1 | Accuracy | MeanTensor | get |
2 | BinaryAccuracy | Metric | hinge |
3 | BinaryCrossentropy | Poisson | kl_divergence |
4 | CategoricalAccuracy | Precision | kld |
5 | CategoricalCrossentropy | PrecisionAtRecall | kullback_leibler_divergence |
6 | CategoricalHinge | Recall | log_cosh |
7 | CosineSimilarity | RecallAtPrecision | logcosh |
8 | FalseNegatives | RootMeanSquaredError | mae |
9 | FalsePositives | SensitivityAtSpecificity | mape |
10 | Hinge | SparseCategoricalAccuracy | mean_absolute_error |
11 | KLD | SparseCategoricalCrossentropy | mean_absolute_percentage_error |
12 | KLDivergence | SparseTopKCategoricalAccuracy | mean_squared_error |
13 | LogCoshError | SpecificityAtSensitivity | mean_squared_logarithmic_error |
14 | MAE | SquaredHinge | mse |
15 | MAPE | Sum | msle |
16 | MSE | TopKCategoricalAccuracy | poisson |
17 | MSLE | TrueNegatives | serialize |
18 | Mean | TruePositives | sparse_categorical_accuracy |
19 | MeanAbsoluteError | binary_accuracy | sparse_categorical_crossentropy |
20 | MeanAbsolutePercentageError | binary_crossentropy | sparse_top_k_categorical_accuracy |
21 | MeanIoU | categorical_accuracy | squared_hinge |
22 | MeanRelativeError | categorical_crossentropy | top_k_categorical_accuracy |
23 | MeanSquaredError |
show_methods(tf.keras.callbacks)
0 | 1 | 2 | |
---|---|---|---|
0 | BaseLogger | History | ReduceLROnPlateau |
1 | CSVLogger | LambdaCallback | RemoteMonitor |
2 | Callback | LearningRateScheduler | TensorBoard |
3 | CallbackList | ModelCheckpoint | TerminateOnNaN |
4 | EarlyStopping | ProgbarLogger | experimental |
show_methods(tf.keras.optimizers)
0 | 1 | 2 | |
---|---|---|---|
0 | Adadelta | Nadam | deserialize |
1 | Adagrad | Optimizer | get |
2 | Adam | RMSprop | schedules |
3 | Adamax | SGD | serialize |
4 | Ftrl |
n_feats = Xtrain.shape[-1]
#============================================================
PARAMS_MODEL = {
    # layer 1
    'L1_units': 512,
    'L1_act': 'relu',
    'L1_dropout': 0,

    # layer 2
    'L2_units': 256,
    'L2_act': 'relu',
    'L2_dropout': 0.3,

    # layer 3
    'L3_units': 128,
    'L3_act': 'relu',
    'L3_dropout': 0.3,

    # optimizer
    'adam_lr': 1e-2,
}
#============================================================
METRICS = [
    tf.keras.metrics.TruePositives(name='tp'),
    tf.keras.metrics.FalsePositives(name='fp'),
    tf.keras.metrics.TrueNegatives(name='tn'),
    tf.keras.metrics.FalseNegatives(name='fn'),
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
    tf.keras.metrics.AUC(name='auc'),
]
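For heavily imbalanced data, the area under the precision-recall curve is often more informative than ROC AUC, and tf.keras.metrics.AUC also supports it via curve='PR'. An optional addition, not used in the runs below:

# optionally track PR-AUC during training as well
# METRICS.append(tf.keras.metrics.AUC(name='auprc', curve='PR'))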
#============================================================
PARAMS_FIT = {'epochs': 50,
              'batch_size': 2048,
              'patience': 10,
              'shuffle': True,
              'validation_split': 0.2,
              'class_weight': class_weight
              }
#============================================================
# callbacks
cb_early = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    verbose=1,
    patience=PARAMS_FIT['patience'],
    mode='max',
    restore_best_weights=True)
# cb_checkpt = tf.keras.callbacks.ModelCheckpoint("fraud_model_at_epoch_{epoch}.h5")
cb_lr = lrcurve.KerasLearningCurve()
callbacks = [cb_early, cb_lr]
layers = [i for i in list(PARAMS_MODEL.keys()) if i.endswith('_units')]
print(layers)
['L1_units', 'L2_units', 'L3_units']
For reference, the signature of tf.keras.Model.compile:

compile(
    optimizer='rmsprop', loss=None, metrics=None, loss_weights=None,
    weighted_metrics=None, run_eagerly=None, **kwargs
)
def get_model(metrics=METRICS,
              bias_init=None,
              n_feats=n_feats,
              params=PARAMS_MODEL):
    # use initial bias for imbalanced data
    if bias_init is not None:
        bias_init = tf.keras.initializers.Constant(bias_init)
    # number of layers
    n_layers = len([i for i in list(params.keys()) if i.endswith('_units')])
    #===================================================== layers
    model = tf.keras.Sequential(name='Sequential')
    # layer 1
    model.add(tf.keras.layers.Dense(
        params['L1_units'],
        activation=params['L1_act'],
        input_shape=(n_feats,),
        name='Layer_1'
    ))
    model.add(tf.keras.layers.Dropout(params['L1_dropout'], name='Dropout_1'))
    # middle layers
    for i in range(2, n_layers+1):  # 2, 3, etc.
        model.add(tf.keras.layers.Dense(
            params[f'L{i}_units'],
            activation=params[f'L{i}_act'],
            name=f'Layer_{i}'))
        model.add(tf.keras.layers.Dropout(
            params[f'L{i}_dropout'],
            name=f"Dropout_{i}"))
    # the last layer is Dense(1) with sigmoid activation
    # only the last layer gets bias_init; the other layers keep zero bias.
    model.add(tf.keras.layers.Dense(
        1,
        activation='sigmoid',
        bias_initializer=bias_init,
        name=f'Layer_{n_layers+1}'
    ))
    #=================================================== compile
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=params['adam_lr']),
        loss='binary_crossentropy',
        metrics=metrics)
    return model
# note: dropout layers have no trainable parameters
model = get_model(bias_init=bias_init)
model.summary()
Model: "Sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= Layer_1 (Dense) (None, 512) 15872 _________________________________________________________________ Dropout_1 (Dropout) (None, 512) 0 _________________________________________________________________ Layer_2 (Dense) (None, 256) 131328 _________________________________________________________________ Dropout_2 (Dropout) (None, 256) 0 _________________________________________________________________ Layer_3 (Dense) (None, 128) 32896 _________________________________________________________________ Dropout_3 (Dropout) (None, 128) 0 _________________________________________________________________ Layer_4 (Dense) (None, 1) 129 ================================================================= Total params: 180,225 Trainable params: 180,225 Non-trainable params: 0 _________________________________________________________________
from tensorflow.keras.utils import plot_model

if not os.path.isdir('images'):
    os.makedirs('images')

# save a diagram of the keras sequential model
plot_model(model, 'images/keras_model_simple.png')
# the same plot_model call also works for functional models
For reference, the signature of tf.keras.Model.fit:

fit(
    x=None, y=None, batch_size=None, epochs=1, verbose=1, callbacks=None,
    validation_split=0.0, validation_data=None, shuffle=True, class_weight=None,
    sample_weight=None, initial_epoch=0, steps_per_epoch=None,
    validation_steps=None, validation_batch_size=None, validation_freq=1,
    max_queue_size=10, workers=1, use_multiprocessing=False
)
set_random_seed(SEED)
#========================================================= fit
model.fit(
    Xtrain,
    ytrain,
    batch_size=PARAMS_FIT['batch_size'],
    epochs=PARAMS_FIT['epochs'],
    verbose=0,
    callbacks=[cb_lr, cb_early],
    validation_split=PARAMS_FIT['validation_split'],
    class_weight=PARAMS_FIT['class_weight']
)
Restoring model weights from the end of the best epoch.
Epoch 00023: early stopping
# note: this rebinds skmetrics (earlier scikitplot.metrics) to sklearn.metrics
from sklearn import metrics as skmetrics
yprobs = model.predict(Xtest)
yprobs[:5]
array([[0.00287491], [0.00332364], [0.00261128], [0.00344023], [0.00202924]], dtype=float32)
yprobs1d = yprobs.flatten()
assert len(ytest) == len(yprobs1d)
ypreds = (yprobs1d>0.5).astype(np.int8)
ypreds[:5]
array([0, 0, 0, 0, 0], dtype=int8)
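The 0.5 cutoff is arbitrary for such imbalanced data. A hypothetical sketch of sweeping the threshold via the precision-recall curve (illustration only; a real threshold should be tuned on a validation split, not the test set):

prec, rec, thr = skmetrics.precision_recall_curve(ytest, yprobs1d)
f1_scores = 2 * prec * rec / (prec + rec + 1e-12)
best_thr = thr[np.argmax(f1_scores[:-1])]  # threshold maximizing F1
print(best_thr)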
desc = "3layer, 2dropouts"
yprobs = model.predict(Xtest)
yprobs1d = yprobs.flatten()
assert len(ytest) == len(yprobs1d)
ypreds = (yprobs1d>0.5).astype(np.int8)
cm = skmetrics.confusion_matrix(ytest,ypreds)
print(cm)
df_eval = model_evaluation("keras", desc, ytest, yprobs1d,df_eval=None)
[[56194   670]
 [   12    86]]
Model | Description | Accuracy | Precision | Recall | F1 | AUC | AUCPR | |
---|---|---|---|---|---|---|---|---|
0 | keras | 3layer, 2dropouts | 0.988027 | 0.113757 | 0.877551 | 0.201405 | 0.971723 | 0.772557 |
time_taken = time.time() - time_start_notebook
h, m = divmod(time_taken, 60*60)
print('Time taken to run whole notebook: {:.0f} hr '
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m, 60)))
Time taken to run whole notebook: 0 hr 1 min 48 secs