Modelling a binary classification problem (benign vs. malignant diagnosis) using Keras and TensorFlow.
%%capture
# %%capture suppresses this cell's output in the notebook
import os
import sys

ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install scikit-plot
    !pip install lrcurve
    !pip install watermark
    ## print
    print('Environment: Google Colaboratory.')
if ENV_COLAB:
    !nvidia-smi
import time
time_start_notebook = time.time()
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
from pprint import pprint
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# random state
SEED = 100
np.random.seed(SEED)  # reseed in each cell that draws random numbers, for reproducibility
# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 50)
import scipy
from scipy import stats
# scale and split
import sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
# deep learning
import tensorflow as tf
import keras
from keras import backend as K
from keras.models import Sequential
from keras.layers import Activation
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy
# model evaluation
import scikitplot
from scikitplot import metrics as skmetrics
import lrcurve
from lrcurve import KerasLearningCurve
# versions
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark
Bhishan Poudel 2021-02-11

CPython 3.7.7
IPython 7.19.0

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

scipy      1.4.1
keras      2.4.3
pandas     1.1.1
matplotlib 3.2.1
sklearn    0.23.2
tensorflow 2.3.1
seaborn    0.10.1
numpy      1.18.5
scikitplot 0.3.7
def show_methods(obj, ncols=4, contains=None):
    lst = [i for i in dir(obj) if i[0] != '_']
    if contains is not None:
        lst = [i for i in lst if contains in i]
    df = pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
    return df
def set_random_seed(seed):
    import os
    import random
    import numpy as np
    import tensorflow as tf
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
def model_evaluation(model_name, desc, ytest, ypreds, df_eval=None,
                     show=True, sort_col='Recall'):
    if df_eval is None:
        df_eval = pd.DataFrame({'Model': [],
                                'Description': [],
                                'Accuracy': [],
                                'Precision': [],
                                'Recall': [],
                                'F1': [],
                                'AUC': [],
                                })
    # model evaluation
    average = 'binary'
    row_eval = [model_name, desc,
                sklearn.metrics.accuracy_score(ytest, ypreds),
                sklearn.metrics.precision_score(ytest, ypreds, average=average),
                sklearn.metrics.recall_score(ytest, ypreds, average=average),
                sklearn.metrics.f1_score(ytest, ypreds, average=average),
                sklearn.metrics.roc_auc_score(ytest, ypreds),
                ]
    df_eval.loc[len(df_eval)] = row_eval
    df_eval = df_eval.drop_duplicates()
    df_eval = df_eval.sort_values(sort_col)
    if show:
        display(df_eval.style.background_gradient(subset=[sort_col]))
    return df_eval
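As a sanity check, a toy call with made-up labels (hypothetical values, not from the dataset) shows how each call appends one evaluated row:

# Toy usage with hypothetical labels: appends one row, sorts by Recall.
df_demo = model_evaluation('dummy', 'all-ones baseline',
                           ytest=[0, 1, 1, 0], ypreds=[1, 1, 1, 1],
                           show=False)
print(df_demo)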
df_train = pd.read_csv('../data/raw/train.csv')
df_test = pd.read_csv('../data/raw/test.csv')
print(df_train.shape)
df_train.head()
(455, 33)
 | id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 905501 | B | 12.27 | 17.92 | 78.41 | 466.1 | 0.08685 | 0.06526 | 0.03211 | 0.02653 | 0.1966 | 0.05597 | 0.3342 | 1.7810 | 2.079 | 25.79 | 0.005888 | 0.02310 | 0.02059 | 0.010750 | 0.02578 | 0.002267 | 14.10 | 28.88 | 89.00 | 610.2 | 0.1240 | 0.1795 | 0.1377 | 0.09532 | 0.3455 | 0.06896 | NaN |
1 | 926954 | M | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | 0.1590 | 0.05648 | 0.4564 | 1.0750 | 3.425 | 48.55 | 0.005903 | 0.03731 | 0.04730 | 0.015570 | 0.01318 | 0.003892 | 18.98 | 34.12 | 126.70 | 1124.0 | 0.1139 | 0.3094 | 0.3403 | 0.14180 | 0.2218 | 0.07820 | NaN |
2 | 861103 | B | 11.45 | 20.97 | 73.81 | 401.5 | 0.11020 | 0.09362 | 0.04591 | 0.02233 | 0.1842 | 0.07005 | 0.3251 | 2.1740 | 2.077 | 24.62 | 0.010370 | 0.01706 | 0.02586 | 0.007506 | 0.01816 | 0.003976 | 13.11 | 32.16 | 84.53 | 525.1 | 0.1557 | 0.1676 | 0.1755 | 0.06127 | 0.2762 | 0.08851 | NaN |
3 | 86973702 | B | 14.44 | 15.18 | 93.97 | 640.1 | 0.09970 | 0.10210 | 0.08487 | 0.05532 | 0.1724 | 0.06081 | 0.2406 | 0.7394 | 2.120 | 21.20 | 0.005706 | 0.02297 | 0.03114 | 0.014930 | 0.01454 | 0.002528 | 15.85 | 19.85 | 108.60 | 766.9 | 0.1316 | 0.2735 | 0.3103 | 0.15990 | 0.2691 | 0.07683 | NaN |
4 | 8810703 | M | 28.11 | 18.47 | 188.50 | 2499.0 | 0.11420 | 0.15160 | 0.32010 | 0.15950 | 0.1648 | 0.05525 | 2.8730 | 1.4760 | 21.980 | 525.60 | 0.013450 | 0.02772 | 0.06389 | 0.014070 | 0.04783 | 0.004476 | 28.11 | 18.47 | 188.50 | 2499.0 | 0.1142 | 0.1516 | 0.3201 | 0.15950 | 0.1648 | 0.05525 | NaN |
target = 'diagnosis'
display(df_train[target].value_counts())
sns.countplot(x=df_train[target])
B    285
M    170
Name: diagnosis, dtype: int64
[Figure: count plot of the diagnosis classes B and M]
cols_drop = ['id', 'Unnamed: 32']
df_train = df_train.drop(cols_drop, axis=1)
df_test = df_test.drop(cols_drop, axis=1)
df_train['diagnosis'] = df_train['diagnosis'].map({'B': 0, 'M': 1})
df_test['diagnosis'] = df_test['diagnosis'].map({'B': 0, 'M': 1})
Xtrain = df_train.drop(target,axis=1).values
ytrain = df_train[target].values.astype(np.int8)
Xtest = df_test.drop(target,axis=1).values
ytest = df_test[target].values.astype(np.int8)
ytrain[:5]
array([0, 1, 0, 0, 1], dtype=int8)
# standardize features using training-set statistics only (no test-set leakage)
mean = np.mean(Xtrain, axis=0)
std = np.std(Xtrain, axis=0)
Xtrain -= mean
Xtest -= mean
Xtrain /= std
Xtest /= std
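The manual scaling above is equivalent to the already-imported `StandardScaler`; a minimal sketch as an alternative (fit on the training split only, then reuse those statistics for the test split):

# Equivalent sklearn scaling (sketch): statistics come from Xtrain only.
scaler = StandardScaler()
Xtrain_scaled = scaler.fit_transform(Xtrain)  # learns per-feature mean/std
Xtest_scaled = scaler.transform(Xtest)        # applies the training statistics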
# for imbalanced data we can use class_weight
neg,pos = np.bincount(ytrain)
weight_for_0 = 1.0 / neg
weight_for_1 = 1.0 / pos
class_weight = {0: weight_for_0, 1: weight_for_1}
print(class_weight)
{0: 0.0035087719298245615, 1: 0.0058823529411764705}
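A common variant (e.g. in TensorFlow's imbalanced-data tutorial) rescales these weights by `total / 2` so the weighted loss stays on a scale comparable to unweighted training; the relative class weighting is unchanged:

# Scaled class weights (optional variant): same ratio, friendlier loss magnitude.
total = neg + pos
class_weight_scaled = {0: (1 / neg) * (total / 2.0),
                       1: (1 / pos) * (total / 2.0)}
print(class_weight_scaled)  # ~{0: 0.798, 1: 1.338}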
# for imbalanced data, initializing the output-layer bias to log(pos/neg) helps the model converge faster
bias_init = np.log([pos/neg])
print(bias_init)
[-0.51669074]
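Why log(pos/neg): with that output bias, a sigmoid unit predicts the positive-class prevalence pos / (pos + neg) before any weights are learned, so early epochs are not spent just learning the class prior. A quick check:

# sigmoid(log(pos/neg)) equals the positive-class prevalence
p0 = 1 / (1 + np.exp(-bias_init))
print(p0, pos / (pos + neg))  # both ≈ 0.374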
show_methods(keras.metrics)
 | 0 | 1 | 2 | 3 |
---|---|---|---|---
0 | AUC | MSLE | SparseCategoricalAccuracy | kld |
1 | Accuracy | Mean | SparseCategoricalCrossentropy | kullback_leibler_divergence |
2 | BinaryAccuracy | MeanAbsoluteError | SparseTopKCategoricalAccuracy | mae |
3 | BinaryCrossentropy | MeanAbsolutePercentageError | SpecificityAtSensitivity | mape |
4 | CategoricalAccuracy | MeanIoU | SquaredHinge | mean_absolute_error |
5 | CategoricalCrossentropy | MeanRelativeError | Sum | mean_absolute_percentage_error |
6 | CategoricalHinge | MeanSquaredError | TopKCategoricalAccuracy | mean_squared_error |
7 | CosineSimilarity | MeanSquaredLogarithmicError | TrueNegatives | mean_squared_logarithmic_error |
8 | FalseNegatives | MeanTensor | TruePositives | mse |
9 | FalsePositives | Metric | binary_accuracy | msle |
10 | Hinge | Poisson | binary_crossentropy | poisson |
11 | KLD | Precision | categorical_accuracy | serialize |
12 | KLDivergence | PrecisionAtRecall | categorical_crossentropy | sparse_categorical_accuracy |
13 | LogCoshError | Recall | deserialize | sparse_categorical_crossentropy |
14 | MAE | RecallAtPrecision | get | sparse_top_k_categorical_accuracy |
15 | MAPE | RootMeanSquaredError | hinge | squared_hinge |
16 | MSE | SensitivityAtSpecificity | kl_divergence | top_k_categorical_accuracy |
show_methods(keras.callbacks)
 | 0 | 1 | 2 | 3 |
---|---|---|---|---
0 | BaseLogger | EarlyStopping | ModelCheckpoint | TensorBoard |
1 | CSVLogger | History | ProgbarLogger | TerminateOnNaN |
2 | Callback | LambdaCallback | ReduceLROnPlateau | experimental |
3 | CallbackList | LearningRateScheduler | RemoteMonitor |
show_methods(keras.optimizers)
 | 0 | 1 | 2 | 3 |
---|---|---|---|---
0 | Adadelta | Ftrl | RMSprop | get |
1 | Adagrad | Nadam | SGD | schedules |
2 | Adam | Optimizer | deserialize | serialize |
3 | Adamax |
n_feats = Xtrain.shape[-1]
#============================================================
PARAMS_MODEL = {
    # layer 1
    'L1_units': 512,
    'L1_act': 'relu',
    'L1_dropout': 0,

    # layer 2
    'L2_units': 256,
    'L2_act': 'relu',
    'L2_dropout': 0.3,

    # layer 3
    'L3_units': 128,
    'L3_act': 'relu',
    'L3_dropout': 0.3,

    # optimizer
    'adam_lr': 1e-2,
    }
#============================================================
METRICS = [
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
    ]
#============================================================
PARAMS_FIT = {'epochs': 50,
              'batch_size': 2048,
              'patience': 10,
              'shuffle': True,
              'validation_split': 0.2,
              'class_weight': class_weight
              }
#============================================================
# callbacks
cb_early = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    verbose=1,
    patience=PARAMS_FIT['patience'],
    mode='max',
    restore_best_weights=True)
#cb_checkpt = keras.callbacks.ModelCheckpoint("fraud_model_at_epoch_{epoch}.h5")
cb_lr = lrcurve.KerasLearningCurve()
callbacks = [cb_early, cb_lr]
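If persisting the best model to disk is also wanted (the `ModelCheckpoint` line above is commented out), a minimal sketch monitoring the same validation AUC; the output path is hypothetical:

# Optional checkpoint callback (sketch): keep only the best model by val_auc.
cb_checkpt = tf.keras.callbacks.ModelCheckpoint(
    'best_model.h5',      # hypothetical path
    monitor='val_auc',
    mode='max',
    save_best_only=True)
# callbacks = [cb_early, cb_lr, cb_checkpt]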
layers = [i for i in list(PARAMS_MODEL.keys()) if i.endswith('_units')]
print(layers)
['L1_units', 'L2_units', 'L3_units']
For reference, the signature of keras `Model.compile`:

compile(
    optimizer='rmsprop', loss=None, metrics=None, loss_weights=None,
    weighted_metrics=None, run_eagerly=None, **kwargs
)
def get_model(metrics=METRICS,
              bias_init=None,
              n_feats=n_feats,
              params=PARAMS_MODEL):
    # use initial bias for imbalanced data
    if bias_init is not None:
        bias_init = tf.keras.initializers.Constant(bias_init)

    # num of layers
    n_layers = len([i for i in list(params.keys()) if i.endswith('_units')])

    #===================================================== layers
    model = keras.Sequential(name='Sequential')

    # layer 1
    model.add(keras.layers.Dense(
        params['L1_units'],
        activation=params['L1_act'],
        input_shape=(n_feats,),
        name='Layer_1'
        ))
    model.add(keras.layers.Dropout(params['L1_dropout'], name='Dropout_1'))

    # middle layers
    for i in range(2, n_layers+1):  # 2, 3, etc.
        model.add(keras.layers.Dense(
            params[f'L{i}_units'],
            activation=params[f'L{i}_act'],
            name=f'Layer_{i}'))
        model.add(keras.layers.Dropout(
            params[f'L{i}_dropout'],
            name=f"Dropout_{i}"))

    # last layer is dense 1 with activation sigmoid
    model.add(keras.layers.Dense(
        1,
        activation='sigmoid',
        bias_initializer=bias_init,
        name=f'Layer_{n_layers+1}'
        ))

    #=================================================== compile
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=params['adam_lr']),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics)
    return model
# note: Dropout layers have no trainable parameters
model = get_model(bias_init=bias_init)
model.summary()
Model: "Sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= Layer_1 (Dense) (None, 512) 15872 _________________________________________________________________ Dropout_1 (Dropout) (None, 512) 0 _________________________________________________________________ Layer_2 (Dense) (None, 256) 131328 _________________________________________________________________ Dropout_2 (Dropout) (None, 256) 0 _________________________________________________________________ Layer_3 (Dense) (None, 128) 32896 _________________________________________________________________ Dropout_3 (Dropout) (None, 128) 0 _________________________________________________________________ Layer_4 (Dense) (None, 1) 129 ================================================================= Total params: 180,225 Trainable params: 180,225 Non-trainable params: 0 _________________________________________________________________
Likewise, the signature of `Model.fit`:

fit(
    x=None, y=None, batch_size=None, epochs=1, verbose=1, callbacks=None,
    validation_split=0.0, validation_data=None, shuffle=True, class_weight=None,
    sample_weight=None, initial_epoch=0, steps_per_epoch=None,
    validation_steps=None, validation_batch_size=None, validation_freq=1,
    max_queue_size=10, workers=1, use_multiprocessing=False
)
set_random_seed(SEED)
#========================================================= fit
model.fit(
    Xtrain,
    ytrain,
    batch_size=PARAMS_FIT['batch_size'],
    epochs=PARAMS_FIT['epochs'],
    verbose=0,
    callbacks=[cb_lr, cb_early],
    validation_split=PARAMS_FIT['validation_split'],
    class_weight=PARAMS_FIT['class_weight']
    )
Restoring model weights from the end of the best epoch.
Epoch 00017: early stopping
<tensorflow.python.keras.callbacks.History at 0x7f9fd047b190>
from sklearn import metrics as skmetrics  # note: rebinds skmetrics (the earlier scikitplot alias) to sklearn.metrics
yprobs = model.predict(Xtest)
yprobs[:5]
array([[9.8101878e-01],
       [1.6468227e-02],
       [4.3413341e-03],
       [4.4047832e-04],
       [8.7010011e-07]], dtype=float32)
ypreds = (yprobs.flatten()>0.5).astype(np.int8)
ypreds[:5]
array([1, 0, 0, 0, 0], dtype=int8)
desc = "3layer, 2dropouts"
ypreds = (yprobs.flatten() > 0.5).astype(np.int8)
cm = skmetrics.confusion_matrix(ytest,ypreds)
print(cm)
df_eval = model_evaluation("keras", desc, ytest, ypreds,df_eval=None)
[[68  4]
 [ 0 42]]
 | Model | Description | Accuracy | Precision | Recall | F1 | AUC |
---|---|---|---|---|---|---|---
0 | keras | 3layer, 2dropouts | 0.964912 | 0.913043 | 1.000000 | 0.954545 | 0.972222 |
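Recall of 1.0 agrees with the confusion matrix above (zero false negatives, four false positives). The 0.5 cutoff used for `ypreds` is just a default; a sketch of choosing a threshold that maximizes F1 from the predicted probabilities (in practice tune this on a validation split, not the test set):

# Threshold-tuning sketch: sweep cutoffs along the precision-recall curve.
from sklearn.metrics import precision_recall_curve
prec, rec, thr = precision_recall_curve(ytest, yprobs.flatten())
f1_scores = 2 * prec * rec / (prec + rec + 1e-12)
best_thr = thr[np.argmax(f1_scores[:-1])]  # last prec/rec pair has no threshold
print(best_thr)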
import scikitplot.metrics as skpmetrics
skpmetrics.plot_confusion_matrix(ytest,ypreds)
[Figure: confusion matrix heatmap from scikit-plot]
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 1 min 19 secs