Modelling a classification problem using Keras and TensorFlow.
%%capture
# %%capture: suppress printed output of this cell in the notebook
import os
import sys

ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install scikit-plot
    !pip install lrcurve
    !pip install watermark

    ## print
    print('Environment: Google Colaboratory.')
if ENV_COLAB:
    !nvidia-smi
import time
time_start_notebook = time.time()
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
from pprint import pprint
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import os
import sys
# random state
SEED=100
np.random.seed(SEED) # we need this in each cell
# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 50)
import scipy
from scipy import stats
# scale and split
import sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.pipeline import Pipeline
# deep learning
import tensorflow as tf
import keras
from keras import backend as K
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
# model evaluation
import scikitplot
from scikitplot import metrics as skmetrics
import lrcurve
from lrcurve import KerasLearningCurve
# versions
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2021-02-11

CPython 3.7.7
IPython 7.19.0

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

tensorflow 2.3.1
seaborn    0.10.1
matplotlib 3.2.1
scipy      1.4.1
sklearn    0.23.2
numpy      1.18.5
scikitplot 0.3.7
keras      2.4.3
pandas     1.1.1
def show_methods(obj, ncols=4, contains=None):
    """Show public attributes/methods of an object as a dataframe with `ncols` columns."""
    lst = [i for i in dir(obj) if i[0] != '_']
    if contains is not None:
        lst = [i for i in lst if contains in i]
    df = pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
    return df
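The optional contains argument filters the listing; a quick usage sketch of the helper defined above (here filtering keras.metrics for precision-related names):
show_methods(keras.metrics, ncols=2, contains='Precision')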
def set_random_seed(seed):
    """Seed Python, NumPy and TensorFlow RNGs for reproducibility."""
    import os
    import random
    import numpy as np
    import tensorflow as tf

    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
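The seeding helper above is defined but not called again in this notebook; for a fully reproducible run it could be invoked once before building the model, e.g.:
set_random_seed(SEED)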
def model_evaluation(model_name, desc, ytest, ypreds, df_eval=None,
                     show=True, sort_col='Recall'):
    """Append accuracy, precision, recall, F1 and AUC for one model to df_eval."""
    if df_eval is None:
        df_eval = pd.DataFrame({'Model': [],
                                'Description': [],
                                'Accuracy': [],
                                'Precision': [],
                                'Recall': [],
                                'F1': [],
                                'AUC': [],
                                })

    # model evaluation
    average = 'binary'
    row_eval = [model_name, desc,
                sklearn.metrics.accuracy_score(ytest, ypreds),
                sklearn.metrics.precision_score(ytest, ypreds, average=average),
                sklearn.metrics.recall_score(ytest, ypreds, average=average),
                sklearn.metrics.f1_score(ytest, ypreds, average=average),
                sklearn.metrics.roc_auc_score(ytest, ypreds),
                ]

    df_eval.loc[len(df_eval)] = row_eval
    df_eval = df_eval.drop_duplicates()
    df_eval = df_eval.sort_values(sort_col)

    if show:
        display(df_eval.style.background_gradient(subset=[sort_col]))

    return df_eval
df_train = pd.read_csv('../data/raw/train.csv')
df_test = pd.read_csv('../data/raw/test.csv')
print(df_train.shape)
df_train.head()
(455, 33)
 | id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 905501 | B | 12.27 | 17.92 | 78.41 | 466.1 | 0.08685 | 0.06526 | 0.03211 | 0.02653 | 0.1966 | 0.05597 | 0.3342 | 1.7810 | 2.079 | 25.79 | 0.005888 | 0.02310 | 0.02059 | 0.010750 | 0.02578 | 0.002267 | 14.10 | 28.88 | 89.00 | 610.2 | 0.1240 | 0.1795 | 0.1377 | 0.09532 | 0.3455 | 0.06896 | NaN |
1 | 926954 | M | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | 0.1590 | 0.05648 | 0.4564 | 1.0750 | 3.425 | 48.55 | 0.005903 | 0.03731 | 0.04730 | 0.015570 | 0.01318 | 0.003892 | 18.98 | 34.12 | 126.70 | 1124.0 | 0.1139 | 0.3094 | 0.3403 | 0.14180 | 0.2218 | 0.07820 | NaN |
2 | 861103 | B | 11.45 | 20.97 | 73.81 | 401.5 | 0.11020 | 0.09362 | 0.04591 | 0.02233 | 0.1842 | 0.07005 | 0.3251 | 2.1740 | 2.077 | 24.62 | 0.010370 | 0.01706 | 0.02586 | 0.007506 | 0.01816 | 0.003976 | 13.11 | 32.16 | 84.53 | 525.1 | 0.1557 | 0.1676 | 0.1755 | 0.06127 | 0.2762 | 0.08851 | NaN |
3 | 86973702 | B | 14.44 | 15.18 | 93.97 | 640.1 | 0.09970 | 0.10210 | 0.08487 | 0.05532 | 0.1724 | 0.06081 | 0.2406 | 0.7394 | 2.120 | 21.20 | 0.005706 | 0.02297 | 0.03114 | 0.014930 | 0.01454 | 0.002528 | 15.85 | 19.85 | 108.60 | 766.9 | 0.1316 | 0.2735 | 0.3103 | 0.15990 | 0.2691 | 0.07683 | NaN |
4 | 8810703 | M | 28.11 | 18.47 | 188.50 | 2499.0 | 0.11420 | 0.15160 | 0.32010 | 0.15950 | 0.1648 | 0.05525 | 2.8730 | 1.4760 | 21.980 | 525.60 | 0.013450 | 0.02772 | 0.06389 | 0.014070 | 0.04783 | 0.004476 | 28.11 | 18.47 | 188.50 | 2499.0 | 0.1142 | 0.1516 | 0.3201 | 0.15950 | 0.1648 | 0.05525 | NaN |
target = 'diagnosis'
display(df_train[target].value_counts())
sns.countplot(x=df_train[target])
B    285
M    170
Name: diagnosis, dtype: int64
[countplot of the diagnosis classes B and M]
cols_drop = ['id','Unnamed: 32' ]
df_train = df_train.drop(cols_drop, axis=1)
df_test = df_test.drop(cols_drop, axis=1)
df_train['diagnosis'] = df_train['diagnosis'].map({'B': 0, 'M': 1})
df_test['diagnosis'] = df_test['diagnosis'].map({'B': 0, 'M': 1})
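Equivalently, the LabelEncoder imported earlier could produce this encoding; a minimal alternative sketch (the explicit map is kept above because it states B -> 0, M -> 1 directly, which LabelEncoder's sorted-label ordering happens to match):
# alternative to the explicit map: fit on train labels, reuse on test
le = LabelEncoder()
df_train['diagnosis'] = le.fit_transform(df_train['diagnosis'])
df_test['diagnosis'] = le.transform(df_test['diagnosis'])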
Xtrain = df_train.drop(target,axis=1).values
ytrain = df_train[target].values.astype(np.int8)
Xtest = df_test.drop(target,axis=1).values
ytest = df_test[target].values.astype(np.int8)
ytrain[:5]
array([0, 1, 0, 0, 1], dtype=int8)
df_train.head(2)
 | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 12.27 | 17.92 | 78.41 | 466.1 | 0.08685 | 0.06526 | 0.03211 | 0.02653 | 0.1966 | 0.05597 | 0.3342 | 1.781 | 2.079 | 25.79 | 0.005888 | 0.02310 | 0.02059 | 0.01075 | 0.02578 | 0.002267 | 14.10 | 28.88 | 89.0 | 610.2 | 0.1240 | 0.1795 | 0.1377 | 0.09532 | 0.3455 | 0.06896 |
1 | 1 | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | 0.1590 | 0.05648 | 0.4564 | 1.075 | 3.425 | 48.55 | 0.005903 | 0.03731 | 0.04730 | 0.01557 | 0.01318 | 0.003892 | 18.98 | 34.12 | 126.7 | 1124.0 | 0.1139 | 0.3094 | 0.3403 | 0.14180 | 0.2218 | 0.07820 |
mean = np.mean(Xtrain, axis=0)
std = np.std(Xtrain, axis=0)
Xtrain -= mean
Xtest -= mean
Xtrain /= std
Xtest /= std
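The manual standardization above is equivalent to using the StandardScaler imported earlier, fit on the training split only and reused on the test split; a minimal sketch:
scaler = StandardScaler()
Xtrain = scaler.fit_transform(Xtrain)  # learns mean/std from the training data only
Xtest = scaler.transform(Xtest)        # applies the same train statistics to the test data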
show_methods(keras.metrics)
 | 0 | 1 | 2 | 3
---|---|---|---|---|
0 | AUC | MSLE | SparseCategoricalAccuracy | kld |
1 | Accuracy | Mean | SparseCategoricalCrossentropy | kullback_leibler_divergence |
2 | BinaryAccuracy | MeanAbsoluteError | SparseTopKCategoricalAccuracy | mae |
3 | BinaryCrossentropy | MeanAbsolutePercentageError | SpecificityAtSensitivity | mape |
4 | CategoricalAccuracy | MeanIoU | SquaredHinge | mean_absolute_error |
5 | CategoricalCrossentropy | MeanRelativeError | Sum | mean_absolute_percentage_error |
6 | CategoricalHinge | MeanSquaredError | TopKCategoricalAccuracy | mean_squared_error |
7 | CosineSimilarity | MeanSquaredLogarithmicError | TrueNegatives | mean_squared_logarithmic_error |
8 | FalseNegatives | MeanTensor | TruePositives | mse |
9 | FalsePositives | Metric | binary_accuracy | msle |
10 | Hinge | Poisson | binary_crossentropy | poisson |
11 | KLD | Precision | categorical_accuracy | serialize |
12 | KLDivergence | PrecisionAtRecall | categorical_crossentropy | sparse_categorical_accuracy |
13 | LogCoshError | Recall | deserialize | sparse_categorical_crossentropy |
14 | MAE | RecallAtPrecision | get | sparse_top_k_categorical_accuracy |
15 | MAPE | RootMeanSquaredError | hinge | squared_hinge |
16 | MSE | SensitivityAtSpecificity | kl_divergence | top_k_categorical_accuracy |
show_methods(keras.callbacks)
 | 0 | 1 | 2 | 3
---|---|---|---|---|
0 | BaseLogger | EarlyStopping | ModelCheckpoint | TensorBoard |
1 | CSVLogger | History | ProgbarLogger | TerminateOnNaN |
2 | Callback | LambdaCallback | ReduceLROnPlateau | experimental |
3 | CallbackList | LearningRateScheduler | RemoteMonitor |
show_methods(keras.optimizers)
 | 0 | 1 | 2 | 3
---|---|---|---|---|
0 | Adadelta | Ftrl | RMSprop | get |
1 | Adagrad | Nadam | SGD | schedules |
2 | Adam | Optimizer | deserialize | serialize |
3 | Adamax |
np.random.seed(SEED)

# sequential model
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(units=8, activation='relu'))
model.add(tf.keras.layers.Dense(units=4, activation='relu'))
model.add(tf.keras.layers.Dense(units=2, activation='relu'))

# output layer
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# compile
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# fit
history = model.fit(Xtrain, ytrain, batch_size=16, epochs=100, verbose=0)
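The fit above tracks only the training loss and accuracy. If validation monitoring is wanted, the callbacks listed earlier can be attached; a hedged sketch, where the 0.2 split and patience of 10 are arbitrary choices and KerasLearningCurve is the live-plot callback imported from lrcurve:
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10,
                                              restore_best_weights=True)
history = model.fit(Xtrain, ytrain,
                    validation_split=0.2,   # arbitrary hold-out fraction
                    batch_size=16, epochs=100,
                    callbacks=[early_stop, KerasLearningCurve()],
                    verbose=0)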
THR = 0.9
yprobs = model.predict(Xtest).ravel()
ypreds = (yprobs > THR).astype(np.int8)
yprobs[:5], ypreds[:5]
(array([3.81493419e-01, 1.08437955e-01, 2.90197134e-03, 9.13023949e-04, 3.68003966e-05], dtype=float32), array([0, 0, 0, 0, 0], dtype=int8))
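The 0.9 cutoff is hand-picked. A common alternative is to sweep thresholds on held-out data (ideally a separate validation split, not the test set) and keep the one that maximizes F1; a sketch, where Xvalid and yvalid denote a hypothetical validation split:
# hypothetical validation split held out from the training data
yprobs_valid = model.predict(Xvalid).ravel()
prec, rec, thr = sklearn.metrics.precision_recall_curve(yvalid, yprobs_valid)
f1 = 2 * prec[:-1] * rec[:-1] / (prec[:-1] + rec[:-1] + 1e-12)
best_thr = thr[np.argmax(f1)]  # threshold giving the best F1 on the validation split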
from sklearn import metrics as skmetrics  # note: rebinds skmetrics (scikitplot.metrics above) to sklearn.metrics
desc = "8-4-2-1"
cm = skmetrics.confusion_matrix(ytest,ypreds)
print(cm)
df_eval = model_evaluation("keras", desc, ytest, ypreds,df_eval=None)
[[72  0]
 [ 3 39]]
 | Model | Description | Accuracy | Precision | Recall | F1 | AUC
---|---|---|---|---|---|---|---|
0 | keras | 8-4-2-1 | 0.973684 | 1.000000 | 0.928571 | 0.962963 | 0.964286 |
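The KerasClassifier, StratifiedKFold and cross_val_score imports are not used in this single train/test run; a hedged sketch of how they could give a stratified 5-fold accuracy estimate of the same 8-4-2-1 architecture (fold count and epochs are arbitrary, build_8_4_2_1 is a hypothetical helper, and for a strict estimate the scaling would be refit inside each fold, e.g. with the Pipeline imported above):
def build_8_4_2_1():
    m = tf.keras.models.Sequential([
        tf.keras.layers.Dense(8, activation='relu'),
        tf.keras.layers.Dense(4, activation='relu'),
        tf.keras.layers.Dense(2, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    m.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return m

clf = KerasClassifier(build_fn=build_8_4_2_1, epochs=100, batch_size=16, verbose=0)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
cv_scores = cross_val_score(clf, Xtrain, ytrain, cv=skf)
print('CV accuracy: {:.3f} +/- {:.3f}'.format(cv_scores.mean(), cv_scores.std()))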
import scikitplot.metrics as skpmetrics
skpmetrics.plot_confusion_matrix(ytest,ypreds)
[scikit-plot confusion matrix plot for the test predictions]
time_taken = time.time() - time_start_notebook
h, rem = divmod(time_taken, 60 * 60)
print('Time taken to run whole notebook: {:.0f} hr '
      '{:.0f} min {:.0f} secs'.format(h, *divmod(rem, 60)))
Time taken to run whole notebook: 0 hr 0 min 3 secs