import time
time_start_notebook = time.time()

import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import os
import time

# Reproducibility: a fixed seed plus a NumPy RandomState built from it.
SEED = 0
RNG = np.random.RandomState(SEED)

# pandas display limits applied when frames are rendered in the notebook.
# ('display.max_rows' may be set to None to show every row.)
for _option, _value in (
    ('display.max_columns', 200),
    ('display.max_rows', 100),
    ('display.max_colwidth', 50),
):
    pd.set_option(_option, _value)

# six and pickle
import six
import pickle
import joblib

# ml
import sklearn

# scale and split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# grid search
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

# pipeline
from sklearn.pipeline import make_pipeline

# cross validations

#------------------
# cross_val_score(lasso, X, y, cv=5,n_jobs=-1,scoring='r2')
# cross_val_score(clf,   X, y, cv=5,n_jobs=-1,scoring='recall')
from sklearn.model_selection import cross_val_score

#------------------
# cross_val_predict may differ from cross_validate and cross_val_score
# cross_val_predict can be used for plotting.
# ypreds = cross_val_predict(lasso, X, y, cv=5,n_jobs=-1,scoring='r2')
# ypreds = cross_val_predict(clf,   X, y, cv=5,n_jobs=-1,scoring='recall')
from sklearn.model_selection import cross_val_predict

#------------------
# cv_results = cross_validate(lasso, X, y, cv=5,n_jobs=-1,scoring='r2')
# print(cv_results['test_score'])
# NOTE: make_scorer is imported from the public `sklearn.metrics` namespace.
# The private `sklearn.metrics.scorer` module is deprecated and is no longer
# importable in newer scikit-learn releases.
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate

# sklearn scalar metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# multiple metrics
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_fscore_support

# roc auc and curves
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve

# confusion matrix and classification report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Bhishan Poudel 2021-08-10 

CPython 3.7.7
IPython 7.22.0

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

joblib     1.0.1
autopep8   1.5.2
matplotlib 3.2.1
six        1.15.0
numpy      1.19.5
sklearn    0.23.1
pandas     1.3.0
json       2.0.9
seaborn    0.11.0


# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan/")
from bhishan import bp


def get_profit(y_true, y_pred):
    """Return the profit implied by a set of binary predictions.

    Each true positive earns 400, each false negative costs 200 and each
    false positive costs 100; true negatives do not contribute.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels encoded as 0 (negative) and 1 (positive).
    y_pred : array-like
        Predicted labels using the same encoding.

    Returns
    -------
    profit
        ``400*tp - 200*fn - 100*fp``.
    """
    # Pin the label order so the matrix is always 2x2.  Without `labels`,
    # inputs in which only one class occurs give a 1x1 matrix and the
    # four-way unpacking below raises ValueError.
    tn, fp, fn, tp = sklearn.metrics.confusion_matrix(
        y_true, y_pred, labels=[0, 1]).ravel()
    profit = 400*tp - 200*fn - 100*fp
    return profit


# Scorer built from get_profit; larger profit is treated as better.
scoring = sklearn.metrics.make_scorer(get_profit, greater_is_better=True)


# Location of the zipped credit-card CSV; read it straight from the URL.
data_url = 'https://github.com/bhishanpdl/Datasets/blob/master/Projects/Fraud_detection/raw/creditcard.csv.zip?raw=true'
df = pd.read_csv(data_url, compression='zip')
print(df.shape)
df.head()

(284807, 31)


# Name of the label column used throughout the notebook.
target = 'Class'
# Absolute number of rows in each class.
df[target].value_counts()

0    284315
1       492
Name: Class, dtype: int64


# Class shares expressed per thousand rows (relative frequency * 1000).
df[target].value_counts(normalize=True)*1000

0    998.272514
1      1.727486
Name: Class, dtype: float64


# RobustScaler is less prone to outliers.
from sklearn.preprocessing import StandardScaler, RobustScaler

# One scaler per column: re-fitting a single instance on 'Time' would
# overwrite the statistics learned for 'Amount', so they could not be reused
# later (e.g. to transform new data or to invert the scaling).
amount_scaler = RobustScaler()
time_scaler = RobustScaler()

# fit_transform returns an (n, 1) array; ravel() turns it into a 1-D column.
df['scaled_amount'] = amount_scaler.fit_transform(
    df['Amount'].values.reshape(-1,1)).ravel()
df['scaled_time'] = time_scaler.fit_transform(
    df['Time'].values.reshape(-1,1)).ravel()

# Backward compatibility: `scaler` previously ended up fitted on 'Time'.
scaler = time_scaler


# Random undersampling (outliers are kept): draw the same number of rows
# from every class, namely the row count of the rarest class.
n = df[target].value_counts().min()

# Each class is sampled independently with the same seed.
_per_class = df.groupby(target).apply(
    lambda grp: grp.sample(n, random_state=SEED))
df_under = _per_class.reset_index(drop=True)

df_under[target].value_counts()

0    492
1    492
Name: Class, dtype: int64


df.shape, df_under.shape
# Out of about 284k samples, only 984 remain after undersampling:
# roughly 283k samples are discarded and about 1k are kept.
# This is a large loss of information, but I will still test the
# classifiers with this undersampling method.
#
# Later, I will use oversampling methods to do the modelling.

((284807, 33), (984, 33))


# Show the column labels of the full frame, including the two scaled columns.
df.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class', 'scaled_amount', 'scaled_time'],
      dtype='object')


# Hold out 20% of the undersampled rows as a test split.
# `drop(columns=...)` replaces the positional axis argument, which pandas
# flags with a FutureWarning.
(Xtrain_under, Xtest_under,
 ytrain_under, ytest_under) = \
    train_test_split(df_under.drop(columns=[target]),
                     df_under[target],
                     random_state=SEED,
                     test_size=0.2,
                     #stratify=df_under[target] # do not use stratify here.
                     )

print(df.shape, Xtrain_under.shape, Xtest_under.shape)

# Column labels must be taken from the split itself.  Building them with
# `df.columns.difference([target])` yields an alphabetically sorted list,
# while the data keeps the original column order, so every value would end
# up under the wrong name (e.g. 'Time' values labelled 'Amount').
columns = Xtrain_under.columns.tolist() + [target]

# Re-attach the label as the last column; assign() aligns on the row index,
# and reset_index gives each split a fresh 0..n-1 index.
df_train_under = (Xtrain_under
                  .assign(**{target: ytrain_under})
                  .reset_index(drop=True))

df_test_under = (Xtest_under
                 .assign(**{target: ytest_under})
                 .reset_index(drop=True))

print(df.shape, df_train_under.shape, df_test_under.shape)
df_train_under.head(2)

(284807, 33) (787, 32) (197, 32)
(284807, 33) (787, 33) (197, 33)

/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/ipykernel_launcher.py:3: FutureWarning:

In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only


# Class counts in the training split.
df_train_under[target].value_counts()

0.0    401
1.0    386
Name: Class, dtype: int64


# Class counts in the test split.
df_test_under[target].value_counts()

1.0    106
0.0     91
Name: Class, dtype: int64


# Print the total number of missing cells in each frame.
for x in (df, df_under, df_train_under, df_test_under):
    print(x.isna().sum().sum())

0
0
0
0


# Show the column labels of the undersampled frame.
df_under.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class', 'scaled_amount', 'scaled_time'],
      dtype='object')


# Show the column labels of the rebuilt training frame.
df_train_under.columns

Index(['Amount', 'Time', 'V1', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16',
       'V17', 'V18', 'V19', 'V2', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25',
       'V26', 'V27', 'V28', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
       'scaled_amount', 'scaled_time', 'Class'],
      dtype='object')


from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.calibration import CalibratedClassifierCV


# Model inputs: every column of the undersampled frame except the raw
# 'Amount' and 'Time' columns and the 'Class' label.
_excluded = ['Amount', 'Time', 'Class']
features_with_log = df_under.columns.difference(_excluded).tolist()

features = features_with_log
print(features)

['V1', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V2', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'scaled_amount', 'scaled_time']


# Plain numpy arrays for the estimators: 2-D feature matrices and 1-D labels.
Xtrain, Xtest = (frame[features].to_numpy()
                 for frame in (df_train_under, df_test_under))

ytrain, ytest = (frame[target].to_numpy().ravel()
                 for frame in (df_train_under, df_test_under))

Xtrain.shape, ytrain.shape, Xtest.shape,  ytest.shape

((787, 30), (787,), (197, 30), (197,))


# Base classifiers, seeded wherever the estimator accepts a random_state.
clf_lr = LogisticRegression(solver='liblinear',
                            max_iter=4000,
                            random_state=SEED,
                            n_jobs=1,) # for liblinear n_jobs is +1.

clf_svc = SVC(random_state=SEED,gamma='scale')
clf_knn = KNeighborsClassifier(n_jobs=-1)
clf_dtc = DecisionTreeClassifier(random_state=SEED)
clf_rfc = RandomForestClassifier(n_estimators=100,
                                random_state=SEED,n_jobs=-1)

# Sigmoid-calibrated counterparts (5-fold).  Each wrapper must receive its
# own base estimator; wrapping clf_svc everywhere makes all five
# "calibrated" models identical.
clf_lr_cali = CalibratedClassifierCV(clf_lr, method='sigmoid', cv=5)
clf_svc_cali = CalibratedClassifierCV(clf_svc, method='sigmoid', cv=5)
clf_knn_cali = CalibratedClassifierCV(clf_knn, method='sigmoid', cv=5)
clf_dtc_cali = CalibratedClassifierCV(clf_dtc, method='sigmoid', cv=5)
clf_rfc_cali = CalibratedClassifierCV(clf_rfc, method='sigmoid', cv=5)

# Display names: the five base models first, then their calibrated versions
# in the same order.
clf_names = ["Logisitic Regression","Support Vector Classifier",
            "KNN", "Decision Tree Classifier","Random Forest Classifier",
            ]

clf_names +=  ["Calibrated " + i for i in clf_names]
clf_names

['Logisitic Regression',
 'Support Vector Classifier',
 'KNN',
 'Decision Tree Classifier',
 'Random Forest Classifier',
 'Calibrated Logisitic Regression',
 'Calibrated Support Vector Classifier',
 'Calibrated KNN',
 'Calibrated Decision Tree Classifier',
 'Calibrated Random Forest Classifier']


# Feature matrix and label vector taken from the whole undersampled frame
# (not only the training split).
X = df_under[features_with_log].values
y = df_under[target].values.ravel()


from sklearn.model_selection import cross_val_score

# Same order as clf_names: base models first, then the calibrated ones.
classifiers = [clf_lr, clf_svc, clf_knn, clf_dtc, clf_rfc,
               clf_lr_cali, clf_svc_cali, clf_knn_cali, clf_dtc_cali, clf_rfc_cali]
recall_cross_val_scores = []

t0 = time.time()
for clf in classifiers:
    # cross_val_score fits a fresh copy of the estimator on every fold, so a
    # separate clf.fit(X, y) beforehand would only add run time.
    score = cross_val_score(clf, X, y, cv=5,n_jobs=-1,scoring='recall')
    recall_cross_val_scores.append(score.mean())

t1 = (time.time() - t0)

print('Time taken: {} minutes {:.2f} seconds'.format(*divmod(t1,60)))

# One row per classifier, best mean recall first, with a fresh 0..n-1 index.
df_recall_cross_val_scores = pd.DataFrame({'Classifier': clf_names,
                'Recall Cross Validation Score': recall_cross_val_scores})

df_recall_cross_val_scores = (
    df_recall_cross_val_scores
    .sort_values('Recall Cross Validation Score',ascending=False)
    .reset_index(drop=True)
)

# Styled table: the score column is shaded by value.
df_recall_cross_val_scores.style.background_gradient(
    subset=['Recall Cross Validation Score']).set_caption(
    'Recall Cross Validation Scores for Default Classifiers for Undersampled Data')

Time taken: 0.0 minutes 4.35 seconds


from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV


# Instantiate an SVC with default arguments and display it.
model = SVC()
model

SVC()


def uncalibrated(Xtrain, Xtest, ytrain):
    """Fit a seeded SVC and score the test rows without calibration.

    Parameters
    ----------
    Xtrain, ytrain
        Training features and labels passed to ``SVC.fit``.
    Xtest
        Rows to score.

    Returns
    -------
    The output of ``decision_function`` for ``Xtest`` (the SVC's decision
    values, not ``predict_proba`` output).
    """
    svc = SVC(random_state=SEED, gamma='scale')
    return svc.fit(Xtrain, ytrain).decision_function(Xtest)


def calibrated(Xtrain, Xtest, ytrain):
    """Fit a sigmoid-calibrated SVC and return class-1 probabilities.

    Parameters
    ----------
    Xtrain, ytrain
        Training features and labels used to fit the calibrated model.
    Xtest
        Rows to score.

    Returns
    -------
    Column 1 of ``predict_proba(Xtest)`` from the calibrated model.
    """
    base_svc = SVC(random_state=SEED, gamma='scale')

    # Wrap the SVC in a 5-fold sigmoid calibrator and fit the wrapper.
    calibrator = CalibratedClassifierCV(base_svc, method='sigmoid', cv=5)
    calibrator.fit(Xtrain, ytrain)

    probabilities = calibrator.predict_proba(Xtest)
    return probabilities[:, 1]


# Uncalibrated SVC scores (decision_function output) for the test split.
ypreds = uncalibrated(Xtrain, Xtest, ytrain)


# Calibrated class-1 probabilities for the test split.
ypreds_cal = calibrated(Xtrain, Xtest, ytrain)


# reliability diagrams
from sklearn.calibration import calibration_curve

# The uncalibrated scores come from decision_function and are not confined
# to [0, 1], so min-max scale them before binning.  This reproduces what
# `normalize=True` did; that argument has been removed from
# calibration_curve in newer scikit-learn releases.
score_min, score_max = ypreds.min(), ypreds.max()
ypreds_scaled = (ypreds - score_min) / (score_max - score_min)

# fop: fraction of positives per bin, mpv: mean predicted value per bin.
fop, mpv = calibration_curve(ytest,ypreds_scaled,n_bins=10)

fop_cal, mpv_cal = calibration_curve(ytest,ypreds_cal,n_bins=10)

# plot perfectly calibrated
plt.plot([0, 1], [0, 1], linestyle='--', color='black',
         label='Perfectly calibrated')

# plot model reliabilities
plt.plot(mpv, fop, marker='.', label='SVC (uncalibrated)')
plt.plot(mpv_cal, fop_cal, marker='.', label='SVC (sigmoid calibrated)')
plt.xlabel('Mean predicted value')
plt.ylabel('Fraction of positives')
plt.legend()
plt.show()


# Wall-clock time since the first cell, split into hours, minutes, seconds.
time_taken = time.time() - time_start_notebook
h, m = divmod(time_taken, 3600)  # m: seconds left over after whole hours
mins, secs = divmod(m, 60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, mins, secs))

Time taken to run whole notebook: 0 hr 0 min 13 secs


import subprocess
import sys

# Convert the notebooks in the working directory with nbconvert.
# sys.executable runs nbconvert under the same interpreter (and therefore the
# same environment) as this kernel; a bare 'python' is whatever comes first
# on PATH and may not have nbconvert installed.
# NOTE(review): no shell is involved, so '*.ipynb' is passed literally and
# nbconvert is expected to expand the pattern itself -- confirm.
# The exit status is the cell's value.
subprocess.call([sys.executable, '-m', 'nbconvert', '*.ipynb'])

1


# Shell: move every HTML file in this directory to ../html/.
!mv *.html ../html/

mv: rename *.html to ../html/: No such file or directory


# Shell: delete the catboost_info directory recursively.
# NOTE(review): nothing visible in this notebook uses CatBoost -- confirm
# this cleanup is still needed.
!rm -r catboost_info

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	V11	V12	V13	V14	V15	V16	V17	V18	V19	V20	V21	V22	V23	V24	V25	V26	V27	V28	Amount
0	0.0	-1.359807	-0.072781	2.536347	1.378155	-0.338321	0.462388	0.239599	0.098698	0.363787	0.090794	-0.551600	-0.617801	-0.991390	-0.311169	1.468177	-0.470401	0.207971	0.025791	0.403993	0.251412	-0.018307	0.277838	-0.110474	0.066928	0.128539	-0.189115	0.133558	-0.021053	149.62
1	0.0	1.191857	0.266151	0.166480	0.448154	0.060018	-0.082361	-0.078803	0.085102	-0.255425	-0.166974	1.612727	1.065235	0.489095	-0.143772	0.635558	0.463917	-0.114805	-0.183361	-0.145783	-0.069083	-0.225775	-0.638672	0.101288	-0.339846	0.167170	0.125895	-0.008983	0.014724	2.69
2	1.0	-1.358354	-1.340163	1.773209	0.379780	-0.503198	1.800499	0.791461	0.247676	-1.514654	0.207643	0.624501	0.066084	0.717293	-0.165946	2.345865	-2.890083	1.109969	-0.121359	-2.261857	0.524980	0.247998	0.771679	0.909412	-0.689281	-0.327642	-0.139097	-0.055353	-0.059752	378.66
3	1.0	-0.966272	-0.185226	1.792993	-0.863291	-0.010309	1.247203	0.237609	0.377436	-1.387024	-0.054952	-0.226487	0.178228	0.507757	-0.287924	-0.631418	-1.059647	-0.684093	1.965775	-1.232622	-0.208038	-0.108300	0.005274	-0.190321	-1.175575	0.647376	-0.221929	0.062723	0.061458	123.50
4	2.0	-1.158233	0.877737	1.548718	0.403034	-0.407193	0.095921	0.592941	-0.270533	0.817739	0.753074	-0.822843	0.538196	1.345852	-1.119670	0.175121	-0.451449	-0.237033	-0.038195	0.803487	0.408542	-0.009431	0.798278	-0.137458	0.141267	-0.206010	0.502292	0.219422	0.215153	69.99

	Amount	Time	V1	V10	V11	V12	V13	V14	V15	V16	V17	V18	V19	V2	V20	V21	V22	V23	V24	V25	V26	V27	V28	V3	V4	V5	V6	V7	V8	V9	scaled_amount	scaled_time	Class
0	29785.0	0.923764	0.344048	-2.880004	1.721680	-3.019565	-0.639736	-3.801325	1.299096	0.864065	-2.895252	3.028162	-2.549177	-1.560432	-2.971317	1.078895	-4.702012	-4.908099	-1.508873	3.001685	0.170872	0.899931	1.481271	0.725266	0.176960	-1.815638	-0.536517	0.489035	-0.049729	30.30	0.115978	-0.645062	1.0
1	146398.0	1.790673	-0.803475	-0.869963	0.154827	-0.292277	0.350580	-0.589416	0.183619	1.126239	0.034143	0.177329	0.439569	-0.480873	0.296339	0.473388	0.738279	-1.134799	1.134228	0.130572	0.024448	0.277790	0.621435	-0.002887	0.090027	-0.132452	-0.267803	-0.001150	-0.027962	118.37	1.346608	0.724938	0.0

Table of Contents

Imports¶

Useful Functions¶

Load the data¶

Preprocessing¶

Class Balance¶

Scaling¶

Random Under Sampling¶

Train Test split (without stratify) for the undersampled data¶

Check for nans before modelling¶

Modelling¶

Classifiers¶

Recall Scores from Cross Validation¶

SVC with Calibrated Probabilities¶

Run Time¶

	Classifier	Recall Cross Validation Score
0	Decision Tree Classifier	0.928798
1	Logisitic Regression	0.912575
2	Random Forest Classifier	0.902412
3	KNN	0.898330
4	Calibrated Logisitic Regression	0.892208
5	Calibrated Support Vector Classifier	0.892208
6	Calibrated KNN	0.892208
7	Calibrated Decision Tree Classifier	0.892208
8	Calibrated Random Forest Classifier	0.892208
9	Support Vector Classifier	0.869841