The dataset contains transactions made by credit cards in September 2013 by European cardholders.
It presents transactions that occurred over two days, with 492 frauds out of 284,807 transactions.
The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions.
It contains only numerical input variables, which are the result of a PCA transformation.
Unfortunately, due to confidentiality issues, we cannot provide the original features or more background information about the data.
Features V1, V2, ..., V28 are the principal components obtained with PCA; the only features which have not been transformed with PCA are 'Time' and 'Amount'.
Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. Feature 'Amount' is the transaction amount; this feature can be used for example-dependent cost-sensitive learning.
Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.
The term Boosting refers to a family of algorithms which convert weak learners into strong learners.
There are many boosting algorithms (a minimal fitting sketch follows this list):
sklearn.ensemble.GradientBoostingRegressor
xgboost.XGBRegressor # fast and typically very accurate
lightgbm.LGBMRegressor # extremely fast, slightly lower accuracy than xgboost
catboost.CatBoostRegressor # good for categorical features
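As a quick, hedged illustration of the shared scikit-learn-style interface (this sketch uses synthetic data and the classifier variant, as this notebook does later; it is not part of the original run):
# minimal sketch: all of the boosters above expose the same fit/predict API
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

X, y = make_classification(n_samples=1000, weights=[0.95], random_state=0)
Xtr, Xte, ytr, yte = train_test_split(X, y, stratify=y, random_state=0)
clf = XGBClassifier(n_estimators=100, random_state=0)
clf.fit(Xtr, ytr)
print(clf.predict_proba(Xte)[:5, 1])   # probability of the positive class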
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    #!pip install hpsklearn
    !pip install shap eli5 lime scikit-plot watermark
    !pip install optuna hyperopt
    !pip install catboost
    !pip install ipywidgets
    !pip install -U scikit-learn
    !jupyter nbextension enable --py widgetsnbextension
    # create project-like folders
    !mkdir -p ../outputs ../images ../reports ../html ../models
    print('Environment: Google Colab')
import time
notebook_start_time = time.time()
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
# random state
SEED = 0
RNG = np.random.RandomState(SEED)
# visualization
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 8,8
plt.rcParams.update({'font.size': 16})
plt.style.use('ggplot')
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)
# six and pickle
import six
import pickle
import joblib
# mixed
import copy
import pprint
pp = pprint.PrettyPrinter(indent=4)
# sklearn
import sklearn
# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# scale and split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
# sklearn scalar metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
# roc auc and curves
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
# confusion matrix and classification report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# boosting
import xgboost, lightgbm, catboost
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBClassifier, DMatrix
from lightgbm import LGBMClassifier, Dataset
from catboost import CatBoostClassifier, Pool, CatBoost
# parameters tuning
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK, STATUS_FAIL
from hyperopt.pyll import scope
from hyperopt.pyll.stochastic import sample
# model interpretation modules
import eli5
import shap
import yellowbrick
import lime
import scikitplot
# version
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2021-08-09
CPython 3.7.7
IPython 7.22.0
compiler    : Clang 4.0.1 (tags/RELEASE_401/final)
system      : Darwin
release     : 19.6.0
machine     : x86_64
processor   : i386
CPU cores   : 4
interpreter : 64bit
six 1.15.0
sklearn 0.23.1
shap 0.39.0
yellowbrick 1.1
joblib 1.0.1
eli5 0.10.1
lightgbm 2.3.1
catboost 0.23.2
scikitplot 0.3.7
xgboost 1.2.0
numpy 1.19.5
seaborn 0.11.0
autopep8 1.5.2
json 2.0.9
pandas 1.3.0
# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan")
from bhishan import bp
def get_profit(y_true, y_pred):
    tn, fp, fn, tp = sklearn.metrics.confusion_matrix(y_true, y_pred).ravel()
    profit = 400*tp - 200*fn - 100*fp
    return profit
# scoring = sklearn.metrics.make_scorer(get_profit, greater_is_better=True)
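As a hedged sketch of how that commented-out scorer could be used (not part of the original run; the model choice and fold count are illustrative, and it assumes the df_Xtrain/ser_ytrain split created further below):
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

profit_scorer = make_scorer(get_profit, greater_is_better=True)
# any classifier with predict() works; LogisticRegression is just an example
clf = LogisticRegression(max_iter=1000)
cv_profits = cross_val_score(clf, df_Xtrain, ser_ytrain, scoring=profit_scorer, cv=3)
print(cv_profits)   # per-fold profit in dollars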
#=========== for catboost
class ProfitMetric:
    @staticmethod
    def get_profit(y_true, y_pred):
        from scipy.special import expit
        # raw scores -> probabilities -> hard labels at the 0.5 threshold
        y_pred = (expit(y_pred) > 0.5).astype(int)
        y_true = y_true.astype(int)
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        profit = 400*tp - 200*fn - 100*fp
        return profit

    def is_max_optimal(self):
        return True  # greater is better

    def evaluate(self, approxes, target, weight):
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])
        y_true = np.array(target).astype(int)
        approx = approxes[0]
        score = self.get_profit(y_true, approx)
        return score, 1

    def get_final_error(self, error, weight):
        return error

# model = CatBoostClassifier(metric_period=50,
#                            n_estimators=200,
#                            eval_metric=ProfitMetric()
#                            )
# model.fit(X, y, eval_set=(X_test, y_test))
def model_evaluation(model_name, desc, ser_ytest, yprobs1d,
                     df_eval=None, threshold=0.5,
                     show=True, col_sort='Recall'):
    if df_eval is None:
        df_eval = pd.DataFrame({'Model': [],
                                'Description': [],
                                'Accuracy': [],
                                'Precision': [],
                                'Recall': [],
                                'F1': [],
                                'AUC': [],
                                'AUCPR': [],
                                })
    y_true = np.array(ser_ytest).flatten()
    prec, rec, thr = sklearn.metrics.precision_recall_curve(y_true, yprobs1d)
    auc_pr = sklearn.metrics.auc(rec, prec)
    y_pred = (yprobs1d > threshold).astype(np.int8)
    # model evaluation
    average = 'binary'
    row_eval = [model_name, desc,
                sklearn.metrics.accuracy_score(y_true, y_pred),
                sklearn.metrics.precision_score(y_true, y_pred, average=average),
                sklearn.metrics.recall_score(y_true, y_pred, average=average),
                sklearn.metrics.f1_score(y_true, y_pred, average=average),
                sklearn.metrics.roc_auc_score(y_true, yprobs1d),  # AUC needs probabilities
                auc_pr
                ]
    df_eval.loc[len(df_eval)] = row_eval
    df_eval = df_eval.drop_duplicates()
    df_eval = df_eval.sort_values(col_sort, ascending=False)
    if show:
        display(df_eval.style.background_gradient(subset=[col_sort]))
    return df_eval
df_eval = None
ifile = 'https://github.com/bhishanpdl/Datasets/blob/master/Projects/Fraud_detection/raw/creditcard.csv.zip?raw=true'
df = pd.read_csv(ifile,compression='zip')
print(df.shape)
df.head()
(284807, 31)
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | V13 | V14 | V15 | V16 | V17 | V18 | V19 | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | 0.090794 | -0.551600 | -0.617801 | -0.991390 | -0.311169 | 1.468177 | -0.470401 | 0.207971 | 0.025791 | 0.403993 | 0.251412 | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | -0.166974 | 1.612727 | 1.065235 | 0.489095 | -0.143772 | 0.635558 | 0.463917 | -0.114805 | -0.183361 | -0.145783 | -0.069083 | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | 0.207643 | 0.624501 | 0.066084 | 0.717293 | -0.165946 | 2.345865 | -2.890083 | 1.109969 | -0.121359 | -2.261857 | 0.524980 | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | -0.054952 | -0.226487 | 0.178228 | 0.507757 | -0.287924 | -0.631418 | -1.059647 | -0.684093 | 1.965775 | -1.232622 | -0.208038 | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | 0.753074 | -0.822843 | 0.538196 | 1.345852 | -1.119670 | 0.175121 | -0.451449 | -0.237033 | -0.038195 | 0.803487 | 0.408542 | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
target = 'Class'
features = df.columns.drop(target)
df[target].value_counts(normalize=True)*100
0 99.827251 1 0.172749 Name: Class, dtype: float64
sns.countplot(x=df[target])
[Figure: count plot of the target 'Class']
from sklearn.model_selection import train_test_split
df_Xtrain_orig, df_Xtest, ser_ytrain_orig, ser_ytest = train_test_split(
df.drop(target,axis=1),
df[target],
test_size=0.2,
random_state=SEED,
stratify=df[target])
ytrain_orig = ser_ytrain_orig.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()
print(df_Xtrain_orig.shape)
df_Xtrain_orig.head()
(227845, 30)
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | V13 | V14 | V15 | V16 | V17 | V18 | V19 | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
36001 | 38355.0 | 1.043949 | 0.318555 | 1.045810 | 2.805989 | -0.561113 | -0.367956 | 0.032736 | -0.042333 | -0.322674 | 0.499167 | -0.572665 | 0.346009 | -0.047407 | -0.098964 | -0.663284 | 0.181411 | -0.124345 | -0.790453 | -0.720944 | -0.084556 | -0.240105 | -0.680315 | 0.085328 | 0.684812 | 0.318620 | -0.204963 | 0.001662 | 0.037894 | 49.67 |
12844 | 22555.0 | -1.665159 | 0.808440 | 1.805627 | 1.903416 | -0.821627 | 0.934790 | -0.824802 | 0.975890 | 1.747469 | -0.658751 | 1.281502 | -1.430087 | 0.372028 | 1.403024 | -2.739413 | -1.331766 | 1.964590 | -0.205639 | 1.325588 | -0.373759 | -0.335332 | -0.510994 | 0.035839 | 0.147565 | -0.529358 | -0.566950 | -0.595998 | -0.220086 | 16.94 |
2873 | 2431.0 | -0.324096 | 0.601836 | 0.865329 | -2.138000 | 0.294663 | -1.251553 | 1.072114 | -0.334896 | 1.071268 | -1.109522 | -1.016020 | -0.654945 | -1.473470 | 0.317345 | 1.067491 | -0.372642 | -0.674725 | 0.369841 | 0.095583 | -0.039868 | 0.012220 | 0.352856 | -0.341505 | -0.145791 | 0.094194 | -0.804026 | 0.229428 | -0.021623 | 1.00 |
145263 | 86773.0 | -0.258270 | 1.217501 | -0.585348 | -0.875347 | 1.222481 | -0.311027 | 1.073860 | -0.161408 | 0.200665 | 0.154307 | 0.882673 | 0.547890 | 0.269484 | -1.253302 | -0.883963 | 0.495221 | -0.153212 | 0.296710 | 0.136148 | 0.382305 | -0.424626 | -0.781158 | 0.019316 | 0.178614 | -0.315616 | 0.096665 | 0.269740 | -0.020635 | 10.78 |
186658 | 127202.0 | 2.142162 | -0.494988 | -1.936511 | -0.818288 | -0.025213 | -1.027245 | -0.151627 | -0.305750 | -0.869482 | 0.428729 | 1.136666 | 0.273476 | 0.697123 | -1.222134 | -0.938820 | 1.298149 | 0.912921 | -0.793721 | 1.064984 | 0.106592 | 0.010115 | 0.021722 | 0.079463 | -0.480899 | 0.023846 | -0.279076 | -0.030121 | -0.043888 | 39.96 |
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
df_Xtrain_orig,
ser_ytrain_orig,
test_size=0.2,
random_state=SEED,
stratify=ser_ytrain_orig)
ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()
print(df_Xtrain.shape)
(182276, 30)
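A small sanity check (a hedged addition, not in the original output): stratified splitting should preserve the ~0.17% fraud rate in every split.
# verify that stratification kept the class ratio in train/valid/test
for name, ser in [('train', ser_ytrain), ('valid', ser_yvalid), ('test', ser_ytest)]:
    print(f'{name:>5}: {ser.mean()*100:.3f}% fraud')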
https://catboost.ai/docs/concepts/python-reference_catboostregressor.html
class CatBoostRegressor(
iterations=None, learning_rate=None,
depth=None, l2_leaf_reg=None,
model_size_reg=None, rsm=None,
loss_function='RMSE', border_count=None,
feature_border_type=None, per_float_feature_quantization=None,
input_borders=None, output_borders=None,
fold_permutation_block=None, od_pval=None,
od_wait=None, od_type=None,
nan_mode=None, counter_calc_method=None,
leaf_estimation_iterations=None, leaf_estimation_method=None,
thread_count=None, random_seed=None,
use_best_model=None, best_model_min_trees=None,
verbose=None, silent=None,
logging_level=None, metric_period=None,
ctr_leaf_count_limit=None, store_all_simple_ctr=None,
max_ctr_complexity=None, has_time=None,
allow_const_label=None, one_hot_max_size=None,
random_strength=None,name=None, ignored_features=None,
train_dir=None, custom_metric=None,
eval_metric=None, bagging_temperature=None,
save_snapshot=None, snapshot_file=None,
snapshot_interval=None, fold_len_multiplier=None,
used_ram_limit=None, gpu_ram_part=None,
pinned_memory_size=None, allow_writing_files=None,
final_ctr_computation_mode=None, approx_on_full_history=None,
boosting_type=None, simple_ctr=None,
combinations_ctr=None, per_feature_ctr=None,
ctr_target_border_count=None, task_type=None,
device_config=None, devices=None,
bootstrap_type=None, subsample=None,
sampling_unit=None, dev_score_calc_obj_block_size=None,
max_depth=None, n_estimators=None,
num_boost_round=None, num_trees=None,
colsample_bylevel=None, random_state=None,
reg_lambda=None, objective=None,
eta=None, max_bin=None,
gpu_cat_features_storage=None, data_partition=None,
metadata=None, early_stopping_rounds=None,
cat_features=None, grow_policy=None,
min_data_in_leaf=None, min_child_samples=None,
max_leaves=None, num_leaves=None,
score_function=None, leaf_estimation_backtracking=None,
ctr_history_unit=None, monotone_constraints=None
)
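Only a few of these arguments are usually needed in practice. Below is a hedged sketch of a typical CatBoostClassifier setup with early stopping on the validation split (values are illustrative and untuned; the model actually trained next uses the custom ProfitMetric instead):
# illustrative, untuned settings using the train/valid split created above
sketch_model = CatBoostClassifier(
    iterations=500,             # alias: n_estimators
    learning_rate=0.1,
    depth=6,
    eval_metric='AUC',
    early_stopping_rounds=50,   # stop when the eval metric stops improving
    random_seed=SEED,
    verbose=100,
)
# sketch_model.fit(df_Xtrain, ser_ytrain, eval_set=(df_Xvalid, ser_yvalid))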
%%time
model = CatBoostClassifier(verbose=100,random_state=SEED,
eval_metric=ProfitMetric())
model.fit(df_Xtrain,ser_ytrain)
ypreds = model.predict(df_Xtest)
cm = sklearn.metrics.confusion_matrix(np.array(ser_ytest),ypreds)
print('confusion matrix\n',cm)
confusion matrix
 [[56859     5]
  [   24    74]]
CPU times: user 152 ms, sys: 3.76 ms, total: 156 ms
Wall time: 168 ms
profit = get_profit(ser_ytest, ypreds)
print(f"profit = ${profit:,}")
profit = $24,300
yprobs = model.predict_proba(df_Xtest)
print(yprobs[:5])
[[9.99998192e-01 1.80796544e-06]
 [9.99932577e-01 6.74226697e-05]
 [9.99998670e-01 1.32969613e-06]
 [9.99996711e-01 3.28938804e-06]
 [9.99994773e-01 5.22686104e-06]]
yprobs1d = yprobs[:,1]
df_eval = model_evaluation('catboost', 'custom loss', ser_ytest, yprobs1d)
Model | Description | Accuracy | Precision | Recall | F1 | AUC | AUCPR | |
---|---|---|---|---|---|---|---|---|
0 | catboost | custom loss | 0.999491 | 0.936709 | 0.755102 | 0.836158 | 0.987990 | 0.848748 |
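Since a missed fraud costs $200 while a false alarm costs only $100 under the profit function defined above, the default 0.5 threshold is not necessarily profit-optimal. A hedged sketch of a simple threshold sweep (an addition, not part of the original run):
# sweep decision thresholds and report the most profitable one
thresholds = np.linspace(0.01, 0.99, 99)
profits = [get_profit(ytest, (yprobs1d > t).astype(int)) for t in thresholds]
best_t = thresholds[int(np.argmax(profits))]
print(f'best threshold = {best_t:.2f}, profit = ${max(profits):,}')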
from scikitplot import metrics as skpmetrics
skpmetrics.plot_confusion_matrix(ser_ytest, ypreds)
[Figure: confusion matrix plot for the test set]
fig, ax = plt.subplots(figsize=(12,8))
skpmetrics.plot_roc(ser_ytest,yprobs,ax=ax)
[Figure: ROC curves for the test set]
notebook_end_time = time.time()
time_taken = time.time() - notebook_start_time
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr {:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 9 min 21 secs