import time

time_start_notebook = time.time()


import numpy as np
import pandas as pd

SEED = 0
RNG = np.random.RandomState(SEED)

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 8,8
plt.rcParams.update({'font.size': 16})
plt.style.use('ggplot')
%matplotlib inline

# mixed
import copy
import pprint
pp = pprint.PrettyPrinter(indent=4)

# scale and split
import sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict

# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.isotonic import IsotonicRegression

# sklearn scalar metrics
import sklearn.metrics as skmetrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# roc auc and curves
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve

# confusion matrix and classification report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# xgboost
import xgboost
import xgboost as xgb
from xgboost import XGBClassifier

# six and pickle
import six
import pickle
import joblib

# hyperopt
import hyperopt
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK, STATUS_FAIL
from hyperopt.pyll import scope
from hyperopt.pyll.stochastic import sample

# optuna
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING) # use INFO to see progress

# model evaluation
import shap
import lime
import eli5
from eli5.sklearn import PermutationImportance
import yellowbrick
import scikitplot

# versions
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv

The sklearn.metrics.scorer module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.metrics. Anything that cannot be imported from sklearn.metrics is now part of the private API.
The sklearn.feature_selection.base module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.feature_selection. Anything that cannot be imported from sklearn.feature_selection is now part of the private API.

Bhishan Poudel 2021-08-10 

CPython 3.7.7
IPython 7.22.0

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

shap        0.39.0
json        2.0.9
hyperopt    0.2.3
pandas      1.3.0
eli5        0.10.1
autopep8    1.5.2
joblib      1.0.1
numpy       1.19.5
sklearn     0.23.1
six         1.16.0
yellowbrick 1.1
scikitplot  0.3.7
xgboost     1.3.3
optuna      2.7.0

The sklearn.metrics.classification module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.metrics. Anything that cannot be imported from sklearn.metrics is now part of the private API.


# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan/")
from bhishan import bp


%load_ext autoreload
%autoreload 2


def get_profit(y_true, y_pred):
    tn, fp, fn, tp = sklearn.metrics.confusion_matrix(y_true,y_pred).ravel()
    profit = 400*tp - 200*fn - 100*fp
    return profit

scoring = sklearn.metrics.make_scorer(get_profit, greater_is_better=True)


ifile = '../data/raw/creditcard.csv.zip'
df = pd.read_csv(ifile,compression='zip')
print(df.shape)
df.head()

(284807, 31)


target = 'Class'
df[target].value_counts(normalize=True)*100

0    99.827251
1     0.172749
Name: Class, dtype: float64


from sklearn.model_selection import train_test_split

target = 'Class'
df_Xtrain_orig, df_Xtest, ser_ytrain_orig, ser_ytest = train_test_split(
    df.drop(target,axis=1), 
    df[target],
    test_size=0.2, 
    random_state=SEED, 
    stratify=df[target])

ytrain_orig = ser_ytrain_orig.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()


df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
    df_Xtrain_orig, 
    ser_ytrain_orig,
    test_size=0.2, 
    random_state=SEED, 
    stratify=ser_ytrain_orig)

ytrain = ser_ytrain.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()
Xvalid = df_Xvalid.to_numpy()
yvalid = ser_yvalid.to_numpy().ravel()

print(df_Xtrain.shape)
df_Xtrain.head()

(182276, 30)


ser_ytrain[:5]

138257    0
60033     0
31064     0
245706    0
25871     0
Name: Class, dtype: int64


%%time
from xgboost import XGBClassifier

model = XGBClassifier(random_state=SEED,
                      objective = "binary:logistic",
                      eval_metric='logloss',
                      use_label_encoder=False,
                      n_estimators=10 # make it large
                     )
model.fit(df_Xtrain,ser_ytrain.values)
print(model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=10, n_jobs=4,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)
CPU times: user 16.7 s, sys: 266 ms, total: 16.9 s
Wall time: 8.95 s


bp.show_methods(model,5)


# model.score? # mean accuracy


acc = model.score(df_Xtest,ser_ytest.values)
acc

0.999420666409185


ypreds = model.predict(df_Xtest)

acc = sklearn.metrics.accuracy_score(ser_ytest.values,ypreds)
print('accuracy: ', acc)
ypreds[:5], sum(ypreds)

accuracy:  0.999420666409185

(array([0, 0, 0, 0, 0]), 77)


yprobs1d = model.predict_proba(df_Xtest)[:,1] # take second column

auc = roc_auc_score(ser_ytest.to_numpy().ravel(),yprobs1d)
print('AUC = ', auc)

AUC =  0.9155406060144936


profit = get_profit(ser_ytest.to_numpy().ravel(), ypreds)
print(f"profit = ${profit:,}")

profit = $22,400


# https://github.com/dmlc/xgboost/issues/1125
def eval_profit(logodds, dtrain):
    """Custom evaluation metric for xgboost.

    Usage:
    -------
    PARAMS_FIT = dict(
        early_stopping_rounds=5, #y_pred = model.predict(X, ntree_limit=model.best_ntree_limit)
        verbose=10,
        eval_set= [(df_Xtrain,ser_ytrain),(df_Xvalid,ser_yvalid)], # for two tuples, we get validation_0 and 1.
        eval_metric=eval_profit,
    )
    
    model = XGBClassifier()
    model.fit(X,y,**PARAMS_FIT)
    evals_result = model.evals_result()
    df_eval1 = pd.DataFrame(evals_result['validation_1'])
    """
    y_prob = 1.0 / (1.0 + np.exp(-logodds))
    labels = dtrain.get_label()

    # profit
    y_pred = (np.array(y_prob)>0.5).astype(np.int8)
    tn, fp, fn, tp = sklearn.metrics.confusion_matrix(labels,y_pred).ravel()
    profit = 400*tp - 200*fn - 100*fp
    return 'profit',profit


# https://github.com/dmlc/xgboost/issues/1125
def eval_auc_aucpr_profit(logodds, dtrain):
    """Custom evaluation metric for xgboost.

    Usage:
    -------
    PARAMS_FIT = dict(
        early_stopping_rounds=5, #y_pred = model.predict(X, ntree_limit=model.best_ntree_limit)
        verbose=10,
        eval_set= [(df_Xtrain,ser_ytrain),(df_Xvalid,ser_yvalid)], # for two tuples, we get validation_0 and 1.
        eval_metric=eval_auc_aucpr_profit,
    )
    
    model = XGBClassifier()
    model.fit(X,y,**PARAMS_FIT)
    evals_result = model.evals_result()
    df_eval1 = pd.DataFrame(evals_result['validation_1'])
    """

    y_prob = 1.0 / (1.0 + np.exp(-logodds))
    labels = dtrain.get_label()

    # auc
    auc = sklearn.metrics.roc_auc_score(labels, y_prob)
    
    # aucpr
    pre,rec,thr = sklearn.metrics.precision_recall_curve(labels,y_prob)
    aucpr = sklearn.metrics.auc(rec,pre)

    # profit
    y_pred = (np.array(y_prob)>0.5).astype(np.int8)
    tn, fp, fn, tp = sklearn.metrics.confusion_matrix(labels,y_pred).ravel()
    profit = 400*tp - 200*fn - 100*fp
    return [('auc', auc), ('aucpr', aucpr),('profit',profit)]


%%time
from xgboost import XGBClassifier

model = XGBClassifier(random_state=SEED,
                      objective = "binary:logistic",
                      eval_metric='logloss',
                      use_label_encoder=False,
                      n_estimators=20, # make it large
                     )

PARAMS_FIT = dict(
    early_stopping_rounds=5, #y_pred = model.predict(X, ntree_limit=model.best_ntree_limit)
    verbose=10,
    eval_set= [(df_Xtrain,ser_ytrain),(df_Xvalid,ser_yvalid)], # for two tuples, we get validation_0 and 1.
    eval_metric=eval_auc_aucpr_profit, # NOTE: xvalid uses eval_metric for early stop.
    )


model.fit(df_Xtrain,ser_ytrain.values,**PARAMS_FIT)

[0]	validation_0-logloss:0.43780	validation_0-auc:0.91259	validation_0-aucpr:0.86564	validation_0-profit:86500.00000	validation_1-logloss:0.43788	validation_1-auc:0.94924	validation_1-aucpr:0.87099	validation_1-profit:19900.00000
[4]	validation_0-logloss:0.10771	validation_0-auc:0.91575	validation_0-aucpr:0.87211	validation_0-profit:91800.00000	validation_1-logloss:0.10767	validation_1-auc:0.96824	validation_1-aucpr:0.90433	validation_1-profit:21600.00000
CPU times: user 13.9 s, sys: 213 ms, total: 14.1 s
Wall time: 4.7 s

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=20, n_jobs=4,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)


evals_result = model.evals_result()
evals_result.keys() # how many datasets are in eval_set?

dict_keys(['validation_0', 'validation_1'])


dfs_evals_result = []
for key in evals_result.keys():
    _ = pd.DataFrame(evals_result[key])
    _.index.name = key
    dfs_evals_result.append(_)


df_eval0 = dfs_evals_result[0]
df_eval0


df_eval0.plot(subplots=True,figsize=(12,12));


# predictions


y_pred = model.predict(df_Xtest)
sum(y_pred)

76


y_pred_best_early = model.predict(df_Xtest, ntree_limit=model.best_ntree_limit)
sum(y_pred_best_early)

76


y_prob = model.predict_proba(df_Xtest, ntree_limit=model.best_ntree_limit)
y_prob[:2]

array([[0.6455703 , 0.35442975],
       [0.6455703 , 0.35442975]], dtype=float32)


y_prob1d = y_prob[:,1] # take second column of prob
y_prob1d[:5]

array([0.35442975, 0.35442975, 0.35442975, 0.35442975, 0.35442975],
      dtype=float32)


y_pred = (np.array(y_prob1d) > 0.5).astype(np.int8)
sum(y_pred), y_pred[:5]

(76, array([0, 0, 0, 0, 0], dtype=int8))


def objective(trial):
    params_xgb_optuna = {
        'eval_metric': 'auc',
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1.0),
        'max_depth':trial.suggest_int('max_depth', 5, 20),
        'n_estimators': trial.suggest_int('n_estimators', 150, 1000), 
        'subsample': trial.suggest_uniform('subsample', 0.7, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-4, 100.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),
    }

    model = XGBClassifier(random_state=SEED,**params_xgb_optuna)
    model.fit(df_Xtrain,ser_ytrain)

    vdpreds = model.predict(df_Xvalid)
    tn, fp, fn, tp = sklearn.metrics.confusion_matrix(yvalid,vdpreds).ravel()
    profit = 400*tp - 200*fn - 100*fp
    return profit


# NOTE: there is inherent non-determinism in optuna hyperparameter selection
#       we may not get the same hyperparameters when run twice.


sampler = optuna.samplers.TPESampler(seed=SEED)
N_TRIALS = 1 # make it large

study = optuna.create_study(direction='maximize',
                            sampler=sampler,
                            study_name='xgb_optuna',
                            storage='sqlite:///xgb_optuna_fraud_classifcation2.db',
                            load_if_exists=True)

study.optimize(objective, n_trials=N_TRIALS)

/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning:

The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].


# best params
params_best = study.best_trial.params
params_best

{'learning_rate': 0.01567667719550607,
 'max_depth': 16,
 'n_estimators': 662,
 'reg_alpha': 0.034828020870283326,
 'reg_lambda': 0.028770084050677908,
 'subsample': 0.863464954899069}


%%time

model_name = 'xgboost'
desc = 'grid search optuna custom loss'
Xtr = df_Xtrain_orig
ytr = ser_ytrain_orig.to_numpy().ravel()
Xtx = df_Xtest
ytx = ser_ytest.to_numpy().ravel()

# use best model
params_best =  study.best_trial.params

model = xgb.XGBClassifier(random_state=SEED,eval_metric='logloss')
model.set_params(**params_best)

# fit the model
model.fit(Xtr, ytr,verbose=50)

# prediction
ypreds = model.predict(df_Xtest)

# evaluation
profit = get_profit(ytest,ypreds)
print(f"Profit = {profit:,}.")

Profit = 23,700.
CPU times: user 31min 14s, sys: 3.05 s, total: 31min 17s
Wall time: 8min 11s


from sklearn.isotonic import IsotonicRegression


# y_prob given from xgboost may not be well calibrated for imbalanced data
# let's get another set of probabilities.

vd_prob = model.predict_proba(df_Xvalid)
vd_prob[:5]

array([[9.9998218e-01, 1.7828401e-05],
       [9.9994946e-01, 5.0526865e-05],
       [9.9997103e-01, 2.8985065e-05],
       [9.9996608e-01, 3.3921599e-05],
       [9.9997801e-01, 2.1999766e-05]], dtype=float32)


# fit the model on validaton data
model_isoreg = IsotonicRegression(y_min=0,y_max=1,out_of_bounds='clip')
model_isoreg.fit(vd_prob[:,1], ser_yvalid.values)

IsotonicRegression(out_of_bounds='clip', y_max=1, y_min=0)


# raw probabilities
vd_prob = model.predict_proba(df_Xvalid)
vd_pred = (vd_prob[:,1] >0.5).astype(np.int8)

# calibrated probabilities
vd_prob_iso = model_isoreg.predict(vd_prob[:,1])

# predictions from probabilities
vd_pred_iso = (np.array(vd_prob_iso) > 0.5).astype(np.int8)

profit = get_profit(ser_yvalid.to_numpy().ravel(), vd_pred)

# interestingly, isotonic regression gave me worse result.
profit_iso = get_profit(ser_yvalid.to_numpy().ravel(), vd_pred_iso)

print(f"validation profit without calibration                 : ${profit:,}")
print(f"validation profit with isotonic regression calibration: ${profit_iso:,}")

validation profit without calibration                 : $31,600
validation profit with isotonic regression calibration: $31,600


# raw probabilities
y_prob = model.predict_proba(df_Xtest)
y_pred = (y_prob[:,1] >0.5).astype(np.int8)

# calibrated probabilities
y_prob_iso = model_isoreg.predict(y_prob[:,1])

# predictions from probabilities
y_pred_iso = (np.array(y_prob_iso) > 0.5).astype(np.int8)

profit = get_profit(ser_ytest.to_numpy().ravel(), y_pred)

# interestingly, isotonic regression gave me worse result.
profit_iso = get_profit(ser_ytest.to_numpy().ravel(), y_pred_iso)


print(f"profit without calibration                 : ${profit:,}")
print(f"profit with isotonic regression calibration: ${profit_iso:,}")

profit without calibration                 : $23,700
profit with isotonic regression calibration: $24,200


time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))

Time taken to run whole notebook: 0 hr 15 min 27 secs

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	...	V21	V22	V23	V24	V25	V26	V27	V28	Amount
0	0.0	-1.359807	-0.072781	2.536347	1.378155	-0.338321	0.462388	0.239599	0.098698	0.363787	...	-0.018307	0.277838	-0.110474	0.066928	0.128539	-0.189115	0.133558	-0.021053	149.62
1	0.0	1.191857	0.266151	0.166480	0.448154	0.060018	-0.082361	-0.078803	0.085102	-0.255425	...	-0.225775	-0.638672	0.101288	-0.339846	0.167170	0.125895	-0.008983	0.014724	2.69
2	1.0	-1.358354	-1.340163	1.773209	0.379780	-0.503198	1.800499	0.791461	0.247676	-1.514654	...	0.247998	0.771679	0.909412	-0.689281	-0.327642	-0.139097	-0.055353	-0.059752	378.66
3	1.0	-0.966272	-0.185226	1.792993	-0.863291	-0.010309	1.247203	0.237609	0.377436	-1.387024	...	-0.108300	0.005274	-0.190321	-1.175575	0.647376	-0.221929	0.062723	0.061458	123.50
4	2.0	-1.158233	0.877737	1.548718	0.403034	-0.407193	0.095921	0.592941	-0.270533	0.817739	...	-0.009431	0.798278	-0.137458	0.141267	-0.206010	0.502292	0.219422	0.215153	69.99

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	...	V20	V21	V22	V23	V24	V25	V26	V27	V28	Amount
138257	82565.0	1.118591	0.562709	0.569628	2.987496	-0.365594	-0.531789	-0.044144	0.011932	-0.129131	...	-0.204184	-0.128269	-0.218875	-0.048816	0.617265	0.551384	0.060220	0.016136	0.047100	7.6
60033	49125.0	1.170686	0.083759	0.466278	0.913911	-0.093123	0.427588	-0.372727	0.312777	0.129610	...	-0.226078	-0.176121	-0.584726	0.066051	-0.746667	0.232641	-0.547740	0.038060	0.010995	3.9
31064	36195.0	1.072902	-0.015166	0.942251	1.330631	-0.580474	0.206235	-0.402121	0.313133	0.410088	...	-0.251464	-0.261720	-0.665725	0.167535	0.163815	0.192247	-0.620974	0.050609	0.019181	9.9
245706	152869.0	2.136909	0.088646	-2.490914	0.098321	0.789008	-1.399582	0.854902	-0.492912	-0.254999	...	-0.266383	0.278034	0.934892	-0.211839	-0.234266	0.609699	1.020898	-0.154427	-0.112532	2.0
25871	33805.0	-2.448378	-1.335508	1.240431	1.800068	0.383084	-0.501160	1.080410	-0.604093	-0.319458	...	-0.720572	-0.121319	0.625541	-0.639100	0.522532	-0.073801	-0.162788	0.294912	-0.211222	411.1

	logloss	auc	aucpr	profit
validation_0
0	0.437801	0.912590	0.865638	86500.0
1	0.297067	0.912593	0.868193	86500.0
2	0.208065	0.915748	0.868741	90000.0
3	0.148686	0.915746	0.868922	91800.0
4	0.107707	0.915747	0.872107	91800.0
5	0.078608	0.917326	0.872466	91900.0

Table of Contents

Introduction to Boosting¶

Imports¶

Useful Functions¶

Load the data¶

Train test split with stratify¶

Train Validation with stratify¶

Modelling xgboost imbalanced data¶

Default Method¶

Using custom eval function (Not custom objective function with feval)¶

Tune hyperparameters using Optuna with custom loss¶

Calibrate the probabilities (Stacking models)¶

Time Taken¶

	0	1	2	3	4
0	apply	fit	kwargs	n_features_in_	save_model
1	base_score	gamma	learning_rate	n_jobs	scale_pos_weight
2	booster	get_booster	load_model	num_parallel_tree	score
3	classes_	get_num_boosting_rounds	max_delta_step	objective	set_params
4	coef_	get_params	max_depth	predict	subsample
5	colsample_bylevel	get_xgb_params	min_child_weight	predict_proba	tree_method
6	colsample_bynode	gpu_id	missing	random_state	use_label_encoder
7	colsample_bytree	importance_type	monotone_constraints	reg_alpha	validate_parameters
8	evals_result	interaction_constraints	n_classes_	reg_lambda	verbosity
9	feature_importances_	intercept_	n_estimators