import time

time_start_notebook = time.time()


%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    # usual imports
    !pip install watermark
    !pip install scikit-plot
    !pip install catboost

    # HPO
    !git clone https://github.com/thuijskens/scikit-hyperband.git
    sys.path.append('scikit-hyperband/hyperband')

    print('Environment: Google Colab')


sys.path.append("/Users/poudel/Dropbox/a00_Resources/hyperband")

try:
    from search import HyperbandSearchCV
    print('File found: search.py')
except:
    print('File not found: search.py')
    
try:
    from hyperband_search import HyperbandSearchCV
    print('File found: hyperband_search.py')
except:
    print('File not found: hyperband_search.py')

File not found: search.py
File found: hyperband_search.py


import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
import joblib
from tqdm import tqdm, trange
import plotly_express as px

# modelling
import sklearn.metrics as skmetrics
from sklearn.model_selection import StratifiedKFold

# boosting
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# settings
sns.set()
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly

%matplotlib inline
%load_ext watermark
%watermark -iv

joblib         0.17.0
numpy          1.19.4
pandas         1.1.4
plotly_express 0.4.1
seaborn        0.11.0
lightgbm       2.3.1
catboost       0.23.2
autopep8       1.5.2
xgboost        1.2.0
json           2.0.9


def show_methods(obj, ncols=4,contains=None):
    lst = [i for i in dir(obj) if i[0]!='_' ]
    if contains is not None:
        lst = [i for i in lst if contains in i]
    df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
    return df


def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
    import sklearn.metrics as skmetrics
    import scikitplot.metrics as skpmetrics
    import os

    acc       = skmetrics.accuracy_score(ytest,ypreds)
    precision = skmetrics.precision_score(ytest,ypreds)
    recall    = skmetrics.recall_score(ytest,ypreds)
    f1        = skmetrics.f1_score(ytest,ypreds)
    auc       = skmetrics.roc_auc_score(ytest,ypreds)

    print(skmetrics.classification_report(ytest,ypreds))
    print(skmetrics.confusion_matrix(ytest,ypreds))

    df_res = pd.DataFrame({'Accuracy':[acc],
                          'Precision': [precision],
                          'Recall': [recall],
                          'F1-score': [f1],
                          'AUC': [auc]},index=[model_name])

    display(df_res.style.format("{:.4f}"))
    if not os.path.isdir('../outputs'):
        os.makedirs('../outputs')
    o = '.' if ENV_COLAB else '../outputs/'
    df_res.to_csv(o+f'model_{model_name}.csv',index=True)

    if show_plots:
        skpmetrics.plot_precision_recall(ytest,yprobs2d) # more focus on minority
        skpmetrics.plot_roc_curve(ytest,yprobs2d) # equal focus on both groups
        skpmetrics.plot_confusion_matrix(ytest,ypreds)


def get_profit(y_true, y_pred):
    tn, fp, fn, tp = skmetrics.confusion_matrix(y_true,y_pred).ravel()
    profit = 400*tp - 200*fn - 100*fp
    return profit

scoring = skmetrics.make_scorer(get_profit, greater_is_better=True)


path_data_train = '../data/raw/train.csv'
path_data_test = '../data/raw/test.csv'

if ENV_COLAB:
    path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
    path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv'


df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)

print(df_train.shape)
print(df_test.shape)
df_train.head(2).append(df_train.tail(2))

(5634, 21)
(1409, 21)


target_name = 'Churn'


px.histogram(df_train, x=target_name,height=300,width=300)


px.histogram(df_train, x='gender', color=target_name,height=300,width=300)


df_train['TotalCharges'] = pd.to_numeric(df_train['TotalCharges'],errors='coerce').fillna(0)
df_test['TotalCharges'] = pd.to_numeric(df_test['TotalCharges'],errors='coerce').fillna(0)


df_train['SeniorCitizen'] = df_train['SeniorCitizen'].map({0:'No',1:'Yes'})
df_test['SeniorCitizen'] = df_test['SeniorCitizen'].map({0:'No',1:'Yes'})


df_Xtrain = df_train.drop(target_name,axis=1)
df_Xtest = df_test.drop(target_name,axis=1)

ser_ytrain = df_train[target_name].map({'No':0,'Yes':1})
ser_ytest = df_test[target_name].map({'No':0,'Yes':1})

ytrain = np.array(ser_ytrain).flatten()
ytest = np.array(ser_ytest).flatten()

index_name = 'customerID'
ser_train_ids = df_Xtrain.pop(index_name)
ser_test_ids = df_Xtest.pop(index_name)


df_Xtrain.head(2)


cols_num = list(df_train.select_dtypes('number').columns)
cols_num

['tenure', 'MonthlyCharges', 'TotalCharges']


cols_cat = list(df_train.select_dtypes('object').columns)

# gender is no good predictor as seen in EDA
cols_exclude = ['customerID','gender','TotalCharges'] + [target_name]
cols_cat = [ i for i in cols_cat if i not in cols_exclude ] + ['SeniorCitizen']

print(cols_cat)

['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'SeniorCitizen']


cols_num = ['TotalCharges','tenure', 'MonthlyCharges']


cols_num_old = cols_num
cols_cat_old = cols_cat


def combine_two_features(dfx,A,B):
    dfx = dfx.copy()
    assert len(A) == len(B)
    for a,b in zip(A,B):
        dfx[a+'_'+b] = dfx[a] + '_' + dfx[b]

    return dfx

combineA = ['Partner']
combineB = ['Dependents']
combineA = combineA + ['SeniorCitizen']*5
combineB = combineB + ['Dependents','Partner','Contract',
                       'TechSupport','PaymentMethod']

cols_cat_new = [f'{a}_{b}' for a,b in zip(combineA,combineB)]


cols_cat = list(set(cols_cat + cols_cat_new))
print(cols_cat_new)
# print(cols_cat)

df_Xtrain = combine_two_features(df_Xtrain,combineA,combineB)
df_Xtest = combine_two_features(df_Xtest,combineA,combineB)

['Partner_Dependents', 'SeniorCitizen_Dependents', 'SeniorCitizen_Partner', 'SeniorCitizen_Contract', 'SeniorCitizen_TechSupport', 'SeniorCitizen_PaymentMethod']


def create_groupby_features(dfx,cat,num,agg):
    dfx = dfx.copy()
    for c in cat:
        for n in num:
            for a in agg:
                name = f"{c}_{n}_{a}"
                dfx[name] = df_train.groupby(c)[n].transform(a)
    return dfx


# Using more features gave me worse AUC.
# cols_grpcat = ['Contract','PaymentMethod']
# cols_grpnum = ['TotalCharges','MonthlyCharges']
# cols_grpagg = ['mean', 'max', 'min']

cols_grpcat = ['Contract']
cols_grpnum = ['TotalCharges']
cols_grpagg = ['mean']

cols_num_new = [f'{c}_{n}_{a}' 
                for c in cols_grpcat
                for n in cols_grpnum
                for a in cols_grpagg]

cols_num = list(set(cols_num + cols_num_new))
print(cols_num_new)
# print(cols_num)

df_Xtrain = create_groupby_features(df_Xtrain,cols_grpcat, cols_grpnum, cols_grpagg)
df_Xtest = create_groupby_features(df_Xtest,cols_grpcat, cols_grpnum, cols_grpagg)

['Contract_TotalCharges_mean']


df_Xtrain.head(2)


cols_drop = ['gender']

df_Xtrain = df_Xtrain.drop(cols_drop,axis=1)
df_Xtest = df_Xtest.drop(cols_drop,axis=1)


all_features = df_Xtrain.columns.tolist()
cols_cat_idx = [all_features.index(i)
                        for i in cols_cat]


# make sure no nans

df_Xtrain.isna().sum().sum(), df_Xtest.isna().sum().sum()

(0, 0)


df_Xtrain_full = df_Xtrain.copy()
ser_ytrain_full = ser_ytrain.copy()
ytrain_full = np.array(ser_ytrain_full).flatten()


from sklearn.model_selection import train_test_split

df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
    df_Xtrain_full, ser_ytrain_full,
    test_size=0.2,
    random_state=SEED,
    stratify=ser_ytrain_full)

ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()


print(f"df_train   : {df_train.shape}\n")

print(f"df_Xtrain  : {df_Xtrain.shape}")
print(f"ser_ytrain : {ser_ytrain.shape}\n")

print(f"df_Xvalid  : {df_Xvalid.shape}")
print(f"ser_yvalid : {ser_yvalid.shape}\n")

print(f"df_test    : {df_test.shape}")
print(f"ser_ytest  : This does not exist.")

df_Xtrain.head(2)

df_train   : (5634, 21)

df_Xtrain  : (4507, 25)
ser_ytrain : (4507,)

df_Xvalid  : (1127, 25)
ser_yvalid : (1127,)

df_test    : (1409, 21)
ser_ytest  : This does not exist.


# https://stackoverflow.com/questions/65462220/how-to-create-custom-eval-metric-for-catboost

from sklearn.metrics import confusion_matrix 
from scipy.special import expit

class ProfitMetric:
    
    @classmethod
    def get_profit(self, y_true, log_odd):
        y_true = y_true.astype(int)

        # catboost gives outputs as raw log_odds,
        # we need to get prediction from it.
        # logit(p) = log(p/(1-p))
        # expit(x) = 1/(1+exp(-x))
        y_pred = expit(log_odd).astype(int)

        #print("ACCURACY:",(y_pred==y_true).mean())
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        loss = 400*tp - 200*fn - 100*fp
        return loss
    
    def is_max_optimal(self):
        return True # greater is better

    def evaluate(self, approxes, target, weight):            
        assert len(approxes) == 1
        # for binary classification, len(approxed)=1
        assert len(target) == len(approxes[0])
        
        y_true = np.array(target).astype(int)
        log_odd = approxes[0]

        score = self.get_profit(y_true, log_odd)
        output_weight = 1 # weight is not used

        return score, output_weight

    def get_final_error(self, error, weight):
        return error


from catboost import CatBoostClassifier
# CatBoostClassifier?


import catboost
show_methods(catboost)


show_methods(catboost.CatBoostClassifier)


# catboost.CatBoostClassifier.fit?


catboost.CatBoostClassifier().fit

<bound method CatBoostClassifier.fit of <catboost.core.CatBoostClassifier object at 0x7fe2b4b19a90>>


df_Xtrain.head()


print('cat_features: ', sorted(cols_cat_idx))

cat_features:  [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23]


model = CatBoostClassifier(
    n_estimators=1000,
    random_state=SEED,
    cat_features=cols_cat_idx,
    scale_pos_weight=4,
    eval_metric=ProfitMetric()
    )

model.fit(df_Xtrain,ser_ytrain,plot=True,verbose=False,
          eval_set=(df_Xvalid,ser_yvalid),
          use_best_model=True,
          early_stopping_rounds=50
         )

<catboost.core.CatBoostClassifier at 0x7fe2b4f913d0>


vdpreds = model.predict(df_Xvalid)
vdprobs2d = model.predict_proba(df_Xvalid)
vdpreds = vdprobs2d[:,1].astype(int)
yvalid = np.array(ser_yvalid)

print(confusion_matrix(yvalid, vdpreds))
profit = get_profit(yvalid,vdpreds)
print(f'validation profit = ${profit:,d}')

[[828   0]
 [299   0]]
validation profit = $-59,800


ypreds = model.predict(df_Xtest)
yprobs2d = model.predict_proba(df_Xtest)
model_eval_bin('catboost',ytest,ypreds,yprobs2d,show_plots=False)
profit = get_profit(ytest,ypreds)
print(f'test profit = ${profit:,d}')

              precision    recall  f1-score   support

           0       0.74      0.47      0.57      1035
           1       0.27      0.56      0.37       374

    accuracy                           0.49      1409
   macro avg       0.51      0.51      0.47      1409
weighted avg       0.62      0.49      0.52      1409

[[483 552]
 [166 208]]

test profit = $-5,200


history = model.get_evals_result()
print(history.keys())

dict_keys(['learn', 'validation'])


metric_name_ = list(history['learn'].keys())[0]
metric_name_

'Logloss'


len(history['learn'][metric_name_]) # out of 1000 only 71 trees were built.

51


print(f" training   profit = {np.mean(history['learn']['ProfitMetric']):,.0f}")
print(f" validation profit = {np.mean(history['validation']['ProfitMetric']):,.0f}")

 training   profit = -239,200
 validation profit = -59,800


import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING) # use INFO to see progress

from optuna.pruners import SuccessiveHalvingPruner


show_methods(optuna)


params_optuna_study = dict(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
    study_name='catboost_optuna',
    storage='sqlite:///catboost_optuna_churn.db',
    load_if_exists=True,
    pruner=optuna.pruners.SuccessiveHalvingPruner(min_resource=100)
)

study = optuna.create_study(**params_optuna_study)
n_studies = len(study.trials)
print(f'Number of finished trials: {n_studies}')

Number of finished trials: 57


path_early_stop_dict = '../artifacts/catboost_optuna_early_stop_dict.joblib'
if n_studies == 0:
    early_stop_dict = {}
else:
    early_stop_dict = joblib.load(path_early_stop_dict)
    print('last study early stopping rounds\n'+'='*35)
    print(early_stop_dict[n_studies-1])

last study early stopping rounds
===================================
[146, 146, 146, 146, 146]


def objective_no_skf(trial):
    
    global early_stop_dict

    params_cat_optuna = {
        'n_estimators': trial.suggest_int('n_estimators', 100,2000),
        #'learning_rate': trial.suggest_loguniform('learning_rate', 0.01,1.0),
        # usually catboost automatically select best learning rate.
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'scale_pos_weight': trial.suggest_int('scale_pos_weight', 2,20),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.01, 1),
        
        # sampling rows and columns
        # in catboost theres in no colsample_bytree but colsample_bylevel
        'subsample': trial.suggest_uniform('subsample', 0.6, 1),
        'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 0.6, 1),
        'used_ram_limit': '3gb'
    }

    # fit the model
    model = CatBoostClassifier(random_state=SEED,
                               cat_features=cols_cat_idx,
                               **params_cat_optuna)
    model.fit(df_Xtrain, ser_ytrain,
            eval_set=[(df_Xvalid, ser_yvalid)],
            use_best_model=True,
            verbose=0,
            early_stopping_rounds=100)

    # save early stopping dictionary
    history = model.get_evals_result()
    metric_name_ = list(history['learn'].keys())[0]
    n_rounds = len(history['learn'][metric_name_])
    early_stop_dict[objective.i] = n_rounds
    joblib.dump(early_stop_dict, path_early_stop_dict)

    ypreds = model.predict(df_Xvalid)
    ypreds = np.rint(ypreds)
    #score = skmetrics.roc_auc_score(ser_yvalid.to_numpy().ravel(),ypreds)
    score = get_profit(ser_yvalid.to_numpy().ravel(),ypreds)

    # counter to update early stopping dict
    objective.i +=1

    return score


def objective(trial): # this is slow but more stable.
    
    global early_stop_dict

    params_cat_optuna = {
        'n_estimators': trial.suggest_int('n_estimators', 100,2000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01,1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'scale_pos_weight': trial.suggest_categorical('scale_pos_weight', [2,3,4,5]),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.01, 1),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1),
        'used_ram_limit': '3gb'
    }
    # skf is more time-consuming but more stable.
    skf = StratifiedKFold(n_splits=5,random_state=SEED,shuffle=True)
    scores = []
    lst_early_rounds = []
    for idx_tr, idx_vd in skf.split(df_Xtrain_full, ser_ytrain_full):
        Xtr,Xvd = df_Xtrain_full.iloc[idx_tr], df_Xtrain_full.iloc[idx_vd]
        ytr,yvd = ser_ytrain_full[idx_tr], ser_ytrain_full.iloc[idx_vd]

        model = CatBoostClassifier(random_state=SEED,
                               cat_features=cols_cat_idx,
                               **params_cat_optuna)
        model.fit(Xtr, ytr,
            eval_set=[(Xvd, yvd)],
            use_best_model=False,
            verbose=0,
            early_stopping_rounds=100)
        
        
        # save early stopping dictionary
        history = model.get_evals_result()
        metric_name_ = list(history['learn'].keys())[0]
        lst_early_rounds.append(len(history['learn'][metric_name_]))

        ypreds = model.predict(Xvd)
        ypreds = np.rint(ypreds)
        #score_ = skmetrics.roc_auc_score(ser_yvalid.to_numpy().ravel(),ypreds)
        score_ = get_profit(yvd.to_numpy().ravel(),ypreds)
        scores.append(score_)

    #==============================================================
    score = np.mean(scores) # sometimes we can also use np.max 

    # counter to update early stopping dict
    early_stop_dict[objective.i] = lst_early_rounds
    joblib.dump(early_stop_dict, path_early_stop_dict)
    objective.i +=1

    return score


hasattr(objective,'i')

False


%%time

# NOTE: there is inherent non-determinism in optuna hyperparameter selection
#       we may not get the same hyperparameters when run twice.


if not hasattr(objective,'i'):
    objective.i = len(study.trials)
    
N_TRIALS = 1 # make it large
study.optimize(objective, n_trials=N_TRIALS,timeout=600)

CPU times: user 53.1 s, sys: 2.58 s, total: 55.7 s
Wall time: 18.4 s


show_methods(study)


# study.get_trials()
# FrozenTrial starting from number=0 ,1, ...


study.best_trial

FrozenTrial(number=50, value=68060.0, datetime_start=datetime.datetime(2020, 12, 30, 10, 8, 59, 396461), datetime_complete=datetime.datetime(2020, 12, 30, 10, 9, 35, 502818), params={'learning_rate': 0.01188013991897388, 'max_depth': 3, 'n_estimators': 1719, 'reg_lambda': 0.2746130196964879, 'scale_pos_weight': 5, 'subsample': 0.9449631695153495}, distributions={'learning_rate': LogUniformDistribution(high=1.0, low=0.01), 'max_depth': IntUniformDistribution(high=12, low=3, step=1), 'n_estimators': IntUniformDistribution(high=2000, low=100, step=1), 'reg_lambda': UniformDistribution(high=1, low=0.01), 'scale_pos_weight': CategoricalDistribution(choices=(2, 3, 4, 5)), 'subsample': UniformDistribution(high=1, low=0.6)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=51, state=TrialState.COMPLETE)


lst_best_n_estimators = early_stop_dict[study.best_trial.number]

best_n_estimators = np.max(early_stop_dict[study.best_trial.number]) 
lst_best_n_estimators, best_n_estimators

([1077, 926, 1214, 1053, 1078], 1214)


%%time
# Resume from last time
N_TRIALS = 10 # make it large

study = optuna.create_study(**params_optuna_study)
study.optimize(objective,
               n_trials=N_TRIALS,
               timeout=600,
               show_progress_bar=True)

/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/optuna/progress_bar.py:46: ExperimentalWarning:

Progress bar is experimental (supported from v1.2.0). The interface can change in the future.

CPU times: user 12min 28s, sys: 42.6 s, total: 13min 11s
Wall time: 4min 34s


# %%time

# # Resume from last time
# N_TRIALS = 10 # make it large eg. 10, 20, 100

# for _ in trange(N_TRIALS):
#     study = optuna.create_study(**params_optuna_study)
#     study.optimize(objective, n_trials=1)


show_methods(optuna.visualization)


optuna.visualization.plot_optimization_history(study)


optuna.visualization.plot_param_importances(study)


fig = optuna.visualization.plot_parallel_coordinate(study)
fig['layout']['width'] = 800

fig.show()


optuna.visualization.plot_slice(study,
    params=['learning_rate','max_depth']
                               )


optuna.visualization.plot_contour(study,params=['learning_rate','max_depth'])


print(f'Number of finished trials: {len(study.trials)}')

# best trial
best_trial = study.best_trial

# best params (note: even if we fix random_state, params changes each time.)
params_best = study.best_trial.params
params_best

Number of finished trials: 68

{'learning_rate': 0.01188013991897388,
 'max_depth': 3,
 'n_estimators': 1719,
 'reg_lambda': 0.2746130196964879,
 'scale_pos_weight': 5,
 'subsample': 0.9449631695153495}


notes = """
{'learning_rate': 0.01147263253168174,
 'max_depth': 4,
 'n_estimators': 1511,
 'reg_lambda': 0.18891811085825794,
 'scale_pos_weight': 5,
 'subsample': 0.811440063402815} # gives $84,500




"""


# 1. each time we run, we get different best params, optuna is not deterministic.
# 2. For some reason with 21 round I got 84k but with 31 rounds $82k.
#    This is because these hyperparams will overfit the test set, it will not
#    generalize for other validation folds. Therefore, use skf objective.

# this gives profit = $84,800
# I got this without using skf in optuna objective
# params_best = {'colsample_bylevel': 0.8028576058235254,
#  'max_depth': 3,
#  'n_estimators': 1258,
#  'reg_lambda': 0.9775888228189168,
#  'scale_pos_weight': 10,
#  'subsample': 0.6068141443822124}


model = CatBoostClassifier(**params_best,cat_features=cols_cat_idx,
                           verbose=False,random_state=SEED)

model.fit(df_Xtrain_full,ytrain_full)
ypreds = model.predict(df_Xtest)

yprobs2d = model.predict_proba(df_Xtest)
model_eval_bin('catboost+optuna',ytest,ypreds,yprobs2d,show_plots=False)

profit = get_profit(ytest,ypreds)
print(f"profit = ${profit:,d}")

              precision    recall  f1-score   support

           0       0.94      0.63      0.75      1035
           1       0.46      0.89      0.61       374

    accuracy                           0.70      1409
   macro avg       0.70      0.76      0.68      1409
weighted avg       0.81      0.70      0.71      1409

[[648 387]
 [ 42 332]]

profit = $85,700


model_eval_bin('catboost+optuna',ytest,ypreds,yprobs2d,show_plots=True)

              precision    recall  f1-score   support

           0       0.94      0.63      0.75      1035
           1       0.46      0.89      0.61       374

    accuracy                           0.70      1409
   macro avg       0.70      0.76      0.68      1409
weighted avg       0.81      0.70      0.71      1409

[[648 387]
 [ 42 332]]

/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/utils/deprecation.py:86: FutureWarning:

Function plot_roc_curve is deprecated; This will be removed in v0.5.0. Please use scikitplot.metrics.plot_roc instead.


import shap
shap.initjs()


explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(df_Xtest)

Setting feature_perturbation = "tree_path_dependent" because no background data was given.


df_Xtest.head(2)


# Look only first row of test data
# use matplotlib=True to avoid Javascript
idx = 0
shap.force_plot(explainer.expected_value,
                shap_values[idx,:],
                df_Xtest.iloc[idx,:],
                matplotlib=False,
                text_rotation=90)

# for this row, the predicted label is ...
# red features makes it higher
# blue features makes it smaller.


shap.summary_plot(shap_values, df_Xtest)


shap.summary_plot(shap_values, df_Xtest, plot_type='bar')


shap.dependence_plot(ind='TotalCharges', interaction_index='tenure',
                     shap_values=shap_values, 
                     features=df_Xtest,  
                     display_features=df_Xtest)


time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))

Time taken to run whole notebook: 0 hr 5 min 23 secs

	0	1	2	3
0	CatBoost	EFstrType	Pool	to_regressor
1	CatBoostClassifier	FeaturesData	core	train
2	CatBoostError	MetricVisualizer	cv	version
3	CatBoostRegressor	MultiRegressionCustomMetric	sum_models	widget
4	CatboostError	MultiRegressionCustomObjective	to_classifier

	0	1	2	3
0	best_iteration_	get_best_iteration	get_test_evals	random_seed_
1	best_score_	get_best_score	get_text_feature_indices	randomized_search
2	calc_feature_statistics	get_borders	get_tree_leaf_counts	save_borders
3	calc_leaf_indexes	get_cat_feature_indices	grid_search	save_model
4	classes_	get_evals_result	is_fitted	score
5	compare	get_feature_importance	iterate_leaf_indexes	set_feature_names
6	copy	get_leaf_values	learning_rate_	set_leaf_values
7	create_metric_calcer	get_leaf_weights	load_model	set_params
8	drop_unused_features	get_metadata	plot_partial_dependence	set_scale_and_bias
9	eval_metrics	get_object_importance	plot_predictions	shrink
10	evals_result_	get_param	plot_tree	staged_predict
11	feature_importances_	get_params	predict	staged_predict_log_proba
12	feature_names_	get_scale_and_bias	predict_log_proba	staged_predict_proba
13	fit	get_test_eval	predict_proba	tree_count_
14	get_all_params

	0	1	2	3
0	Any	delete_study	load_study	structs
1	Study	distributions	logging	study
2	TYPE_CHECKING	exceptions	multi_objective	trial
3	Trial	get_all_study_summaries	progress_bar	type_checking
4	TrialPruned	importance	pruners	types
5	create_study	importlib	samplers	version
6	create_trial	integration	storages	visualization
7	dashboard

Modelling Customer Churn using Catboost

Load the libraries

Colab¶

Useful Scripts

Load the Data

Data Processing

Data Processing¶

Data Types¶

Train and Test Data¶

Numerical and Categorical Features¶

Custom Features¶

Train Validation Split¶

Modelling

Catboost HPO Using Optuna¶

Optuna Visualization

Model Evaluation

Model Evaluation using SHAP¶

Time Taken

	customerID	gender	Partner	Dependents	tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	OnlineBackup	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Churn
0	1621-YNCJH	Female	Yes	No	36	Yes	Yes	Fiber optic	Yes	Yes	Yes	Yes	No	Yes	Two year	Yes	Credit card (automatic)	106.05	3834.4	No
1	7143-BQIBA	Male	No	No	10	Yes	No	DSL	Yes	No	No	Yes	Yes	No	Month-to-month	No	Bank transfer (automatic)	62.25	612.95	No
5632	0862-PRCBS	Female	Yes	Yes	68	Yes	Yes	Fiber optic	No	Yes	No	Yes	Yes	Yes	Two year	Yes	Credit card (automatic)	103.75	7039.45	No
5633	4656-CAURT	Male	No	No	69	Yes	Yes	No	No internet service	No internet service	No internet service	No internet service	No internet service	No internet service	Two year	No	Bank transfer (automatic)	23.95	1713.1	No

	SeniorCitizen	Partner	Dependents	tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	OnlineBackup	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Partner_Dependents	SeniorCitizen_Dependents	SeniorCitizen_Partner	SeniorCitizen_Contract	SeniorCitizen_TechSupport	SeniorCitizen_PaymentMethod	Contract_TotalCharges_mean
4555	No	No	No	16	Yes	No	No	No internet service	No internet service	No internet service	No internet service	No internet service	No internet service	Month-to-month	No	Credit card (automatic)	19.75	294.90	No_No	No_No	No_No	No_Month-to-month	No_No internet service	No_Credit card (automatic)	1370.923131
3379	No	Yes	No	72	No	No phone service	DSL	Yes	Yes	Yes	Yes	Yes	Yes	Two year	Yes	Electronic check	64.70	4746.05	Yes_No	No_No	No_Yes	No_Two year	No_Yes	No_Electronic check	3683.643192

	0	1	2	3
0	add_trial	enqueue_trial	set_system_attr	system_attrs
1	best_params	get_trials	set_user_attr	trials
2	best_trial	optimize	stop	trials_dataframe
3	best_value	pruner	study_name	user_attrs
4	direction	sampler

	0	1	2	3
0	is_available	plot_edf	plot_optimization_history	plot_param_importances
1	plot_contour	plot_intermediate_values	plot_parallel_coordinate	plot_slice

	SeniorCitizen	Partner	Dependents	tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	OnlineBackup	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Partner_Dependents	SeniorCitizen_Dependents	SeniorCitizen_Partner	SeniorCitizen_Contract	SeniorCitizen_TechSupport	SeniorCitizen_PaymentMethod	Contract_TotalCharges_mean
0	No	No	No	1	Yes	No	DSL	Yes	No	No	No	No	No	Month-to-month	Yes	Credit card (automatic)	48.6	48.6	No_No	No_No	No_No	No_Month-to-month	No_No	No_Credit card (automatic)	3683.643192
1	Yes	No	No	56	Yes	Yes	Fiber optic	No	Yes	Yes	Yes	Yes	No	Two year	Yes	Bank transfer (automatic)	99.9	5706.3	No_No	Yes_No	Yes_No	Yes_Two year	Yes_Yes	Yes_Bank transfer (automatic)	1370.923131