import time

time_start_notebook = time.time()


%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    # usual imports
    !pip install watermark
    !pip install scikit-plot
    !pip install catboost

    # HPO
    !git clone https://github.com/thuijskens/scikit-hyperband.git
    sys.path.append('scikit-hyperband/hyperband')

    print('Environment: Google Colab')


import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
import joblib
from tqdm import tqdm, trange
import plotly_express as px

# modelling
import sklearn.metrics as skmetrics
from sklearn.model_selection import StratifiedKFold

# boosting
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# settings
sns.set()
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly

%matplotlib inline
%load_ext watermark
%watermark -iv

seaborn        0.11.0
joblib         0.17.0
xgboost        1.2.0
plotly_express 0.4.1
autopep8       1.5.2
json           2.0.9
pandas         1.1.4
lightgbm       2.3.1
numpy          1.19.4
catboost       0.23.2


def show_methods(obj, ncols=4,contains=None):
    lst = [i for i in dir(obj) if i[0]!='_' ]
    if contains is not None:
        lst = [i for i in lst if contains in i]
    df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
    return df


def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
    import sklearn.metrics as skmetrics
    import scikitplot.metrics as skpmetrics
    import os

    acc       = skmetrics.accuracy_score(ytest,ypreds)
    precision = skmetrics.precision_score(ytest,ypreds)
    recall    = skmetrics.recall_score(ytest,ypreds)
    f1        = skmetrics.f1_score(ytest,ypreds)
    auc       = skmetrics.roc_auc_score(ytest,ypreds)

    print(skmetrics.classification_report(ytest,ypreds))
    print(skmetrics.confusion_matrix(ytest,ypreds))

    df_res = pd.DataFrame({'Accuracy':[acc],
                          'Precision': [precision],
                          'Recall': [recall],
                          'F1-score': [f1],
                          'AUC': [auc]},index=[model_name])

    display(df_res.style.format("{:.4f}"))
    if not os.path.isdir('../outputs'):
        os.makedirs('../outputs')
    o = '.' if ENV_COLAB else '../outputs/'
    df_res.to_csv(o+f'model_{model_name}.csv',index=True)

    if show_plots:
        skpmetrics.plot_precision_recall(ytest,yprobs2d) # more focus on minority
        skpmetrics.plot_roc_curve(ytest,yprobs2d) # equal focus on both groups
        skpmetrics.plot_confusion_matrix(ytest,ypreds)


def get_profit(y_true, y_pred):
    tn, fp, fn, tp = skmetrics.confusion_matrix(y_true,y_pred).ravel()
    profit = 400*tp - 200*fn - 100*fp
    return profit

scoring = skmetrics.make_scorer(get_profit, greater_is_better=True)


path_data_train = '../data/raw/train.csv'
path_data_test = '../data/raw/test.csv'

if ENV_COLAB:
    path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
    path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv'


df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)

print(df_train.shape)
print(df_test.shape)
df_train.head(2).append(df_train.tail(2))

(5634, 21)
(1409, 21)


target_name = 'Churn'


px.histogram(df_train, x=target_name,height=300,width=300)


px.histogram(df_train, x='gender', color=target_name,height=300,width=300)


df_train['TotalCharges'] = pd.to_numeric(df_train['TotalCharges'],errors='coerce').fillna(0)
df_test['TotalCharges'] = pd.to_numeric(df_test['TotalCharges'],errors='coerce').fillna(0)


df_train['SeniorCitizen'] = df_train['SeniorCitizen'].map({0:'No',1:'Yes'})
df_test['SeniorCitizen'] = df_test['SeniorCitizen'].map({0:'No',1:'Yes'})


df_Xtrain = df_train.drop(target_name,axis=1)
df_Xtest = df_test.drop(target_name,axis=1)

ser_ytrain = df_train[target_name].map({'No':0,'Yes':1})
ser_ytest = df_test[target_name].map({'No':0,'Yes':1})

ytrain = np.array(ser_ytrain).flatten()
ytest = np.array(ser_ytest).flatten()

index_name = 'customerID'
ser_train_ids = df_Xtrain.pop(index_name)
ser_test_ids = df_Xtest.pop(index_name)


df_Xtrain.head(2)


cols_num = list(df_train.select_dtypes('number').columns)
cols_num

['tenure', 'MonthlyCharges', 'TotalCharges']


cols_cat = list(df_train.select_dtypes('object').columns)

# gender is no good predictor as seen in EDA
cols_exclude = ['customerID','gender','TotalCharges'] + [target_name]
cols_cat = [ i for i in cols_cat if i not in cols_exclude ] + ['SeniorCitizen']

print(cols_cat)

['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'SeniorCitizen']


cols_num = ['TotalCharges','tenure', 'MonthlyCharges']


cols_num_old = cols_num
cols_cat_old = cols_cat


def combine_two_features(dfx,A,B):
    dfx = dfx.copy()
    assert len(A) == len(B)
    for a,b in zip(A,B):
        dfx[a+'_'+b] = dfx[a] + '_' + dfx[b]

    return dfx

combineA = ['Partner']
combineB = ['Dependents']
combineA = combineA + ['SeniorCitizen']*5
combineB = combineB + ['Dependents','Partner','Contract',
                       'TechSupport','PaymentMethod']

cols_cat_new = [f'{a}_{b}' for a,b in zip(combineA,combineB)]


cols_cat = list(set(cols_cat + cols_cat_new))
print(cols_cat_new)
# print(cols_cat)

df_Xtrain = combine_two_features(df_Xtrain,combineA,combineB)
df_Xtest = combine_two_features(df_Xtest,combineA,combineB)

['Partner_Dependents', 'SeniorCitizen_Dependents', 'SeniorCitizen_Partner', 'SeniorCitizen_Contract', 'SeniorCitizen_TechSupport', 'SeniorCitizen_PaymentMethod']


def create_groupby_features(dfx,cat,num,agg):
    dfx = dfx.copy()
    for c in cat:
        for n in num:
            for a in agg:
                name = f"{c}_{n}_{a}"
                dfx[name] = df_train.groupby(c)[n].transform(a)
    return dfx


# Using more features gave me worse AUC.
# cols_grpcat = ['Contract','PaymentMethod']
# cols_grpnum = ['TotalCharges','MonthlyCharges']
# cols_grpagg = ['mean', 'max', 'min']

cols_grpcat = ['Contract']
cols_grpnum = ['TotalCharges']
cols_grpagg = ['mean']

cols_num_new = [f'{c}_{n}_{a}' 
                for c in cols_grpcat
                for n in cols_grpnum
                for a in cols_grpagg]

cols_num = list(set(cols_num + cols_num_new))
print(cols_num_new)
# print(cols_num)

df_Xtrain = create_groupby_features(df_Xtrain,cols_grpcat, cols_grpnum, cols_grpagg)
df_Xtest = create_groupby_features(df_Xtest,cols_grpcat, cols_grpnum, cols_grpagg)

['Contract_TotalCharges_mean']


df_Xtrain.head(2)


cols_drop = ['gender']

df_Xtrain = df_Xtrain.drop(cols_drop,axis=1)
df_Xtest = df_Xtest.drop(cols_drop,axis=1)


all_features = df_Xtrain.columns.tolist()
cols_cat_idx = [all_features.index(i)
                        for i in cols_cat]


# make sure no nans

df_Xtrain.isna().sum().sum(), df_Xtest.isna().sum().sum()

(0, 0)


df_Xtrain_full = df_Xtrain.copy()
ser_ytrain_full = ser_ytrain.copy()
ytrain_full = np.array(ser_ytrain_full).flatten()


# one hot encode
df_Xtrain_full = pd.get_dummies(df_Xtrain_full,columns=cols_cat)
df_Xtest = pd.get_dummies(df_Xtest,columns=cols_cat)


df_Xtrain_full.head()


# check if all nunique >= 2
df_Xtrain_full.apply(pd.Series.nunique).nsmallest(5)

SeniorCitizen_Partner_No_No        2
SeniorCitizen_Partner_No_Yes       2
SeniorCitizen_Partner_Yes_No       2
SeniorCitizen_Partner_Yes_Yes      2
SeniorCitizen_TechSupport_No_No    2
dtype: int64


# check if all nunique >= 2
df_Xtest.apply(pd.Series.nunique).nsmallest(5)

SeniorCitizen_Partner_No_No        2
SeniorCitizen_Partner_No_Yes       2
SeniorCitizen_Partner_Yes_No       2
SeniorCitizen_Partner_Yes_Yes      2
SeniorCitizen_TechSupport_No_No    2
dtype: int64


# check if all are numbers
df_Xtrain_full.sum().sum(), df_Xtest.sum().sum()

(26266765.14999999, 6655873.894557063)


# check for nans
df_Xtrain_full.isna().sum().sum(), df_Xtest.isna().sum().sum()

(0, 0)


from sklearn.model_selection import train_test_split

df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
    df_Xtrain_full, ser_ytrain_full,
    test_size=0.2,
    random_state=SEED,
    stratify=ser_ytrain_full)


Xtrain_full = df_Xtrain_full.to_numpy()
Xtrain = df_Xtrain.to_numpy()
Xvalid = df_Xvalid.to_numpy()

ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()


print(f"df_train   : {df_train.shape}\n")

print(f"df_Xtrain  : {df_Xtrain.shape}")
print(f"ser_ytrain : {ser_ytrain.shape}\n")

print(f"df_Xvalid  : {df_Xvalid.shape}")
print(f"ser_yvalid : {ser_yvalid.shape}\n")

print(f"df_test    : {df_test.shape}")
print(f"ser_ytest  : This does not exist.")

df_Xtrain.head(2)

df_train   : (5634, 21)

df_Xtrain  : (4507, 77)
ser_ytrain : (4507,)

df_Xvalid  : (1127, 77)
ser_yvalid : (1127,)

df_test    : (1409, 21)
ser_ytest  : This does not exist.


model_name = 'lightgbm'
hpo_name = 'optuna'


from lightgbm import LGBMClassifier


model = LGBMClassifier(random_state=SEED,n_estimators=1000)

model.fit(df_Xtrain,ytrain,
          eval_set=(df_Xvalid, ser_yvalid),
          early_stopping_rounds=20,
          verbose=0,
         )


ypreds = model.predict(df_Xtest)
yprobs2d = model.predict_proba(df_Xtest)

profit = get_profit(ytest,ypreds)
print(f'test profit = ${profit:,d}')
model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=False)

test profit = $-21,200
              precision    recall  f1-score   support

           0       0.78      0.93      0.85      1035
           1       0.59      0.27      0.37       374

    accuracy                           0.76      1409
   macro avg       0.69      0.60      0.61      1409
weighted avg       0.73      0.76      0.72      1409

[[965  70]
 [273 101]]


show_methods(model)


e = model.evals_result_
out = """
{'valid_0': OrderedDict([('binary_logloss',
               [0.5495546455956963,
                0.5287290543184567,
                ...]

"""

k0 = list(e.keys())[0]
k1 = list(e[k0].keys())[0]
print(e[k0][k1][:2])

#n_used = len(e['valid_0']['binary_logloss']) # only these trees are used

n_used = len(e[k0][k1])
print('early stop used: ', n_used)

[0.5495546455956963, 0.5287290543184567]
early stop used:  48


import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING) # use INFO to see progress

from optuna.pruners import SuccessiveHalvingPruner


show_methods(optuna)


show_methods(optuna.trial.Trial)


params_optuna_study = dict(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
    study_name=f'{model_name}_{hpo_name}',
    storage='sqlite:///' + model_name + f'_{hpo_name}_churn.db',
    load_if_exists=True,
    pruner=optuna.pruners.SuccessiveHalvingPruner(min_resource=100)
)

study = optuna.create_study(**params_optuna_study)
n_studies = len(study.trials)
print(f'Number of finished trials: {n_studies}')

Number of finished trials: 0


path_early_stop_dict = f'../artifacts/{model_name}_{hpo_name}_early_stop_dict.joblib'
if n_studies == 0:
    early_stop_dict = {}
else:
    early_stop_dict = joblib.load(path_early_stop_dict)
    print('last study early stopping rounds\n'+'='*35)
    print(early_stop_dict[n_studies-1])


def objective(trial): # this is slow but more stable.
    
    global early_stop_dict

    params_lgb_optuna = {
        'n_estimators': trial.suggest_int('n_estimators', 100,5000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01,1.0),
        
        'max_depth': trial.suggest_int('max_depth', 3, 16),
        'scale_pos_weight': trial.suggest_categorical('scale_pos_weight', [2,3,4,5,7,8,9,10]),

        'reg_alpha' : trial.suggest_uniform('reg_alpha', 0.01, 1),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.01, 1),

        'subsample'       : trial.suggest_uniform('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1),
    }
    # skf is more time-consuming but more stable.
    skf = StratifiedKFold(n_splits=5,random_state=SEED,shuffle=True)
    scores = []
    lst_early_rounds = []
    for idx_tr, idx_vd in skf.split(df_Xtrain_full, ser_ytrain_full):
        Xtr,Xvd = df_Xtrain_full.iloc[idx_tr], df_Xtrain_full.iloc[idx_vd]
        ytr,yvd = ser_ytrain_full[idx_tr], ser_ytrain_full.iloc[idx_vd]

        model = lgb.LGBMClassifier(random_state=SEED,**params_lgb_optuna)
        model.fit(Xtr, ytr,
            eval_set=[(Xvd, yvd)],
            verbose=0,
            early_stopping_rounds=100)

        # save early stopping dictionary
        e = model.evals_result_
        k0 = list(e.keys())[0]
        k1 = list(e[k0].keys())[0]
        n_used = len(e[k0][k1]) # only these trees are used 
        lst_early_rounds.append(n_used)

        ypreds = model.predict(Xvd)
        ypreds = np.rint(ypreds)
        #score_ = skmetrics.roc_auc_score(ser_yvalid.to_numpy().ravel(),ypreds)
        score_ = get_profit(yvd.to_numpy().ravel(),ypreds)
        scores.append(score_)

    #==============================================================
    score = np.mean(scores) # sometimes we can also use np.max 

    # counter to update early stopping dict
    early_stop_dict[objective.i] = lst_early_rounds
    joblib.dump(early_stop_dict, path_early_stop_dict)
    objective.i +=1

    return score


hasattr(objective,'i')

False


%%time

# NOTE: there is inherent non-determinism in optuna hyperparameter selection
#       we may not get the same hyperparameters when run twice.


if not hasattr(objective,'i'):
    objective.i = len(study.trials)
    
N_TRIALS = 1 # make it large
study.optimize(objective, n_trials=N_TRIALS,timeout=600)

print(f'Number of finished trials: {len(study.trials)}')
params_best = study.best_trial.params
model = LGBMClassifier(**params_best,verbose=0,random_state=SEED)

model.fit(df_Xtrain_full,ytrain_full)
ypreds = model.predict(df_Xtest)
profit = get_profit(ytest,ypreds)
print(f"profit = ${profit:,d}")

Number of finished trials: 1
profit = $18,100
CPU times: user 57 s, sys: 2.7 s, total: 59.7 s
Wall time: 24.9 s


show_methods(study)


# study.get_trials()
# FrozenTrial starting from number=0 ,1, ...


study.best_trial

FrozenTrial(number=0, value=44220.0, datetime_start=datetime.datetime(2020, 12, 30, 13, 58, 53, 719337), datetime_complete=datetime.datetime(2020, 12, 30, 13, 58, 57, 943567), params={'colsample_bytree': 0.7263699514296151, 'learning_rate': 0.112918456991973, 'max_depth': 13, 'n_estimators': 4027, 'reg_alpha': 0.1651439692851411, 'reg_lambda': 0.1946025113317489, 'scale_pos_weight': 9, 'subsample': 0.6050538712212918}, distributions={'colsample_bytree': UniformDistribution(high=1, low=0.5), 'learning_rate': LogUniformDistribution(high=1.0, low=0.01), 'max_depth': IntUniformDistribution(high=16, low=3, step=1), 'n_estimators': IntUniformDistribution(high=5000, low=100, step=1), 'reg_alpha': UniformDistribution(high=1, low=0.01), 'reg_lambda': UniformDistribution(high=1, low=0.01), 'scale_pos_weight': CategoricalDistribution(choices=(2, 3, 4, 5, 7, 8, 9, 10)), 'subsample': UniformDistribution(high=1, low=0.5)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=1, state=TrialState.COMPLETE)


lst_best_n_estimators = early_stop_dict[study.best_trial.number]

best_n_estimators = np.max(early_stop_dict[study.best_trial.number]) 
lst_best_n_estimators, best_n_estimators

([104, 104, 104, 104, 104], 104)


%%time
# Resume from last time
N_TRIALS = 100 # make it large

study = optuna.create_study(**params_optuna_study)
study.optimize(objective,
               n_trials=N_TRIALS,
               timeout=60*10, # 60 means 1 minutes
               show_progress_bar=True)

params_best = study.best_trial.params
print(f'Number of finished trials: {len(study.trials)}')
print(params_best)
print()


model = LGBMClassifier(**params_best,verbose=0,random_state=SEED)
model.fit(df_Xtrain,ytrain)
vdpreds = model.predict(df_Xvalid)
profit = get_profit(yvalid,vdpreds)
print(f"validation profit = ${profit:,d}")

# test
model = LGBMClassifier(**params_best,verbose=0,random_state=SEED)
model.fit(df_Xtrain_full,ytrain_full)
ypreds = model.predict(df_Xtest)
profit = get_profit(ytest,ypreds)
print(f"test profit       = ${profit:,d}")

out = """

Number of finished trials: 101
{'colsample_bytree': 0.8340780751883098, 'learning_rate': 0.6075408640727997, 'max_depth': 10, 'n_estimators': 3553, 'reg_alpha': 0.6850804855842425, 'reg_lambda': 0.7721552432132202, 'scale_pos_weight': 7, 'subsample': 0.9578772811050695}

validation profit = $29,400
test profit       = $22,200
-------------------------------------------------------------------------
Number of finished trials: 202
{'colsample_bytree': 0.8340780751883098, 'learning_rate': 0.6075408640727997, 'max_depth': 10, 'n_estimators': 3553, 'reg_alpha': 0.6850804855842425, 'reg_lambda': 0.7721552432132202, 'scale_pos_weight': 7, 'subsample': 0.9578772811050695}

validation profit = $32,400
test profit       = $18,200

When I run more iterations, validation improved but test decreased.
We should never look at test set because we don't have ytest labels.
We should check standard deviation of valid splits, and do more iterations.

"""

/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/optuna/progress_bar.py:46: ExperimentalWarning:

Progress bar is experimental (supported from v1.2.0). The interface can change in the future.

Number of finished trials: 101
{'colsample_bytree': 0.7313627571689273, 'learning_rate': 0.5060823593539764, 'max_depth': 15, 'n_estimators': 2914, 'reg_alpha': 0.06466622093877408, 'reg_lambda': 0.32005752267806997, 'scale_pos_weight': 9, 'subsample': 0.9034618429683745}

validation profit = $22,100
test profit       = $17,500
CPU times: user 16min 7s, sys: 50 s, total: 16min 57s
Wall time: 8min 8s


show_methods(optuna.visualization)


optuna.visualization.plot_optimization_history(study)


%%time
# optuna.visualization.plot_param_importances(study)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.96 µs


fig = optuna.visualization.plot_parallel_coordinate(study)
fig['layout']['width'] = 800

fig.show()


optuna.visualization.plot_slice(study,
    params=['learning_rate','max_depth'])


# optuna.visualization.plot_contour(study,params=['learning_rate','max_depth'])


profit = get_profit(ytest,ypreds)
print(f"test profit       = ${profit:,d}")

model_eval_bin(f'{model_name}+{hpo_name}',ytest,ypreds,yprobs2d,show_plots=True)

test profit       = $17,500
              precision    recall  f1-score   support

           0       0.82      0.84      0.83      1035
           1       0.53      0.48      0.50       374

    accuracy                           0.75      1409
   macro avg       0.67      0.66      0.67      1409
weighted avg       0.74      0.75      0.74      1409

[[872 163]
 [193 181]]

/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/utils/deprecation.py:86: FutureWarning:

Function plot_roc_curve is deprecated; This will be removed in v0.5.0. Please use scikitplot.metrics.plot_roc instead.


import shap
shap.initjs()


explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(df_Xtest)

Setting feature_perturbation = "tree_path_dependent" because no background data was given.
LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


df_Xtest.head(2)


# Look only first row of test data
# use matplotlib=True to avoid Javascript
idx = 0
shap.force_plot(explainer.expected_value,
                shap_values[idx,:],
                df_Xtest.iloc[idx,:],
                matplotlib=False,
                text_rotation=90)

# for this row, the predicted label is ...
# red features makes it higher
# blue features makes it smaller.

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-62-0f954fe3c3ee> in <module>
      3 idx = 0
      4 shap.force_plot(explainer.expected_value,
----> 5                 shap_values[idx,:],
      6                 df_Xtest.iloc[idx,:],
      7                 matplotlib=False,

TypeError: list indices must be integers or slices, not tuple


shap.summary_plot(shap_values, df_Xtest)


shap.summary_plot(shap_values, df_Xtest, plot_type='bar')


shap.dependence_plot(ind='TotalCharges', interaction_index='tenure',
                     shap_values=shap_values, 
                     features=df_Xtest,  
                     display_features=df_Xtest)


time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))

	tenure	MonthlyCharges	TotalCharges	Contract_TotalCharges_mean	SeniorCitizen_Partner_No_No	SeniorCitizen_Partner_No_Yes	SeniorCitizen_Partner_Yes_No	SeniorCitizen_TechSupport_No_No internet service	SeniorCitizen_TechSupport_No_Yes	SeniorCitizen_TechSupport_Yes_No internet service	Partner_No	Partner_Yes	PaymentMethod_Bank transfer (automatic)	PaymentMethod_Credit card (automatic)	PaymentMethod_Mailed check	InternetService_DSL	InternetService_Fiber optic	InternetService_No	Partner_Dependents_No_No	Partner_Dependents_Yes_No	Partner_Dependents_Yes_Yes	MultipleLines_No	MultipleLines_Yes	TechSupport_No internet service	TechSupport_Yes	OnlineSecurity_No internet service	OnlineSecurity_Yes	SeniorCitizen_PaymentMethod_No_Bank transfer (automatic)	SeniorCitizen_PaymentMethod_No_Credit card (automatic)	SeniorCitizen_PaymentMethod_No_Mailed check	SeniorCitizen_PaymentMethod_Yes_Mailed check	SeniorCitizen_Dependents_No_No	SeniorCitizen_Dependents_No_Yes	SeniorCitizen_Dependents_Yes_No	DeviceProtection_No	DeviceProtection_No internet service	DeviceProtection_Yes	StreamingMovies_No	StreamingMovies_No internet service	StreamingMovies_Yes	Dependents_No	Dependents_Yes	PaperlessBilling_No	PaperlessBilling_Yes	SeniorCitizen_Contract_No_Month-to-month	SeniorCitizen_Contract_No_Two year	SeniorCitizen_Contract_Yes_Month-to-month	PhoneService_Yes	Contract_Month-to-month	Contract_Two year	SeniorCitizen_No	SeniorCitizen_Yes	StreamingTV_No	StreamingTV_No internet service	StreamingTV_Yes	OnlineBackup_No	OnlineBackup_No internet service	OnlineBackup_Yes
0	36	106.05	3834.40	3683.643192	0	1	0	0	1	0	0	1	0	1	0	0	1	0	0	1	0	0	1	0	1	0	1	0	1	0	0	1	0	0	0	0	1	0	0	1	1	0	0	1	0	1	0	1	0	1	1	0	1	0	0	0	0	1
1	10	62.25	612.95	1370.923131	1	0	0	0	1	0	1	0	1	0	0	1	0	0	1	0	0	1	0	0	1	0	1	1	0	0	0	1	0	0	1	0	0	1	0	0	1	0	1	0	1	0	0	1	1	0	1	0	0	0	1	1	0	0
2	25	19.15	477.60	1370.923131	0	1	0	1	0	0	0	1	0	0	1	0	0	1	0	0	1	1	0	1	0	1	0	0	0	1	0	0	1	0	0	1	0	0	1	0	0	1	0	1	1	0	0	1	1	0	1	0	0	1	0	0	1	0
3	7	20.00	137.60	1370.923131	1	0	0	1	0	0	1	0	1	0	0	0	0	1	1	0	0	1	0	1	0	1	0	1	0	0	0	1	0	0	0	1	0	0	1	0	1	0	1	0	1	0	0	1	1	0	1	0	0	1	0	0	1	0
4	24	20.30	459.95	1370.923131	0	0	1	0	0	1	1	0	0	0	1	0	0	1	1	0	0	1	0	1	0	1	0	0	0	0	1	0	0	1	0	1	0	0	1	0	1	0	0	1	0	0	1	1	1	0	0	1	0	1	0	0	1	0

	tenure	MonthlyCharges	TotalCharges	Contract_TotalCharges_mean	SeniorCitizen_Partner_No_No	SeniorCitizen_Partner_No_Yes	SeniorCitizen_Partner_Yes_No	SeniorCitizen_Partner_Yes_Yes	SeniorCitizen_TechSupport_No_No	SeniorCitizen_TechSupport_No_No internet service	SeniorCitizen_TechSupport_No_Yes	SeniorCitizen_TechSupport_Yes_No	SeniorCitizen_TechSupport_Yes_No internet service	SeniorCitizen_TechSupport_Yes_Yes	Partner_No	Partner_Yes	PaymentMethod_Bank transfer (automatic)	PaymentMethod_Credit card (automatic)	PaymentMethod_Electronic check	PaymentMethod_Mailed check	InternetService_DSL	InternetService_Fiber optic	InternetService_No	Partner_Dependents_No_No	Partner_Dependents_No_Yes	Partner_Dependents_Yes_No	Partner_Dependents_Yes_Yes	MultipleLines_No	MultipleLines_No phone service	MultipleLines_Yes	TechSupport_No	TechSupport_No internet service	TechSupport_Yes	OnlineSecurity_No	OnlineSecurity_No internet service	OnlineSecurity_Yes	SeniorCitizen_PaymentMethod_No_Bank transfer (automatic)	SeniorCitizen_PaymentMethod_No_Credit card (automatic)	SeniorCitizen_PaymentMethod_No_Electronic check	SeniorCitizen_PaymentMethod_No_Mailed check	SeniorCitizen_PaymentMethod_Yes_Bank transfer (automatic)	SeniorCitizen_PaymentMethod_Yes_Credit card (automatic)	SeniorCitizen_PaymentMethod_Yes_Electronic check	SeniorCitizen_PaymentMethod_Yes_Mailed check	SeniorCitizen_Dependents_No_No	SeniorCitizen_Dependents_No_Yes	SeniorCitizen_Dependents_Yes_No	SeniorCitizen_Dependents_Yes_Yes	DeviceProtection_No	DeviceProtection_No internet service	DeviceProtection_Yes	StreamingMovies_No	StreamingMovies_No internet service	StreamingMovies_Yes	Dependents_No	Dependents_Yes	PaperlessBilling_No	PaperlessBilling_Yes	SeniorCitizen_Contract_No_Month-to-month	SeniorCitizen_Contract_No_One year	SeniorCitizen_Contract_No_Two year	SeniorCitizen_Contract_Yes_Month-to-month	SeniorCitizen_Contract_Yes_One year	SeniorCitizen_Contract_Yes_Two year	PhoneService_No	PhoneService_Yes	Contract_Month-to-month	Contract_One year	Contract_Two year	SeniorCitizen_No	SeniorCitizen_Yes	StreamingTV_No	StreamingTV_No internet service	StreamingTV_Yes	OnlineBackup_No	OnlineBackup_No internet service	OnlineBackup_Yes
4555	16	19.75	294.90	1370.923131	1	0	0	0	0	1	0	0	0	0	1	0	0	1	0	0	0	0	1	1	0	0	0	1	0	0	0	1	0	0	1	0	0	1	0	0	0	0	0	0	1	0	0	0	0	1	0	0	1	0	1	0	1	0	1	0	0	0	0	0	0	1	1	0	0	1	0	0	1	0	0	1	0
3379	72	64.70	4746.05	3683.643192	0	1	0	0	0	0	1	0	0	0	0	1	0	0	1	0	1	0	0	0	0	1	0	0	1	0	0	0	1	0	0	1	0	0	1	0	0	0	0	0	1	0	0	0	0	0	1	0	0	1	1	0	0	1	0	0	1	0	0	0	1	0	0	0	1	1	0	0	0	1	0	0	1

	0	1	2	3
0	best_iteration_	fit	n_estimators	reg_alpha
1	best_score_	get_params	n_features_	reg_lambda
2	booster_	importance_type	n_jobs	score
3	boosting_type	learning_rate	num_leaves	set_params
4	class_weight	max_depth	objective	silent
5	classes_	min_child_samples	objective_	subsample
6	colsample_bytree	min_child_weight	predict	subsample_for_bin
7	evals_result_	min_split_gain	predict_proba	subsample_freq
8	feature_importances_	n_classes_	random_state

	0	1	2	3
0	Any	delete_study	load_study	structs
1	Study	distributions	logging	study
2	TYPE_CHECKING	exceptions	multi_objective	trial
3	Trial	get_all_study_summaries	progress_bar	type_checking
4	TrialPruned	importance	pruners	types
5	create_study	importlib	samplers	version
6	create_trial	integration	storages	visualization
7	dashboard

	tenure	MonthlyCharges	TotalCharges	Contract_TotalCharges_mean	SeniorCitizen_Partner_No_No	SeniorCitizen_Partner_No_Yes	SeniorCitizen_Partner_Yes_No	SeniorCitizen_Partner_Yes_Yes	SeniorCitizen_TechSupport_No_No	SeniorCitizen_TechSupport_No_No internet service	SeniorCitizen_TechSupport_No_Yes	SeniorCitizen_TechSupport_Yes_No	SeniorCitizen_TechSupport_Yes_No internet service	SeniorCitizen_TechSupport_Yes_Yes	Partner_No	Partner_Yes	PaymentMethod_Bank transfer (automatic)	PaymentMethod_Credit card (automatic)	PaymentMethod_Electronic check	PaymentMethod_Mailed check	InternetService_DSL	InternetService_Fiber optic	InternetService_No	Partner_Dependents_No_No	Partner_Dependents_No_Yes	Partner_Dependents_Yes_No	Partner_Dependents_Yes_Yes	MultipleLines_No	MultipleLines_No phone service	MultipleLines_Yes	TechSupport_No	TechSupport_No internet service	TechSupport_Yes	OnlineSecurity_No	OnlineSecurity_No internet service	OnlineSecurity_Yes	SeniorCitizen_PaymentMethod_No_Bank transfer (automatic)	SeniorCitizen_PaymentMethod_No_Credit card (automatic)	SeniorCitizen_PaymentMethod_No_Electronic check	SeniorCitizen_PaymentMethod_No_Mailed check	SeniorCitizen_PaymentMethod_Yes_Bank transfer (automatic)	SeniorCitizen_PaymentMethod_Yes_Credit card (automatic)	SeniorCitizen_PaymentMethod_Yes_Electronic check	SeniorCitizen_PaymentMethod_Yes_Mailed check	SeniorCitizen_Dependents_No_No	SeniorCitizen_Dependents_No_Yes	SeniorCitizen_Dependents_Yes_No	SeniorCitizen_Dependents_Yes_Yes	DeviceProtection_No	DeviceProtection_No internet service	DeviceProtection_Yes	StreamingMovies_No	StreamingMovies_No internet service	StreamingMovies_Yes	Dependents_No	Dependents_Yes	PaperlessBilling_No	PaperlessBilling_Yes	SeniorCitizen_Contract_No_Month-to-month	SeniorCitizen_Contract_No_One year	SeniorCitizen_Contract_No_Two year	SeniorCitizen_Contract_Yes_Month-to-month	SeniorCitizen_Contract_Yes_One year	SeniorCitizen_Contract_Yes_Two year	PhoneService_No	PhoneService_Yes	Contract_Month-to-month	Contract_One year	Contract_Two year	SeniorCitizen_No	SeniorCitizen_Yes	StreamingTV_No	StreamingTV_No internet service	StreamingTV_Yes	OnlineBackup_No	OnlineBackup_No internet service	OnlineBackup_Yes
0	1	48.6	48.6	3683.643192	1	0	0	0	1	0	0	0	0	0	1	0	0	1	0	0	1	0	0	1	0	0	0	1	0	0	1	0	0	0	0	1	0	1	0	0	0	0	0	0	1	0	0	0	1	0	0	1	0	0	1	0	0	1	1	0	0	0	0	0	0	1	1	0	0	1	0	1	0	0	1	0	0
1	56	99.9	5706.3	1370.923131	0	0	1	0	0	0	0	0	0	1	1	0	1	0	0	0	0	1	0	1	0	0	0	0	0	1	0	0	1	1	0	0	0	0	0	0	1	0	0	0	0	0	1	0	0	0	1	1	0	0	1	0	0	1	0	0	0	0	0	1	0	1	0	0	1	0	1	0	0	1	0	0	1

Modelling Customer Churn using LightGBM

Load the libraries

Colab¶

Useful Scripts

Load the Data

Data Processing

Data Processing¶

Data Types¶

Train and Test Data¶

Numerical and Categorical Features¶

Custom Features¶

Train Validation Split¶

Modelling

LightGBM HPO Using Optuna¶

Optuna Visualization

Model Evaluation

Model Evaluation using SHAP¶

Time Taken

	customerID	gender	Partner	Dependents	tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	OnlineBackup	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Churn
0	1621-YNCJH	Female	Yes	No	36	Yes	Yes	Fiber optic	Yes	Yes	Yes	Yes	No	Yes	Two year	Yes	Credit card (automatic)	106.05	3834.4	No
1	7143-BQIBA	Male	No	No	10	Yes	No	DSL	Yes	No	No	Yes	Yes	No	Month-to-month	No	Bank transfer (automatic)	62.25	612.95	No
5632	0862-PRCBS	Female	Yes	Yes	68	Yes	Yes	Fiber optic	No	Yes	No	Yes	Yes	Yes	Two year	Yes	Credit card (automatic)	103.75	7039.45	No
5633	4656-CAURT	Male	No	No	69	Yes	Yes	No	No internet service	No internet service	No internet service	No internet service	No internet service	No internet service	Two year	No	Bank transfer (automatic)	23.95	1713.1	No

	0	1	2	3
0	datetime_start	report	suggest_categorical	suggest_loguniform
1	distributions	set_system_attr	suggest_discrete_uniform	suggest_uniform
2	number	set_user_attr	suggest_float	system_attrs
3	params	should_prune	suggest_int	user_attrs

	0	1	2	3
0	add_trial	enqueue_trial	set_system_attr	system_attrs
1	best_params	get_trials	set_user_attr	trials
2	best_trial	optimize	stop	trials_dataframe
3	best_value	pruner	study_name	user_attrs
4	direction	sampler

	0	1	2	3
0	is_available	plot_edf	plot_optimization_history	plot_param_importances
1	plot_contour	plot_intermediate_values	plot_parallel_coordinate	plot_slice