import time

time_start_notebook = time.time()


%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    # usual imports
    !pip install watermark
    !pip install scikit-plot

    # HPO
    !git clone https://github.com/thuijskens/scikit-hyperband.git
    sys.path.append('scikit-hyperband/hyperband')

    # update modules
    !pip uninstall xgboost
    !pip install -U xgboost

    print('Environment: Google Colab')


sys.path.append('/Users/poudel/Dropbox/a00_Resources/hyperband')
from hyperband_search import HyperbandSearchCV


import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
import joblib
from tqdm import tqdm_notebook as tqdm
import plotly_express as px

# modelling
from sklearn.preprocessing import OneHotEncoder
import imblearn
from imblearn.over_sampling import SMOTE
import sklearn.metrics as skmetrics

# pipeline
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

# boosting
from lightgbm import LGBMClassifier

# settings
sns.set()
SEED = 100  # shared random seed for the models and CV splitter below
# Use fully-qualified option names: the abbreviated 'max_columns' pattern is
# ambiguous on pandas versions that also define 'styler.render.max_columns'.
pd.set_option('display.max_columns',100)
pd.set_option('display.max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly

%matplotlib inline
%load_ext watermark
%watermark -iv

joblib         0.17.0
imblearn       0.7.0
json           2.0.9
numpy          1.19.4
pandas         1.1.4
seaborn        0.11.0
plotly_express 0.4.1
autopep8       1.5.2


def show_methods(obj, ncols=4,contains=None):
    """Tabulate the public attribute names of ``obj``.

    Parameters
    ----------
    obj : object
        Anything accepted by ``dir``.
    ncols : int
        Number of columns the names are spread over.
    contains : str, optional
        When given, keep only the names that contain this substring.

    Returns
    -------
    pandas.DataFrame
        Names laid out column by column; unused cells hold ''.
    """
    public_names = [name for name in dir(obj) if not name.startswith('_')]
    if contains is not None:
        public_names = [name for name in public_names if contains in name]

    # Each chunk becomes one row, so transposing turns chunks into columns.
    chunks = np.array_split(public_names, ncols)
    table = pd.DataFrame(chunks).transpose()
    return table.fillna('')


def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
    """Evaluate a binary classifier, display the metrics and save them to CSV.

    Prints the classification report and confusion matrix, displays a one-row
    metrics table, writes that table to ``model_{model_name}.csv`` and
    optionally draws the precision-recall, ROC and confusion-matrix plots.

    Parameters
    ----------
    model_name : str
        Label used as the table index and in the output file name.
    ytest : array-like
        True binary labels.
    ypreds : array-like
        Predicted hard labels.
    yprobs2d : array-like of shape (n_samples, 2)
        Predicted class probabilities; column 1 is taken to be the positive
        class (assumes ``predict_proba`` column order -- confirm for
        non-0/1 labels).
    show_plots : bool
        Whether to draw the scikit-plot diagnostic plots.

    Notes
    -----
    Relies on the notebook globals ``ENV_COLAB``, ``pd`` and IPython's
    ``display``.
    """
    import os

    import numpy as np
    import sklearn.metrics as skmetrics
    import scikitplot.metrics as skpmetrics

    acc       = skmetrics.accuracy_score(ytest,ypreds)
    precision = skmetrics.precision_score(ytest,ypreds)
    recall    = skmetrics.recall_score(ytest,ypreds)
    f1        = skmetrics.f1_score(ytest,ypreds)
    # ROC AUC ranks samples by score, so it needs the positive-class
    # probability; feeding hard labels collapses the curve to a single point.
    auc       = skmetrics.roc_auc_score(ytest,np.asarray(yprobs2d)[:, 1])

    print(skmetrics.classification_report(ytest,ypreds))
    print(skmetrics.confusion_matrix(ytest,ypreds))

    df_res = pd.DataFrame({'Accuracy':[acc],
                          'Precision': [precision],
                          'Recall': [recall],
                          'F1-score': [f1],
                          'AUC': [auc]},index=[model_name])

    display(df_res.style.format("{:.4f}"))

    # Write next to the notebook on Colab, otherwise into ../outputs. Joining
    # with os.path avoids the hidden '.model_<name>.csv' file that plain
    # concatenation with '.' produced, and only the directory actually used
    # is created.
    out_dir = '.' if ENV_COLAB else '../outputs'
    os.makedirs(out_dir, exist_ok=True)
    df_res.to_csv(os.path.join(out_dir, f'model_{model_name}.csv'),index=True)

    if show_plots:
        skpmetrics.plot_precision_recall(ytest,yprobs2d) # more focus on minority
        skpmetrics.plot_roc_curve(ytest,yprobs2d) # equal focus on both groups
        skpmetrics.plot_confusion_matrix(ytest,ypreds)


class FrequencyEncoder:
    """Replace categorical values with their occurrence counts.

    Counts are learned from the frame passed to ``fit``. When transforming, a
    value that was seen during fitting is replaced with its fitted count; a
    value that was not seen is replaced with its count in the frame being
    transformed.

    Parameters
    ----------
    cols : list
        Names of the columns to encode.
    """

    def __init__(self, cols):
        self.cols = cols
        # {column: {value: count}}; populated by fit()
        self.counts_dict = None

    @staticmethod
    def _count_values(column) -> dict:
        """Return a ``{value: count}`` mapping for one column."""
        values, counts = np.unique(column, return_counts=True)
        return dict(zip(values, counts))

    def fit(self, X: pd.DataFrame, y=None) -> "FrequencyEncoder":
        """Learn per-column value counts from ``X`` and return ``self``.

        ``y`` is accepted for API compatibility and ignored.
        """
        self.counts_dict = {col: self._count_values(X[col]) for col in self.cols}
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with ``cols`` replaced by counts.

        The input frame is left untouched.

        Raises
        ------
        ValueError
            If called before ``fit``.
        """
        if self.counts_dict is None:
            raise ValueError("FrequencyEncoder must be fitted before calling transform().")

        encoded = X.copy()
        for col in self.cols:
            fitted_counts = self.counts_dict[col]
            local_counts = self._count_values(X[col])
            # Prefer the fitted count; fall back to the count in this frame
            # for values that were not present during fitting.
            mapping = {value: fitted_counts.get(value, count)
                       for value, count in local_counts.items()}
            encoded[col] = X[col].map(mapping)
        return encoded

    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Fit on ``X`` and return its encoded copy."""
        return self.fit(X, y).transform(X)


def get_profit(y_true, y_pred, *, gain_tp=400, cost_fn=200, cost_fp=100):
    """Return the profit implied by a set of binary predictions.

    Profit is ``gain_tp * TP - cost_fn * FN - cost_fp * FP``; true negatives
    contribute nothing.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted binary labels.
    gain_tp, cost_fn, cost_fp : number, keyword-only
        Payoff per true positive and cost per false negative / false
        positive. The defaults reproduce the original fixed payoffs.
    """
    cm = skmetrics.confusion_matrix(y_true,y_pred)
    if cm.shape == (1, 1):
        # Only one class occurs in both inputs, so the matrix collapses and
        # cannot be unpacked into four cells; rebuild it with the 0/1 labels
        # used in this notebook.
        cm = skmetrics.confusion_matrix(y_true,y_pred,labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    profit = gain_tp*tp - cost_fn*fn - cost_fp*fp
    return profit


# Local copies of the raw data; replaced by the hosted copies on Colab.
path_data_train = '../data/raw/train.csv'
path_data_test = '../data/raw/test.csv'

if ENV_COLAB:
    path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
    path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv'


df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)

print(df_train.shape)
print(df_test.shape)
# Preview the first and last two rows. DataFrame.append was removed in
# pandas 2.0, so the two slices are stacked with pd.concat instead.
pd.concat([df_train.head(2), df_train.tail(2)])

(5634, 21)
(1409, 21)


target_name = 'Churn'


import plotly_express as px


px.histogram(df_train, x=target_name,height=300,width=300)


px.histogram(df_train, x='gender', color=target_name,height=300,width=300)


# TotalCharges: convert to numeric; entries that cannot be parsed become NaN
# (errors='coerce') and are then replaced by 0.
df_train['TotalCharges'] = pd.to_numeric(df_train['TotalCharges'],errors='coerce').fillna(0)
df_test['TotalCharges'] = pd.to_numeric(df_test['TotalCharges'],errors='coerce').fillna(0)


# SeniorCitizen: recode 0/1 as 'No'/'Yes' strings. Any value other than
# 0 or 1 would map to NaN.
df_train['SeniorCitizen'] = df_train['SeniorCitizen'].map({0:'No',1:'Yes'})
df_test['SeniorCitizen'] = df_test['SeniorCitizen'].map({0:'No',1:'Yes'})


# Feature frames: everything except the target column.
df_Xtrain = df_train.drop(target_name,axis=1)
df_Xtest = df_test.drop(target_name,axis=1)

# Target: encode 'No'/'Yes' as 0/1 (other values would map to NaN).
ser_ytrain = df_train[target_name].map({'No':0,'Yes':1})
ser_ytest = df_test[target_name].map({'No':0,'Yes':1})

# 1-D numpy copies of the targets used by the models and metrics below.
ytrain = np.array(ser_ytrain).flatten()
ytest = np.array(ser_ytest).flatten()


df_Xtrain.head(2)


cols_num = list(df_train.select_dtypes('number').columns)
cols_num

['tenure', 'MonthlyCharges', 'TotalCharges']


# Candidate categorical features: every object-typed column of the training
# frame, minus identifiers, the target and features rejected during EDA.
cols_cat = list(df_train.select_dtypes('object').columns)

# gender is no good predictor as seen in EDA
cols_exclude = ['customerID','gender','TotalCharges'] + [target_name]
cols_cat = [ i for i in cols_cat if i not in cols_exclude ]

# SeniorCitizen has been recoded to 'No'/'Yes' strings, so it is normally
# selected above already; add it only when missing to avoid a duplicate entry.
if 'SeniorCitizen' not in cols_cat:
    cols_cat.append('SeniorCitizen')

print(cols_cat)

['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'SeniorCitizen']


cols_num = ['TotalCharges','tenure', 'MonthlyCharges']


cols_num_old = cols_num
cols_cat_old = cols_cat


def combine_two_features(dfx,A,B):
    """Add pairwise concatenated columns to a copy of ``dfx``.

    For every pair ``(a, b)`` taken position-wise from ``A`` and ``B`` a new
    column named ``"<a>_<b>"`` is created holding the two source columns
    joined with an underscore.

    Parameters
    ----------
    dfx : pandas.DataFrame
        Source frame; it is not modified.
    A, B : sequence of str
        Column names of equal length.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dfx`` with the combined columns appended.
    """
    combined = dfx.copy()
    assert len(A) == len(B)

    for left, right in zip(A, B):
        new_col = left + '_' + right
        combined[new_col] = combined[left] + '_' + combined[right]

    return combined

# Column pairs to concatenate: Partner x Dependents, plus SeniorCitizen
# crossed with five other categorical columns.
combineA = ['Partner']
combineB = ['Dependents']
combineA = combineA + ['SeniorCitizen']*5
combineB = combineB + ['Dependents','Partner','Contract',
                       'TechSupport','PaymentMethod']

cols_cat_new = [f'{a}_{b}' for a,b in zip(combineA,combineB)]


# De-duplicate while keeping first-seen order. A plain set() would make the
# column order depend on string hashing and therefore vary between sessions,
# which changes the layout of the encoded feature matrix.
cols_cat = list(dict.fromkeys(cols_cat + cols_cat_new))
print(cols_cat_new)
# print(cols_cat)

df_Xtrain = combine_two_features(df_Xtrain,combineA,combineB)
df_Xtest = combine_two_features(df_Xtest,combineA,combineB)

['Partner_Dependents', 'SeniorCitizen_Dependents', 'SeniorCitizen_Partner', 'SeniorCitizen_Contract', 'SeniorCitizen_TechSupport', 'SeniorCitizen_PaymentMethod']


def create_groupby_features(dfx,cat,num,agg,df_ref=None):
    """Add group-level aggregate features to a copy of ``dfx``.

    For every combination of a categorical column ``c``, a numeric column
    ``n`` and an aggregation ``a``, the aggregate of ``n`` per category of
    ``c`` is computed on the reference frame and attached to each row of
    ``dfx`` according to that row's own category, under the column name
    ``"<c>_<n>_<a>"``.

    The aggregates are looked up by category value rather than by row index,
    so frames other than the reference frame (e.g. the test set) receive the
    statistic of their own category instead of the value of an unrelated
    training row that happens to share the index.

    Parameters
    ----------
    dfx : pandas.DataFrame
        Frame to extend; it is not modified.
    cat : iterable of str
        Categorical grouping columns (present in both frames).
    num : iterable of str
        Numeric columns of the reference frame to aggregate.
    agg : iterable
        Aggregations accepted by ``GroupBy.agg`` (e.g. ``'mean'``).
    df_ref : pandas.DataFrame, optional
        Frame the aggregates are computed from. Defaults to the notebook
        global ``df_train`` so that statistics always come from training data.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dfx`` with the new columns; categories absent from the
        reference frame yield NaN.
    """
    ref = df_train if df_ref is None else df_ref
    dfx = dfx.copy()
    for c in cat:
        grouped = ref.groupby(c)
        for n in num:
            for a in agg:
                name = f"{c}_{n}_{a}"
                group_stats = grouped[n].agg(a)
                dfx[name] = dfx[c].map(group_stats)
    return dfx


# Using more features gave me worse AUC.
# cols_grpcat = ['Contract','PaymentMethod']
# cols_grpnum = ['TotalCharges','MonthlyCharges']
# cols_grpagg = ['mean', 'max', 'min']

# Grouping column(s), numeric column(s) and aggregation(s) for which one
# aggregate feature per combination is created.
cols_grpcat = ['Contract']
cols_grpnum = ['TotalCharges']
cols_grpagg = ['mean']

cols_num_new = [f'{c}_{n}_{a}' 
                for c in cols_grpcat
                for n in cols_grpnum
                for a in cols_grpagg]

# De-duplicate while keeping first-seen order; a plain set() would make the
# numeric column order vary between sessions.
cols_num = list(dict.fromkeys(cols_num + cols_num_new))
print(cols_num_new)
# print(cols_num)

df_Xtrain = create_groupby_features(df_Xtrain,cols_grpcat, cols_grpnum, cols_grpagg)
df_Xtest = create_groupby_features(df_Xtest,cols_grpcat, cols_grpnum, cols_grpagg)

['Contract_TotalCharges_mean']


from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PowerTransformer


# Numeric branch: a single power-transform step (PowerTransformer defaults).
pipe_num = Pipeline([
    
    # standard scaling
#     ('scaler', StandardScaler())
    
    # we can use yeo-johnson scaling for not-normal data instead of standard
    ('yeo_johnson', PowerTransformer())
])



# Categorical branch: one-hot encoding; categories unseen during fit are
# ignored at transform time instead of raising.
pipe_cat = Pipeline([
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

# Alternative categorical branch based on frequency encoding. It is defined
# here but not wired into the ColumnTransformer below.
pipe_cat_freq = Pipeline([
    ('freq_enc', FrequencyEncoder(cols=cols_cat)),
])


# Full preprocessor: numeric and categorical branches applied to their
# column lists; every column not listed is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', pipe_num, cols_num),
        ('cat',  pipe_cat, cols_cat)
            ],
    remainder='drop'
)


from sklearn import set_config
set_config(display='diagram')

preprocessor

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('yeo_johnson',
                                                  PowerTransformer())]),
                                 ['Contract_TotalCharges_mean', 'tenure',
                                  'MonthlyCharges', 'TotalCharges']),
                                ('cat',
                                 Pipeline(steps=[('ohe',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['PaperlessBilling', 'SeniorCitizen_Contract',
                                  'MultipleLines', 'SeniorCitizen',
                                  'PhoneService', 'DeviceProtection',
                                  'OnlineSecurity', 'StreamingMovies',
                                  'SeniorCitizen_PaymentMethod', 'OnlineBackup',
                                  'SeniorCitizen_Partner', 'Dependents',
                                  'Partner_Dependents', 'Partner',
                                  'StreamingTV', 'SeniorCitizen_Dependents',
                                  'Contract', 'SeniorCitizen_TechSupport',
                                  'TechSupport', 'InternetService',
                                  'PaymentMethod'])])

['Contract_TotalCharges_mean', 'tenure', 'MonthlyCharges', 'TotalCharges']

PowerTransformer()

['PaperlessBilling', 'SeniorCitizen_Contract', 'MultipleLines', 'SeniorCitizen', 'PhoneService', 'DeviceProtection', 'OnlineSecurity', 'StreamingMovies', 'SeniorCitizen_PaymentMethod', 'OnlineBackup', 'SeniorCitizen_Partner', 'Dependents', 'Partner_Dependents', 'Partner', 'StreamingTV', 'SeniorCitizen_Dependents', 'Contract', 'SeniorCitizen_TechSupport', 'TechSupport', 'InternetService', 'PaymentMethod']

OneHotEncoder(handle_unknown='ignore')


from sklearn import set_config
set_config(display='text')


print(df_Xtrain.shape, df_Xtest.shape)
df_Xtrain.head(2)

(5634, 27) (1409, 27)


preprocessor.fit(df_Xtrain)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('yeo_johnson',
                                                  PowerTransformer())]),
                                 ['Contract_TotalCharges_mean', 'tenure',
                                  'MonthlyCharges', 'TotalCharges']),
                                ('cat',
                                 Pipeline(steps=[('ohe',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['PaperlessBilling', 'SeniorCitizen_Contract',
                                  'MultipleLines', 'SeniorCitizen',
                                  'PhoneService', 'DeviceProtection',
                                  'OnlineSecurity', 'StreamingMovies',
                                  'SeniorCitizen_PaymentMethod', 'OnlineBackup',
                                  'SeniorCitizen_Partner', 'Dependents',
                                  'Partner_Dependents', 'Partner',
                                  'StreamingTV', 'SeniorCitizen_Dependents',
                                  'Contract', 'SeniorCitizen_TechSupport',
                                  'TechSupport', 'InternetService',
                                  'PaymentMethod'])])


Xtrain = preprocessor.transform(df_Xtrain)
Xtest  = preprocessor.transform(df_Xtest)
Xtrain.shape, Xtest.shape

((5634, 77), (1409, 77))


model_name = 'lightgbm'
hpo_name = 'hyperband'


from lightgbm import LGBMClassifier


# Baseline: LightGBM with default hyperparameters (only the seed is fixed),
# fitted on the preprocessed training matrix and scored on the test matrix.
model = LGBMClassifier(random_state=SEED)
model.fit(Xtrain,ytrain)
ypreds = model.predict(Xtest)
yprobs2d = model.predict_proba(Xtest)

# Print/save the metric table without drawing the diagnostic plots.
model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=False)

# Business metric computed from the confusion matrix (see get_profit).
profit = get_profit(ytest,ypreds)
print(f"test profit       = ${profit:,d}")

              precision    recall  f1-score   support

           0       0.78      0.92      0.85      1035
           1       0.57      0.28      0.37       374

    accuracy                           0.75      1409
   macro avg       0.67      0.60      0.61      1409
weighted avg       0.72      0.75      0.72      1409

[[956  79]
 [271 103]]

test profit       = $-20,900


%%time

# Sets up (but does not run) a Hyperband hyperparameter search for LightGBM
# that maximises the profit metric; the fit call itself is in the
# commented-out lines after this block.
import scipy.stats as stats
from sklearn import metrics
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
import warnings
from sklearn.exceptions import ConvergenceWarning
# NOTE(review): `scipy.optimize.linesearch` may not be importable on newer
# SciPy releases -- confirm against the installed version.
from scipy.optimize.linesearch import LineSearchWarning

# Silence noisy warning categories during the search.
warnings.simplefilter('ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)
warnings.simplefilter('ignore', category=LineSearchWarning)

# Define our model
# Parameters held constant for every candidate.
params_fixed = dict(boosting_type = 'gbdt',random_state= SEED,n_jobs=1)


# Search space: scipy distributions are sampled, lists are chosen from.
# NOTE(review): 'n_estimators' is listed here and is also the Hyperband
# resource parameter below -- presumably the search overrides the sampled
# value; verify against the HyperbandSearchCV implementation.
# NOTE(review): LightGBM may ignore 'subsample' unless 'subsample_freq' is
# set to a positive value -- confirm.
params_hyp = {
    'max_depth'        : stats.randint(3,12),
    'learning_rate'    : stats.loguniform(0.01, 1.0),
    'n_estimators'     : stats.randint(100, 1000),
    'subsample'        : [0.6, 0.7, 0.8, 0.9, 1.0],
    'reg_alpha'        : stats.loguniform(0.01, 1.0),
    'reg_lambda'       : stats.loguniform(0.01, 1.0),
    'scale_pos_weight' : [1,2,3,4,5]
    }


model = LGBMClassifier(**params_fixed)

# scoring ='roc_auc'
# Optimise the business profit directly (higher is better).
scoring = metrics.make_scorer(get_profit,greater_is_better=True)

# Perform Hyperparameter Tuning
# 5-fold stratified CV repeated 5 times, i.e. 25 fits per evaluated candidate.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=SEED)
grid = HyperbandSearchCV(model,params_hyp, 
                              resource_param = 'n_estimators',
                              min_iter       = 100,
                              max_iter       = 5000, # use 1k or 2k
                              cv             = cv, 
                              scoring        = scoring, 
                              refit          = True,
                              verbose        = 0,
                              random_state   = SEED
                          )


# grid.fit(Xtrain, ytrain)
# print('Best parameters:  ', grid.best_params_)

# params_best = grid.best_params_
# params = params_fixed
# params.update(params_best)
# print(params)


out = """
this gives profit = $51,300
params_best = {'boosting_type': 'gbdt', 'random_state': 100,
               'n_jobs': 1, 'learning_rate': 0.026206211651810026,
               'max_depth': 3, 'n_estimators': 333,
               'reg_alpha': 0.016472282465672092,
               'reg_lambda': 0.027503944643483897,
               'scale_pos_weight': 4, 'subsample': 1.0}


--------------------------------
Best parameters:
Wall time: 29min 45s
params_best = {'boosting_type': 'gbdt', 'random_state': 100,
                'n_jobs': 1,'learning_rate': 0.026206211651810026,
                'max_depth': 3,'n_estimators': 185,
                'reg_alpha': 0.016472282465672092,
                'reg_lambda': 0.027503944643483897,
                'scale_pos_weight': 4,
                'subsample': 1.0}

this gives $48k
"""

Best parameters:   {'learning_rate': 0.026206211651810026, 'max_depth': 3, 'n_estimators': 185, 'reg_alpha': 0.016472282465672092, 'reg_lambda': 0.027503944643483897, 'scale_pos_weight': 4, 'subsample': 1.0}
{'boosting_type': 'gbdt', 'random_state': 100, 'n_jobs': 1, 'learning_rate': 0.026206211651810026, 'max_depth': 3, 'n_estimators': 185, 'reg_alpha': 0.016472282465672092, 'reg_lambda': 0.027503944643483897, 'scale_pos_weight': 4, 'subsample': 1.0}
CPU times: user 26min 55s, sys: 6.89 s, total: 27min 2s
Wall time: 29min 45s


# Hyperparameters hard-coded from an earlier search run so the notebook does
# not need to repeat the search. n_estimators is 333 here, whereas the search
# output recorded in this notebook reports 185; the notes string above
# attributes a profit of $51,300 to 333 and about $48k to 185.
params_best = {'boosting_type': 'gbdt', 'random_state': 100,
               'n_jobs': 1, 'learning_rate': 0.026206211651810026,
               'max_depth': 3, 'n_estimators': 333,
               'reg_alpha': 0.016472282465672092,
               'reg_lambda': 0.027503944643483897,
               'scale_pos_weight': 4, 'subsample': 1.0}


# Refit on the full training matrix with the chosen parameters.
model = LGBMClassifier(**params_best)

model.fit(Xtrain,ytrain)
ypreds = model.predict(Xtest)

yprobs2d = model.predict_proba(Xtest)
# Metric table is saved under the combined "<model>+<hpo>" name; no plots.
model_eval_bin(f'{model_name}+{hpo_name}',ytest,ypreds,yprobs2d,show_plots=False)


profit = get_profit(ytest,ypreds)
print(f"test profit       = ${profit:,d}")

              precision    recall  f1-score   support

           0       0.87      0.71      0.78      1035
           1       0.47      0.70      0.56       374

    accuracy                           0.71      1409
   macro avg       0.67      0.70      0.67      1409
weighted avg       0.76      0.71      0.72      1409

[[736 299]
 [114 260]]

test profit       = $51,300


# Same evaluation of the tuned model as before, this time with the
# diagnostic plots enabled; the metrics CSV of the same name is rewritten.
model_eval_bin(f"{model_name}+{hpo_name}",ytest,ypreds,yprobs2d,show_plots=True)

profit = get_profit(ytest,ypreds)
print(f"test profit       = ${profit:,d}")

              precision    recall  f1-score   support

           0       0.87      0.71      0.78      1035
           1       0.47      0.70      0.56       374

    accuracy                           0.71      1409
   macro avg       0.67      0.70      0.67      1409
weighted avg       0.76      0.71      0.72      1409

[[736 299]
 [114 260]]

test profit       = $51,300


# Report the wall-clock time elapsed since the first cell of the notebook.
time_taken = time.time() - time_start_notebook
# h: whole hours; m: the seconds left over, split into min/secs when printing.
h, m = divmod(time_taken, 60 * 60)
print(
    'Time taken to run whole notebook: {:.0f} hr {:.0f} min {:.0f} secs'
    .format(h, *divmod(m, 60))
)

Time taken to run whole notebook: 0 hr 0 min 5 secs

Modelling Customer Churn using LightGBM

Load the libraries

Colab

Useful Scripts

Load the Data

Data Processing

Data Processing

Data Types

Train and Test Data

Numerical and Categorical Features

Custom Features

Data Processing Pipeline

Modelling

Hyperband SearchCV

Model Evaluation

Time Taken

	customerID	gender	Partner	Dependents	tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	OnlineBackup	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Churn
0	1621-YNCJH	Female	Yes	No	36	Yes	Yes	Fiber optic	Yes	Yes	Yes	Yes	No	Yes	Two year	Yes	Credit card (automatic)	106.05	3834.4	No
1	7143-BQIBA	Male	No	No	10	Yes	No	DSL	Yes	No	No	Yes	Yes	No	Month-to-month	No	Bank transfer (automatic)	62.25	612.95	No
5632	0862-PRCBS	Female	Yes	Yes	68	Yes	Yes	Fiber optic	No	Yes	No	Yes	Yes	Yes	Two year	Yes	Credit card (automatic)	103.75	7039.45	No
5633	4656-CAURT	Male	No	No	69	Yes	Yes	No	No internet service	No internet service	No internet service	No internet service	No internet service	No internet service	Two year	No	Bank transfer (automatic)	23.95	1713.1	No