import time

# Wall-clock timestamp taken at the top of the notebook; the last cell
# subtracts it from time.time() to report the total runtime.
time_start_notebook = time.time()


%%capture
# (%%capture above is an IPython cell magic: it swallows this cell's output,
#  e.g. the pip install logs.)
import sys
# True when the google.colab module is already loaded in this interpreter,
# which is used throughout the notebook as the "running on Colab" switch.
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    # packages used by later cells (watermark magic, scikit-plot metrics plots)
    !pip install watermark
    !pip install scikit-plot

    # HPO for pycaret
    !pip install tune-sklearn
    !pip install optuna # hyperopt is already in colab

    # gpu version of lightgbm for pycaret: remove the stock build, then
    # reinstall with the --gpu build option and the CUDA OpenCL paths.
    # NOTE(review): pip's --install-option flag has been removed in newer pip
    # releases -- confirm this command still works on the current Colab image.
    !pip uninstall lightgbm -y
    !pip install lightgbm --install-option=--gpu --install-option="--opencl-include-dir=/usr/local/cuda/include/" --install-option="--opencl-library=/usr/local/cuda/lib64/libOpenCL.so"

    # regular pycaret without gpu
    !pip install pycaret-nightly[full]

    # ipywidget (needed for the interactive widgets in the notebook)
    !pip install ipywidgets
    !jupyter nbextension enable --py widgetsnbextension

    # pycaret helper that must be called when running inside Colab
    from pycaret.utils import enable_colab
    enable_colab()

    print('Environment: Google Colab')


import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
sns.set()
import joblib

from tqdm import tqdm_notebook as tqdm

# Random seed shared by the whole notebook (passed to pycaret as session_id).
SEED = 100

# Use fully qualified option names. Short names such as 'max_columns' are
# pattern-matched by pandas and raise OptionError as soon as more than one
# option matches (newer pandas also defines 'styler.render.max_columns').
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 200)
pd.set_option('plotting.backend', 'matplotlib')  # matplotlib, bokeh, altair, plotly

# special
import pycaret

%load_ext watermark
%watermark -iv

joblib    : 1.0.0
pandas    : 1.1.5
numpy     : 1.19.4
seaborn   : 0.11.0
pycaret   : 2.2.2
autopep8  : 1.5.4
sys       : 3.8.5 (default, Sep  4 2020, 02:22:02) 
[Clang 10.0.0 ]
json      : 2.0.9
matplotlib: 3.2.2


def show_methods(obj, ncols=4,contains=None):
    """Tabulate the public attribute names of ``obj``.

    Names from ``dir(obj)`` that start with an underscore are dropped; when
    ``contains`` is given, only names containing that substring are kept.
    The remaining names are split into ``ncols`` chunks with
    ``np.array_split`` and each chunk becomes one column of the returned
    DataFrame. Cells left empty by uneven chunks are filled with ''.
    """
    public_names = [
        attr for attr in dir(obj)
        if attr[0] != '_' and (contains is None or contains in attr)
    ]
    chunks = np.array_split(public_names, ncols)
    # One chunk per row, then transpose so that each chunk reads top-down.
    table = pd.DataFrame(chunks).transpose()
    return table.fillna('')


def compare_new_models(name,desc,mean_row,ofile,
                       df_eval=None,sort='Recall',show=True):
    """Append one model's mean scores to an evaluation table and save it.

    Parameters
    -----------
    name: str
        Name of the model. e.g. xgboost
    desc: str
        Description of the model. e.g. tuned,calibrated
    mean_row: sequence of 8 values
        The mean row, in the order
        Accuracy, AUC, Recall, Precision, F1, Kappa, MCC, LogLoss.
        e.g.
        df_res = pyc.pull()
        mean_row = df_res.loc['Mean']
    ofile: str
        Output file name. e.g. 'pycaret_df_eval_lr.csv'
    df_eval: Pandas Dataframe, optional
        Evaluation table to extend (typically the return value of a previous
        call). It is NOT modified; a new frame is returned. When it is not a
        DataFrame (e.g. None) a fresh table is started.
    sort: str
        Column used for descending sort and for highlighting, one of:
        Accuracy, AUC, Recall, Precision, F1, Kappa, MCC, LogLoss
    show: bool
        If True, display the styled table (needs a notebook environment).

    Returns
    -------
    Pandas Dataframe
        De-duplicated table sorted by ``sort`` with a fresh 0..n-1 index.
        The same table is written to ``ofile`` as CSV.

    Raises
    ------
    ValueError
        If ``mean_row`` does not hold exactly 8 values.
    """
    columns = ['Model', 'Description', 'Accuracy', 'AUC', 'Recall',
               'Precision', 'F1', 'Kappa', 'MCC', 'LogLoss']

    # Unpacking doubles as a length check on mean_row.
    acc,auc,rec,pre,f1,kap,mcc,logloss = mean_row
    row = [name,desc,acc,auc,rec,pre,f1,kap,mcc,logloss]

    if isinstance(df_eval, pd.DataFrame):
        # reset_index returns a new frame, so the caller's table is left
        # untouched and label len(df_eval) is guaranteed to be a new row.
        df_eval = df_eval.reset_index(drop=True)
        df_eval.loc[len(df_eval)] = row
    else:
        df_eval = pd.DataFrame([row], columns=columns)

    df_eval = (df_eval.drop_duplicates()
                      .sort_values(sort,ascending=False)
                      .reset_index(drop=True))

    if show:
        # Build the Styler only when it is displayed: highlight the whole
        # column that the table is sorted by.
        df_style = (df_eval.style.apply(lambda ser:
                    ['background: tomato'
                     if ser.name == sort else ''
                     for _ in ser]))
        # `display` is not imported in this file; presumably the builtin
        # injected by IPython/Jupyter.
        display(df_style)

    # save the data
    df_eval.to_csv(ofile,index=False)

    return df_eval


# Repo-relative locations of the cleaned train/test splits.
path_data_train = '../data/processed/train_cleaned.csv'
path_data_test = '../data/processed/test_cleaned.csv'

if ENV_COLAB:
    # On Colab the same files are read straight from GitHub instead.
    path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/processed/train_cleaned.csv'
    path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/processed/test_cleaned.csv'


df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)

print(df_train.shape)
print(df_test.shape)
# First two and last two rows. DataFrame.append was deprecated in pandas 1.4
# and removed in 2.0; pd.concat gives the same frame on every version.
pd.concat([df_train.head(2), df_train.tail(2)])

(5634, 39)
(1409, 39)


# The customer IDs are read from the raw test file; only that one column
# is loaded.
path_data_test_raw = ''.join([
    'https://raw.githubusercontent.com/',
    'bhishanpdl/Datasets/master/Projects/',
    'Telco_Customer_Churn/raw/test.csv',
])

df_test_raw1 = pd.read_csv(path_data_test_raw,
                           usecols=['customerID'])
df_test_raw1.head(2)


# IDs written next to the predictions later on, and the name of the target.
target_name = 'Churn'
ser_test_ids = df_test_raw1['customerID']


# Check for NaNs: count missing values per column, then add the per-column
# counts together to get one grand total for the training frame.
df_train.isna().sum().sum()

0


import pycaret
import pycaret.classification as pyc


# GPU training is requested only when the notebook runs on Google Colab.
USE_GPU = bool(ENV_COLAB)


# %%capture
# Initialise the pycaret classification experiment on the training frame.
# All feature-type arguments are passed as None (nothing is forced by hand).
exp = pyc.setup(
    df_train,
    target_name,
    # reproducibility and the internal train/hold-out split
    session_id=SEED,
    train_size=0.8,
    data_split_stratify=True,
    # hardware
    use_gpu=USE_GPU,
    # preprocessing switches
    preprocess=True,
    normalize=False,
    # feature typing
    numeric_features=None,
    categorical_features=None,
    ordinal_features=None,
    high_cardinality_features=None,
    date_features=None,
    ignore_features=None,
    # notebook behaviour
    silent=True,
    profile=False,
)

# Run once with silent=False to review the datatypes pycaret inferred
# (silent=True, as used above, skips that confirmation prompt),
# then assign numeric and categorical features yourself if needed.
#
# If sampling=False, 100% of the data is used and it's too slow.
# If sampling=True, we need to enter the fraction (e.g. 0.3) ourselves.

"""
Here, we have data < 25k rows, so I have chosen not to use sampling.

""";


# Show the 'Name' and 'GPU Enabled' columns of pycaret's model table.
pyc.models(internal=True)[['Name', 'GPU Enabled']]

# Google Colab does not support cuML, and therefore not the GPU versions of
# the sklearn models either (presumably why every row shows False).
# We need to run BlazingSQL notebooks (not Colab) to use cuML models.


# pyc.compare_models?


# IDs (index of the metrics table) of the metrics currently registered.
pyc.get_metrics().index

Index(['acc', 'auc', 'recall', 'precision', 'f1', 'kappa', 'mcc'], dtype='object', name='ID')


# Add the Log Loss metric in pycaret.
# Log loss is only meaningful on predicted probabilities. Without an explicit
# target, add_metric registers the metric with Target = 'pred' (hard labels),
# so request the probabilities instead.
from sklearn.metrics import log_loss
pyc.add_metric('logloss', 'LogLoss', log_loss,
               target='pred_proba', greater_is_better=False)

Name                                                        LogLoss
Display Name                                                LogLoss
Score Function                <function log_loss at 0x7ffce25d8b80>
Scorer               make_scorer(log_loss, greater_is_better=False)
Target                                                         pred
Args                                                             {}
Greater is Better                                             False
Multiclass                                                     True
Custom                                                         True
Name: logloss, dtype: object


# Baseline: xgboost created with pycaret's default settings.
model_name = 'xgboost'
path_df_eval = 'pycaret_df_eval_xgb.csv'

model = pyc.create_model(model_name, verbose=False)

# The 'Mean' row of the table returned by pyc.pull() starts the evaluation
# table (df_eval=None), which is also written to path_df_eval.
mean_row = pyc.pull().loc['Mean'].values
df_eval = compare_new_models(
    name=model_name,
    desc='default',
    mean_row=mean_row,
    ofile=path_df_eval,
    df_eval=None,
    sort='Recall',
)


# %%capture
# model_tuned1 = pyc.tune_model(model,n_iter=100,search_library='tune-sklearn',
#                               fold=5,optimize='Recall',verbose=False)
# mean_row = pyc.pull().loc['Mean']

# desc = 'tuned,tune-sklearn,n_iter=100'
# df_eval = compare_new_models(model_name,desc,mean_row,path_df_eval,
#                              sort='Recall',df_eval=df_eval)


# %%capture
# model_tuned2 = pyc.tune_model(model,n_iter=500,search_library='optuna',
#                               fold=5,optimize='Recall',verbose=False)
# mean_row = pyc.pull().loc['Mean']

# desc = 'tuned,optuna,n_iter=500'
# df_eval = compare_new_models(model_name,desc,mean_row,path_df_eval,
#                              sort='Recall',df_eval=df_eval)


# This is BAD: optimizing for Recall gave me Recall = 1, a degenerate fit.
#              Recall = 1 means everything is classified as churn
#              (the positive class), not as not-churn.
#              USE AUC instead of Recall


# %%capture
# model_tuned_AUC = pyc.tune_model(model,n_iter=500,search_library='optuna',
#                               fold=5,optimize='AUC',verbose=False)
# mean_row = pyc.pull().loc['Mean']

# desc = 'tuned,optuna,n_iter=500,optimize=AUC'
# df_eval = compare_new_models(model_name,desc,mean_row,path_df_eval,sort='Recall',df_eval=df_eval)

# THIS GAVE ME WORSE RESULT THAN F1


# %%capture

# # This takes 1 hr on local computer, comment after HPO
# model_tuned_F1 = pyc.tune_model(model,n_iter=500,search_library='optuna',
#                                fold=5,optimize='F1',verbose=False)
# mean_row = pyc.pull().loc['Mean']

# desc = 'tuned,optuna,n_iter=500,optimize=F1'
# df_eval = compare_new_models(model_name,desc,mean_row,path_df_eval,
#                              sort='Recall',df_eval=df_eval)


# model_best = model_tuned_F1


# Directory holding the saved model: current directory on Colab,
# ../models for a local checkout.
odir = '.' if ENV_COLAB else '../models/'
# os.path.join supplies the separator that plain '+' dropped on Colab
# ('.' + 'pycaret_...' produced the hidden file '.pycaret_...').
path_model_best = os.path.join(odir, f'pycaret_model_best_{model_name}.joblib')

# joblib.dump(model_best, path_model_best) # comment this


# NOTE: joblib.load unpickles the file -- only load model files you trust.
model_best = joblib.load(path_model_best)
model_best

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5404689403971603, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.016761554573600634, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=62, n_jobs=-1, num_parallel_tree=1,
              objective='binary:logistic', random_state=100,
              reg_alpha=6.0679925689388595e-09,
              reg_lambda=0.0012602601741511542,
              scale_pos_weight=2.0040690074466885,
              subsample=0.48109449185878156, tree_method='auto',
              use_label_encoder=True, validate_parameters=1, verbosity=0)


# model calibration is useless, we can do model evaluation.


# pyc.plot_model gives one plot per call; use
# pyc.evaluate_model to get all the plots.

# evaluate model (click on the buttons of the widget to switch plots)
pyc.evaluate_model(model_best)


# interpret_model: SHAP (called with its default plot type)
pyc.interpret_model(model_best)


# interpret model : Correlation
pyc.interpret_model(model_best,plot='correlation')


# interpret model : Reason
# observation=12 selects a single row to explain
# (presumably by position in pycaret's hold-out set -- TODO confirm)
pyc.interpret_model(model_best,plot='reason',observation=12)


# Finalize the tuned model (per the section heading: fit on the whole train
# data) and print the resulting estimator.
model_final = pyc.finalize_model(model_best)
print(model_final)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5404689403971603, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.016761554573600634, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=62, n_jobs=-1, num_parallel_tree=1,
              objective='binary:logistic', random_state=100,
              reg_alpha=6.0679925689388595e-09,
              reg_lambda=0.0012602601741511542,
              scale_pos_weight=2.0040690074466885,
              subsample=0.48109449185878156, tree_method='auto',
              use_label_encoder=True, validate_parameters=1, verbosity=0)


# Peek at the last rows/columns of the test frame before prediction.
df_test.iloc[-5:,-5:]


# predict_model returns the input frame with 'Label' and 'Score' appended.
df_preds = pyc.predict_model(model_final,df_test)
df_preds.iloc[-5:,-5:]


ytest = df_preds[target_name].to_numpy().ravel()
ypreds = df_preds['Label'].to_numpy().ravel()

# 'Score' is the probability of the *predicted* label, not of class 1: rows
# labelled 0 also carry scores above 0.5. Convert it to P(class 1) so the
# probability columns are consistent for every row.
score_of_pred = df_preds['Score'].to_numpy().ravel()
yprobs = np.where(ypreds.astype(int) == 1, score_of_pred, 1 - score_of_pred)

# Column 0 = P(class 0), column 1 = P(class 1).
yprobs2d = np.c_[1-yprobs,yprobs]


pred_name = 'pycaret_xgboost'
path_pred = f'../predictions/{pred_name}.csv'

# One row per test customer: ID, predicted label, probability of class 1.
# The IDs come from the raw test file, so this relies on the raw and the
# cleaned test files having their rows in the same order.
df_preds_out = pd.DataFrame({'customerID': ser_test_ids})
df_preds_out[f'ypreds_{pred_name}'] = ypreds
df_preds_out[f'yprobs_{pred_name}'] = yprobs2d[:,1]

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(path_pred), exist_ok=True)
df_preds_out.to_csv(path_pred,index=False)

df_preds_out.head()


def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
    """Evaluate binary-classification predictions, save and plot the scores.

    Parameters
    ----------
    model_name: str
        Label used as the row index of the result table and in the output
        file name ``model_{model_name}.csv``.
    ytest: array-like
        True labels.
    ypreds: array-like
        Predicted labels.
    yprobs2d: array-like with two columns
        Predicted probabilities; column 1 is used as the score of the
        positive class for the AUC.
    show_plots: bool
        If True, draw the precision-recall curve, ROC curve and confusion
        matrix with scikit-plot.

    Side effects
    ------------
    Prints the classification report and confusion matrix, displays the
    score table, and writes it to the current directory when ENV_COLAB is
    true, otherwise to ``../outputs`` (created if missing).
    """
    import sklearn.metrics as skmetrics
    import scikitplot.metrics as skpmetrics
    import os

    acc       = skmetrics.accuracy_score(ytest,ypreds)
    precision = skmetrics.precision_score(ytest,ypreds)
    recall    = skmetrics.recall_score(ytest,ypreds)
    f1        = skmetrics.f1_score(ytest,ypreds)
    # ROC AUC must be computed from scores, not from hard 0/1 predictions;
    # use the positive-class probability column.
    auc       = skmetrics.roc_auc_score(ytest,np.asarray(yprobs2d)[:,1])

    print(skmetrics.classification_report(ytest,ypreds))
    print(skmetrics.confusion_matrix(ytest,ypreds))

    df_res = pd.DataFrame({'Accuracy':[acc],
                          'Precision': [precision],
                          'Recall': [recall],
                          'F1-score': [f1],
                          'AUC': [auc]},index=[model_name])

    # `display` is not imported in this file; presumably the builtin injected
    # by IPython/Jupyter.
    display(df_res.style.format("{:.4f}"))

    # Create only the directory that is actually written to, and join the
    # path properly ('.' + 'model_...' used to give a hidden '.model_...').
    odir = '.' if ENV_COLAB else '../outputs'
    os.makedirs(odir, exist_ok=True)
    df_res.to_csv(os.path.join(odir, f'model_{model_name}.csv'),index=True)

    if show_plots:
        skpmetrics.plot_precision_recall(ytest,yprobs2d) # more focus on minority
        skpmetrics.plot_roc_curve(ytest,yprobs2d) # equal focus on both groups
        skpmetrics.plot_confusion_matrix(ytest,ypreds)

# Score the finalized model on the test set and draw the diagnostic plots.
model_eval_bin(
    model_name='pycaret_xgboost',
    ytest=ytest,
    ypreds=ypreds,
    yprobs2d=yprobs2d,
    show_plots=True,
)

              precision    recall  f1-score   support

           0       0.89      0.79      0.84      1035
           1       0.56      0.74      0.64       374

    accuracy                           0.78      1409
   macro avg       0.73      0.76      0.74      1409
weighted avg       0.80      0.78      0.78      1409

[[816 219]
 [ 97 277]]


# Total wall-clock runtime since the first cell.
time_taken = time.time() - time_start_notebook
# Round to whole seconds BEFORE splitting into hr/min/sec. Formatting each
# float part with {:.0f} could otherwise print "60 secs" (e.g. for 59.7 s).
h, rem = divmod(int(round(time_taken)), 60*60)
m, s = divmod(rem, 60)
print('Time taken to run whole notebook: {} hr '\
      '{} min {} secs'.format(h, m, s))

Time taken to run whole notebook: 0 hr 0 min 33 secs

	Gender	Partner	Dependents	Tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	OnlineBackup	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Contract_Month-to-month	NoSeniorCitizen_Contract_Month-to-month	InternetService_Fiber optic	StreamingTV_NoInternetService	No_OB_DP_TS	TotalServices	SenCit_Dependents	Partner_Dependents	SenCit_Partner	SenCit_Contract	SenCit_TechSupport	SenCit_PayMeth	Contract_mean_totCharges	Contract_totCharges_diff	PayMeth_mean_monthCharges	PayMeth_monthCharges_diff	Tenure_cat
0	0	1	0	36	1	2	1	2	2	2	2	0	2	2	1	1	106.05	3834.40	1	0	0	1	1	2	0	1	1	2	2	1	3683.643192	150.756808	66.703657	39.346343	3
1	1	0	0	10	1	0	0	2	0	0	2	2	0	0	0	1	62.25	612.95	0	1	1	1	1	1	0	0	0	0	2	1	1370.923131	-757.973131	66.703657	-4.453657	0
5632	0	1	1	68	1	2	1	0	2	0	2	2	2	2	1	1	103.75	7039.45	1	0	0	1	1	2	1	2	1	2	2	1	3683.643192	3355.806808	66.703657	37.046343	5
5633	1	0	0	69	1	2	2	1	1	1	1	1	1	2	0	1	23.95	1713.10	1	0	1	0	1	7	0	0	0	2	1	1	3683.643192	-1970.543192	66.703657	-42.753657	5

	Name	GPU Enabled
ID
lr	Logistic Regression	False
knn	K Neighbors Classifier	False
nb	Naive Bayes	False
dt	Decision Tree Classifier	False
svm	SVM - Linear Kernel	False
rbfsvm	SVM - Radial Kernel	False
gpc	Gaussian Process Classifier	False
mlp	MLP Classifier	False
ridge	Ridge Classifier	False
rf	Random Forest Classifier	False
qda	Quadratic Discriminant Analysis	False
ada	Ada Boost Classifier	False
gbc	Gradient Boosting Classifier	False
lda	Linear Discriminant Analysis	False
et	Extra Trees Classifier	False
xgboost	Extreme Gradient Boosting	False
lightgbm	Light Gradient Boosting Machine	False
catboost	CatBoost Classifier	False
Bagging	Bagging Classifier	False
Stacking	Stacking Classifier	False
Voting	Voting Classifier	False
CalibratedCV	Calibrated Classifier CV	False

	Contract_totCharges_diff	PayMeth_mean_monthCharges	PayMeth_monthCharges_diff	Tenure_cat
1404	-859.687095	63.914865	16.585135	0
1405	-1152.769579	63.914865	-19.764865	3
1406	-218.037095	63.914865	6.085135	1
1407	-77.487095	63.914865	30.735135	1
1408	-367.987095	67.437658	-2.787658	1

	PayMeth_monthCharges_diff	Tenure_cat	Label	Score
1404	16.585135	0	1	0.6279
1405	-19.764865	3	0	0.6988
1406	6.085135	1	1	0.6042
1407	30.735135	1	1	0.6533
1408	-2.787658	1	0	0.6158

	customerID	ypreds_pycaret_xgboost	yprobs_pycaret_xgboost
0	1794-HBQTJ	1	0.5433
1	0356-OBMAC	0	0.7239
2	4077-CROMM	1	0.5501
3	5442-PPTJY	0	0.8111
4	2333-KWEWW	0	0.8104

Modelling Customer Churn using pycaret

Load the libraries

Colab

Useful Scripts

Load the Data

Data Processing

Modelling Pycaret

Pycaret Setup

Comparing Models

Create Models

Hyperparameter Tuning

Save Model After HPO

Model Evaluation (Validation) : plot_model and evaluate_model

Model Interpretation of Tree Methods (Validation)

Finalize model (Fit whole train data)

Model Evaluation on Test Data

Time Taken

	Description	Value
0	session_id	100
1	Target	Churn
2	Target Type	Binary
3	Label Encoded	0: 0, 1: 1
4	Original Data	(5634, 39)
5	Missing Values	False
6	Numeric Features	6
7	Categorical Features	32
8	Ordinal Features	False
9	High Cardinality Features	False
10	High Cardinality Method	None
11	Transformed Train Set	(4507, 60)
12	Transformed Test Set	(1127, 60)
13	Shuffle Train-Test	True
14	Stratify Train-Test	True
15	Fold Generator	StratifiedKFold
16	Fold Number	10
17	CPU Jobs	-1
18	Use GPU	False
19	Log Experiment	False
20	Experiment Name	clf-default-name
21	USI	5b54
22	Imputation Type	simple
23	Iterative Imputation Iteration	None
24	Numeric Imputer	mean
25	Iterative Imputation Numeric Model	None
26	Categorical Imputer	constant
27	Iterative Imputation Categorical Model	None
28	Unknown Categoricals Handling	least_frequent
29	Normalize	False
30	Normalize Method	None
31	Transformation	False
32	Transformation Method	None
33	PCA	False
34	PCA Method	None
35	PCA Components	None
36	Ignore Low Variance	False
37	Combine Rare Levels	False
38	Rare Level Threshold	None
39	Numeric Binning	False
40	Remove Outliers	False
41	Outliers Threshold	None
42	Remove Multicollinearity	False
43	Multicollinearity Threshold	None
44	Clustering	False
45	Clustering Iteration	None
46	Polynomial Features	False
47	Polynomial Degree	None
48	Trignometry Features	False
49	Polynomial Threshold	None
50	Group Features	False
51	Feature Selection	False
52	Features Selection Threshold	None
53	Feature Interaction	False
54	Feature Ratio	False
55	Interaction Threshold	None
56	Fix Imbalance	False
57	Fix Imbalance Method	SMOTE