import time

time_start_notebook = time.time()


%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
time_colab_start = time.time()

if ENV_COLAB:
    # usual imports
    !pip install watermark
    !pip install scikit-plot

    # HPO for pycaret
    !pip install tune-sklearn ray[tune] # search_library = 'sklearn_tune' dont work even if we install this
    !pip install optuna # hyperopt is already in colab

    # gpu version of lightgbm for pycaret
    !pip uninstall lightgbm -y
    !pip install lightgbm --install-option=--gpu --install-option="--opencl-include-dir=/usr/local/cuda/include/" --install-option="--opencl-library=/usr/local/cuda/lib64/libOpenCL.so"

    # regular pycaret without gpu 
    !pip install pycaret-nightly[full]

    # ipywidget
    !pip install ipywidgets
    !jupyter nbextension enable --py widgetsnbextension

    from pycaret.utils import enable_colab
    enable_colab()

    print('Environment: Google Colab')


time_colab = time.time() - time_colab_start
h,m = divmod(time_colab,60*60)
print('Time taken by colab: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))

Time taken by colab: 0 hr 0 min 0 secs


import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
sns.set()
import joblib

from tqdm import tqdm_notebook as tqdm

# special
import pycaret

# settings
SEED = 100  # random seed, passed to pycaret as session_id

# Use fully qualified option names: short patterns such as 'max_columns' are
# regex-matched by pandas and raise OptionError as soon as more than one
# option matches (e.g. once 'styler.render.max_columns' exists).
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 200)
pd.set_option('plotting.backend', 'matplotlib')  # matplotlib, bokeh, altair, plotly

%matplotlib inline
%load_ext watermark
%watermark -iv

sys       : 3.8.5 (default, Sep  4 2020, 02:22:02) 
[Clang 10.0.0 ]
pandas    : 1.1.5
numpy     : 1.19.4
autopep8  : 1.5.4
seaborn   : 0.11.0
pycaret   : 2.2.2
matplotlib: 3.2.2
joblib    : 1.0.0
json      : 2.0.9


def show_methods(obj, ncols=4,contains=None):
    """Tabulate the public attribute names of an object.

    Parameters
    -----------
    obj: object
        Anything accepted by ``dir``.
    ncols: int
        Number of columns in the returned table.
    contains: str
        If given, keep only the names that contain this substring.

    Returns:
       Pandas Dataframe with the names laid out column by column;
       shorter columns are padded with empty strings.
    """
    names = []
    for attr in dir(obj):
        # skip private / dunder attributes
        if attr[0] == '_':
            continue
        if contains is not None and contains not in attr:
            continue
        names.append(attr)

    # each chunk becomes one column once the frame is transposed
    chunks = np.array_split(names, ncols)
    table = pd.DataFrame(chunks).T
    return table.fillna('')


def compare_new_models(name,desc,mean_row,ofile,
                       df_eval=None,sort='Recall',show=True):
    """Append one model's mean CV metrics to an evaluation table and save it.

    Parameters
    -----------
    name: str
        Name of the model. eg. xgboost
    desc: str
        Description of the model. e.g tuned,calibrated
    mean_row: sequence of 8 numbers
        The mean row, in the order
        Accuracy, AUC, Recall, Precision, F1, Kappa, MCC, LogLoss.
        e.g.
        df_res = pyc.pull()
        mean_row = df_res.loc['Mean']
    ofile: str
        Output file name. e.g. 'pycaret_df_eval_lr.csv'
    df_eval: Pandas Dataframe
        Table returned by a previous call. It is not modified; when it is
        not a DataFrame a new empty table is started.
    sort: str
       Column used to sort the table (descending). One of following string:
       Accuracy, AUC, Recall, Precision, F1, Kappa, MCC, LogLoss
    show: bool
       If True, display the table with the ``sort`` column highlighted.

    Returns:
       Pandas Dataframe.
    """
    columns = ['Model', 'Description', 'Accuracy', 'AUC', 'Recall',
               'Precision', 'F1', 'Kappa', 'MCC', 'LogLoss']

    if isinstance(df_eval, pd.DataFrame):
        # Work on a copy with a clean 0..n-1 index so the caller's frame is
        # left untouched and the new row never overwrites an existing label.
        df_eval = df_eval.copy().reset_index(drop=True)
    else:
        df_eval = pd.DataFrame({col: [] for col in columns})

    # unpacking enforces that exactly eight metrics were supplied
    acc,auc,rec,pre,f1,kap,mcc,logloss = mean_row
    df_eval.loc[len(df_eval)] = [name,desc,acc,auc,rec,pre,f1,kap,mcc,logloss]

    df_eval = (df_eval.drop_duplicates()
                      .sort_values(sort,ascending=False)
                      .reset_index(drop=True))

    if show:
        # The Styler needs jinja2, so build it only when it is displayed.
        df_style = df_eval.style.apply(
            lambda ser: ['background: tomato' if ser.name == sort else ''
                         for _ in ser])
        # NOTE(review): `display` is not defined in this file; presumably the
        # builtin injected by IPython/Jupyter -- confirm before reusing this
        # function outside a notebook.
        display(df_style)

    # save the data
    df_eval.to_csv(ofile)

    return df_eval


# Local paths are the default; on Colab the same files are read from GitHub.
path_data_train = '../data/processed/train_cleaned.csv'
path_data_test = '../data/processed/test_cleaned.csv'

if ENV_COLAB:
    path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/processed/train_cleaned.csv'
    path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/processed/test_cleaned.csv'


df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)

print(df_train.shape)
print(df_test.shape)
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat gives the same first-two / last-two rows preview.
pd.concat([df_train.head(2), df_train.tail(2)])

(5634, 39)
(1409, 39)


path_data_test_raw = ('https://raw.githubusercontent.com/'
                      'bhishanpdl/Datasets/master/Projects/'
                      'Telco_Customer_Churn/raw/test.csv')
                      
df_test_raw1 = pd.read_csv(path_data_test_raw,usecols=['customerID'])
df_test_raw1.head(2)


ser_test_ids = df_test_raw1['customerID']
target_name = 'Churn'


# check for nans
df_train.isna().sum().sum()

0


import pycaret
import pycaret.classification as pyc


df_train.shape

(5634, 39)


show_methods(pyc)


USE_GPU = False
if ENV_COLAB:
    USE_GPU = True
    
print(USE_GPU)

False


exp = pyc.setup(df_train,target_name,
                train_size=0.8,
                session_id=SEED,
                use_gpu=USE_GPU,
                preprocess = True,
                categorical_features = None,
                ordinal_features = None,
                high_cardinality_features = None,
                numeric_features = None,
                date_features = None,
                ignore_features = None,
                normalize = False,
                data_split_stratify = True,
                silent=True,
                profile=False,
                log_experiment=False,
                polynomial_features=True,
#                 fix_imbalance=True, # gives attribute error

                )

# use silent = True to check inferred datatypes
# then assign numeric and categorical features yourself.
#
# if sampling = False, 100% of data is used and its too slow
# if sampling = True, we need to enter number eg. 0.3 ourself.

"""
Here, we have data < 25k rows, so I have chosen not to use sampling.

""";


pyc.models(internal=True)[['Name', 'GPU Enabled']]

# google colab does not support cuml and thereby sklearn models
# we need to run blazingsql notebooks (not colab) to use cuml models.


pyc.get_metrics().index

Index(['acc', 'auc', 'recall', 'precision', 'f1', 'kappa', 'mcc'], dtype='object', name='ID')


# add Log Loss metric in pycaret
from sklearn.metrics import log_loss
# log_loss has to be scored on predicted probabilities. With the default
# target='pred' it receives hard 0/1 labels, which turns the metric into a
# rescaled error rate (the LogLoss column comes out as ~34.5 * (1 - Accuracy)).
pyc.add_metric('logloss', 'LogLoss', log_loss, greater_is_better=False,
               target='pred_proba')

Name                                                        LogLoss
Display Name                                                LogLoss
Score Function                <function log_loss at 0x7fb620bd0e50>
Scorer               make_scorer(log_loss, greater_is_better=False)
Target                                                         pred
Args                                                             {}
Greater is Better                                             False
Multiclass                                                     True
Custom                                                         True
Name: logloss, dtype: object


best = pyc.compare_models(sort = 'AUC',fold=5)


model_name = 'lda'
path_df_eval = 'pycaret_df_eval_lda.csv'
model = pyc.create_model(model_name,verbose=False)

mean_row = pyc.pull().loc['Mean'].values
df_eval = compare_new_models(model_name,'default',mean_row,
                             path_df_eval,sort='AUC',df_eval=None)


%%time

# if model takes long time, comment this and save model later (keep it joblib.keep)
# takes long time: xgb lda
# takes short time: lr nb

model_tuned2 = pyc.tune_model(model,n_iter=500,search_library='optuna',
                              fold=5,optimize='AUC',verbose=False)
mean_row = pyc.pull().loc['Mean']

desc = 'tuned,optuna,n_iter=500'
df_eval = compare_new_models(model_name,desc,
        mean_row,path_df_eval,sort='AUC',df_eval=df_eval)

CPU times: user 2min 57s, sys: 27.9 s, total: 3min 25s
Wall time: 1min 41s


%%time

# if model takes long time, comment this and save model later (keep it joblib.keep)
# takes long time: xgb lda
# takes short time: lr nb

model_tuned_F1 = pyc.tune_model(model,n_iter=500,
                                search_library='optuna',
                               fold=5,optimize='F1',verbose=False)
mean_row = pyc.pull().loc['Mean']

desc = 'tuned,optuna,n_iter=500,optimize=F1'
df_eval = compare_new_models(model_name,desc,
            mean_row,path_df_eval,sort='AUC',df_eval=df_eval)

# Wall time: 4min 36s

CPU times: user 3min 16s, sys: 17.3 s, total: 3min 33s
Wall time: 2min 9s


# look at df_eval and find best model


model_best = model_tuned2
model_best_F1 = model_tuned_F1

# Build the paths with os.path.join: plain string concatenation turned the
# Colab directory '.' into hidden files such as '.pycaret_model_best_lda.joblib'.
odir = '.' if ENV_COLAB else '../models/'
os.makedirs(odir, exist_ok=True)  # joblib.dump does not create directories
path_model_best = os.path.join(
    odir, 'pycaret_model_best_' + model_name + '.joblib')
path_model_best_F1 = os.path.join(
    odir, 'pycaret_model_best_F1_' + model_name + '.joblib')

joblib.dump(model_best, path_model_best)
joblib.dump(model_best_F1, path_model_best_F1)


model_best = joblib.load(path_model_best)
model_best_F1 = joblib.load(path_model_best_F1)

# after selecting best model, delete unwanted models
import gc

# `model_tuned1` is not created anywhere in the visible notebook, so a
# missing name is expected here; catch only NameError instead of swallowing
# every exception with a bare `except`.
try:
    del model_tuned1
except NameError:
    pass


gc.collect()

148


# AUC-ROC plot
pyc.plot_model(model_best, plot = 'auc')


# confusion matrix
pyc.plot_model(model_best, plot = 'confusion_matrix')


# evaluate model (click on buttons)
pyc.evaluate_model(model_best)


# pyc.finalize_model?


model_final = pyc.finalize_model(model_best)
print(model_final)

LinearDiscriminantAnalysis(n_components=None, priors=None,
                           shrinkage=0.0001015188180155063, solver='eigen',
                           store_covariance=False, tol=0.0001)


df_preds = pyc.predict_model(model_final,df_test)
df_preds.iloc[-5:,-5:]


ytest = df_preds[target_name].to_numpy().ravel()
ypreds = df_preds['Label'].to_numpy().ravel()

# 'Score' is the confidence of the *predicted* label, not P(Churn=1): the
# printed predictions show Label 0 together with Score 0.94. Flip the score of
# the rows predicted as 0 so that yprobs is the positive-class probability.
yscores = df_preds['Score'].to_numpy().ravel()
yprobs = np.where(ypreds.astype(int) == 1, yscores, 1 - yscores)

# two-column layout [P(class 0), P(class 1)]
yprobs2d = np.c_[1-yprobs,yprobs]


pred_name = 'pycaret_lda'
path_pred = f'../predictions/{pred_name}.csv'

df_preds_out = pd.DataFrame({'customerID': ser_test_ids})
df_preds_out[f'ypreds_{pred_name}'] = ypreds
df_preds_out[f'yprobs_{pred_name}'] = yprobs2d[:,1]

df_preds_out.to_csv(path_pred,index=False)

df_preds_out.head()


def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
    """Evaluate a binary classifier on held-out data and save the summary.

    Prints the classification report and confusion matrix, displays a
    one-row metrics table, writes it to ``model_<model_name>.csv`` and,
    optionally, draws the scikit-plot diagnostic plots.

    Parameters
    -----------
    model_name: str
        Used as the row label and in the output file name.
    ytest: array-like
        True labels.
    ypreds: array-like
        Predicted labels.
    yprobs2d: array-like
        Class probabilities, one column per class; column 1 is used as the
        positive-class score for the AUC.
        (assumes shape (n_samples, 2) -- TODO confirm against callers)
    show_plots: bool
        If True, draw the precision-recall, ROC and confusion-matrix plots.

    Returns:
       None
    """
    import sklearn.metrics as skmetrics
    import os

    acc       = skmetrics.accuracy_score(ytest,ypreds)
    precision = skmetrics.precision_score(ytest,ypreds)
    recall    = skmetrics.recall_score(ytest,ypreds)
    f1        = skmetrics.f1_score(ytest,ypreds)
    # AUC ranks samples by score, so it needs the probabilities; hard labels
    # collapse the ROC curve to a single operating point.
    auc       = skmetrics.roc_auc_score(ytest,np.asarray(yprobs2d)[:,1])

    print(skmetrics.classification_report(ytest,ypreds))
    print(skmetrics.confusion_matrix(ytest,ypreds))

    df_res = pd.DataFrame({'Accuracy':[acc],
                          'Precision': [precision],
                          'Recall': [recall],
                          'F1-score': [f1],
                          'AUC': [auc]},index=[model_name])

    # NOTE(review): `display` is not defined in this file; presumably the
    # builtin injected by IPython/Jupyter.
    display(df_res.style.format("{:.4f}"))

    # Create only the directory that is actually written to, and join the
    # path properly ('.' + 'model_x.csv' used to give the hidden '.model_x.csv').
    out_dir = '.' if ENV_COLAB else '../outputs/'
    os.makedirs(out_dir, exist_ok=True)
    df_res.to_csv(os.path.join(out_dir, f'model_{model_name}.csv'),index=True)

    if show_plots:
        # imported lazily so that show_plots=False does not need scikit-plot
        import scikitplot.metrics as skpmetrics

        skpmetrics.plot_precision_recall(ytest,yprobs2d) # more focus on minority
        skpmetrics.plot_roc_curve(ytest,yprobs2d) # equal focus on both groups
        skpmetrics.plot_confusion_matrix(ytest,ypreds)

model_eval_bin('pycaret_'+model_name,ytest,ypreds,yprobs2d,show_plots=True)

              precision    recall  f1-score   support

           0       0.83      0.88      0.86      1035
           1       0.61      0.51      0.56       374

    accuracy                           0.78      1409
   macro avg       0.72      0.70      0.71      1409
weighted avg       0.77      0.78      0.78      1409

[[910 125]
 [182 192]]


time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))

Time taken to run whole notebook: 0 hr 10 min 2 secs

	Gender	Partner	Dependents	Tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	OnlineBackup	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Contract_Month-to-month	NoSeniorCitizen_Contract_Month-to-month	InternetService_Fiber optic	StreamingTV_NoInternetService	No_OB_DP_TS	TotalServices	SenCit_Dependents	Partner_Dependents	SenCit_Partner	SenCit_Contract	SenCit_TechSupport	SenCit_PayMeth	Contract_mean_totCharges	Contract_totCharges_diff	PayMeth_mean_monthCharges	PayMeth_monthCharges_diff	Tenure_cat
0	0	1	0	36	1	2	1	2	2	2	2	0	2	2	1	1	106.05	3834.40	1	0	0	1	1	2	0	1	1	2	2	1	3683.643192	150.756808	66.703657	39.346343	3
1	1	0	0	10	1	0	0	2	0	0	2	2	0	0	0	1	62.25	612.95	0	1	1	1	1	1	0	0	0	0	2	1	1370.923131	-757.973131	66.703657	-4.453657	0
5632	0	1	1	68	1	2	1	0	2	0	2	2	2	2	1	1	103.75	7039.45	1	0	0	1	1	2	1	2	1	2	2	1	3683.643192	3355.806808	66.703657	37.046343	5
5633	1	0	0	69	1	2	2	1	1	1	1	1	1	2	0	1	23.95	1713.10	1	0	1	0	1	7	0	0	0	2	1	1	3683.643192	-1970.543192	66.703657	-42.753657	5

	0	1	2	3
0	Any	calibrate_model	interpret_model	pull
1	Dict	compare_models	io	pycaret
2	Display	create_model	is_in_colab	remove_metric
3	List	deploy_model	load_config	save_config
4	MLUsecase	enable_colab	load_model	save_model
5	Optional	ensemble_model	models	set_config
6	Tuple	evaluate_model	np	setup
7	Union	finalize_model	optimize_threshold	stack_models
8	add_metric	get_config	pd	traceback
9	automl	get_logs	plot_model	tune_model
10	blend_models	get_metrics	predict_model	warnings

	Description	Value
0	session_id	100
1	Target	Churn
2	Target Type	Binary
3	Label Encoded	0: 0, 1: 1
4	Original Data	(5634, 39)
5	Missing Values	False
6	Numeric Features	6
7	Categorical Features	32
8	Ordinal Features	False
9	High Cardinality Features	False
10	High Cardinality Method	None
11	Transformed Train Set	(4507, 58)
12	Transformed Test Set	(1127, 58)
13	Shuffle Train-Test	True
14	Stratify Train-Test	True
15	Fold Generator	StratifiedKFold
16	Fold Number	10
17	CPU Jobs	-1
18	Use GPU	False
19	Log Experiment	False
20	Experiment Name	clf-default-name
21	USI	07bf
22	Imputation Type	simple
23	Iterative Imputation Iteration	None
24	Numeric Imputer	mean
25	Iterative Imputation Numeric Model	None
26	Categorical Imputer	constant
27	Iterative Imputation Categorical Model	None
28	Unknown Categoricals Handling	least_frequent
29	Normalize	False
30	Normalize Method	None
31	Transformation	False
32	Transformation Method	None
33	PCA	False
34	PCA Method	None
35	PCA Components	None
36	Ignore Low Variance	False
37	Combine Rare Levels	False
38	Rare Level Threshold	None
39	Numeric Binning	False
40	Remove Outliers	False
41	Outliers Threshold	None
42	Remove Multicollinearity	False
43	Multicollinearity Threshold	None
44	Clustering	False
45	Clustering Iteration	None
46	Polynomial Features	True
47	Polynomial Degree	2
48	Trignometry Features	False
49	Polynomial Threshold	0.100000
50	Group Features	False
51	Feature Selection	False
52	Features Selection Threshold	None
53	Feature Interaction	False
54	Feature Ratio	False
55	Interaction Threshold	None
56	Fix Imbalance	False
57	Fix Imbalance Method	SMOTE

	Name	GPU Enabled
ID
lr	Logistic Regression	False
knn	K Neighbors Classifier	False
nb	Naive Bayes	False
dt	Decision Tree Classifier	False
svm	SVM - Linear Kernel	False
rbfsvm	SVM - Radial Kernel	False
gpc	Gaussian Process Classifier	False
mlp	MLP Classifier	False
ridge	Ridge Classifier	False
rf	Random Forest Classifier	False
qda	Quadratic Discriminant Analysis	False
ada	Ada Boost Classifier	False
gbc	Gradient Boosting Classifier	False
lda	Linear Discriminant Analysis	False
et	Extra Trees Classifier	False
xgboost	Extreme Gradient Boosting	False
lightgbm	Light Gradient Boosting Machine	False
catboost	CatBoost Classifier	False
Bagging	Bagging Classifier	False
Stacking	Stacking Classifier	False
Voting	Voting Classifier	False
CalibratedCV	Calibrated Classifier CV	False

	Model	Accuracy	AUC	Recall	Prec.	F1	Kappa	MCC	LogLoss	TT (Sec)
gbc	Gradient Boosting Classifier	0.8103	0.8474	0.5377	0.6816	0.6009	0.4787	0.4847	6.5520	0.4120
lr	Logistic Regression	0.8083	0.8465	0.5552	0.6691	0.6061	0.4809	0.4851	6.6210	0.6320
ada	Ada Boost Classifier	0.8141	0.8462	0.5577	0.6838	0.6142	0.4934	0.4980	6.4218	0.1400
lda	Linear Discriminant Analysis	0.8110	0.8458	0.5552	0.6762	0.6095	0.4864	0.4907	6.5291	0.0320
catboost	CatBoost Classifier	0.8003	0.8451	0.5268	0.6553	0.5836	0.4543	0.4593	6.8969	3.5660
lightgbm	Light Gradient Boosting Machine	0.7941	0.8326	0.5301	0.6366	0.5776	0.4431	0.4469	7.1115	0.6360
xgboost	Extreme Gradient Boosting	0.7870	0.8268	0.5193	0.6179	0.5641	0.4246	0.4275	7.3568	0.9380
rf	Random Forest Classifier	0.7932	0.8265	0.4958	0.6442	0.5602	0.4280	0.4344	7.1420	0.2680
nb	Naive Bayes	0.7131	0.8232	0.8244	0.4782	0.6048	0.4044	0.4414	9.9088	0.0200
et	Extra Trees Classifier	0.7772	0.8053	0.5025	0.5962	0.5451	0.3990	0.4018	7.6938	0.2600
knn	K Neighbors Classifier	0.7737	0.7819	0.4975	0.5902	0.5389	0.3905	0.3937	7.8165	0.0400
dt	Decision Tree Classifier	0.7444	0.6836	0.5485	0.5177	0.5325	0.3569	0.3572	8.8281	0.0280
qda	Quadratic Discriminant Analysis	0.6772	0.5962	0.4238	0.3970	0.4066	0.1870	0.1884	11.1502	0.0200
svm	SVM - Linear Kernel	0.6716	0.0000	0.6169	0.4371	0.4613	0.2623	0.3013	11.3422	0.0400
ridge	Ridge Classifier	0.8103	0.0000	0.5084	0.6972	0.5877	0.4683	0.4784	6.5521	0.0180

Modelling Customer Churn using pycaret

Load the libraries

Colab

Useful Scripts

Load the Data

Data Processing

Modelling Pycaret

Pycaret Setup

Comparing Models

Create Models

Hyperparameter Tuning

Save Model After HPO

Model Evaluation (Validation) : plot_model and evaluate_model

Finalize model (Fit whole train data)

Model Evaluation on Test Data

Time Taken

	Model	Description	Accuracy	AUC	Recall	Precision	F1	Kappa	MCC	LogLoss
0	lda	default	0.810300	0.845700	0.552700	0.675200	0.607200	0.483900	0.488500	6.552000
1	lda	tuned,optuna,n_iter=500	0.793700	0.823400	0.517600	0.638000	0.571200	0.437300	0.441700	7.126800

	PayMeth_monthCharges_diff	Tenure_cat	Label	Score
1404	16.585135	0	1	0.6637
1405	-19.764865	3	0	0.9429
1406	6.085135	1	0	0.5546
1407	30.735135	1	1	0.7503
1408	-2.787658	1	0	0.6080

	customerID	ypreds_pycaret_lda	yprobs_pycaret_lda
0	1794-HBQTJ	0	0.6444
1	0356-OBMAC	0	0.9264
2	4077-CROMM	1	0.6536
3	5442-PPTJY	0	0.9534
4	2333-KWEWW	0	0.9574