import time

time_start_notebook = time.time()


%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    # usual imports
    !pip install watermark
    !pip install scikit-plot

    # Special
    !pip install featuretools[complete]
    !pip install composeml
    !pip install evalml

    print('Environment: Google Colab')


import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
import joblib
from tqdm import tqdm_notebook as tqdm
import plotly_express as px
from sklearn.preprocessing import OneHotEncoder
import sklearn.metrics as skmetrics

# special
import imblearn

# featuretools
import composeml as cp
import featuretools as ft
import evalml

# warnings
import warnings

# settings
sns.set()
SEED = 100
# Use fully qualified option names: pandas resolves option keys by regex, and
# the bare 'max_columns' becomes ambiguous (OptionError) on pandas versions
# that also define 'styler.render.max_columns'.
pd.set_option('display.max_columns',100)
pd.set_option('display.max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly

%matplotlib inline
%load_ext watermark
%watermark -iv

2020-12-20 14:35:04,517 featuretools - WARNING    Featuretools failed to load plugin nlp_primitives from library nlp_primitives. For a full stack trace, set logging to debug.
joblib        : 1.0.0
imblearn      : 0.7.0
sys           : 3.8.5 (default, Sep  4 2020, 02:22:02) 
[Clang 10.0.0 ]
json          : 2.0.9
plotly_express: 0.4.1
evalml        : 0.16.1
seaborn       : 0.11.0
composeml     : 0.5.1
featuretools  : 0.22.0
autopep8      : 1.5.4
matplotlib    : 3.3.3
numpy         : 1.19.4
sklearn       : 0.23.2
pandas        : 1.1.5


def show_methods(obj, ncols=4,contains=None):
    """Tabulate the public attribute names of ``obj``.

    Parameters
    ----------
    obj : object
        Anything accepted by ``dir``.
    ncols : int
        Number of columns to spread the names over.
    contains : str or None
        When given, keep only names that contain this substring.

    Returns
    -------
    pandas.DataFrame
        One name per cell, filled column by column; cells left over in the
        shorter columns hold empty strings.
    """
    names = [name for name in dir(obj) if not name.startswith('_')]
    if contains is not None:
        names = [name for name in names if contains in name]

    # Each chunk becomes one row here; transposing turns the chunks into columns.
    chunks = np.array_split(names, ncols)
    table = pd.DataFrame(chunks).transpose()
    return table.fillna('')


# Default: relative paths to the local train/test CSV files.
path_data_train = '../data/raw/train.csv'
path_data_test = '../data/raw/test.csv'

# On Colab, read the files from the GitHub-hosted copy instead of the
# relative local paths.
if ENV_COLAB:
    path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
    path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv'


# Read both splits from the paths selected above.
df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)

print(df_train.shape)
print(df_test.shape)
# Preview the first and last two rows. pd.concat is used because
# DataFrame.append is deprecated and no longer exists in pandas 2.x.
pd.concat([df_train.head(2), df_train.tail(2)])

(5634, 21)
(1409, 21)


target_name = 'Churn'


import plotly_express as px


px.histogram(df_train, x=target_name,height=300,width=300)


px.histogram(df_train, x='gender', color=target_name,width=300,height=200)


df_train['customerID'].nunique() == len(df_train)

True


def clean_data(dfx):
    """Return a cleaned copy of ``dfx``; the input frame is not modified.

    * Drops the ``gender`` column (EDA showed it has no effect);
      ``customerID`` is kept so it can serve as an index feature.
    * Converts ``TotalCharges`` to numeric, replacing values that cannot
      be parsed with 0.
    """
    unused_columns = ['gender']
    cleaned = dfx.copy().drop(columns=unused_columns)

    # Unparseable entries become NaN under errors='coerce', then 0.
    total_charges = pd.to_numeric(cleaned['TotalCharges'], errors='coerce')
    cleaned['TotalCharges'] = total_charges.fillna(0)

    return cleaned


df_train = clean_data(df_train)
df_test  = clean_data(df_test)


# Separate the feature columns from the target in each split and encode
# the target as integers ('No' -> 0, 'Yes' -> 1).
label_codes = {'No':0, 'Yes':1}

df_Xtrain  = df_train.drop(columns=[target_name])
ser_ytrain = df_train[target_name].map(label_codes)

df_Xtest   = df_test.drop(columns=[target_name])
ser_ytest  = df_test[target_name].map(label_codes)


import evalml
from evalml.objectives import CostBenefitMatrix
from evalml import AutoMLSearch


show_methods(evalml)


show_methods(evalml.objectives)


from evalml.objectives import CostBenefitMatrix
# Custom objective: one payoff value per confusion-matrix outcome.
# NOTE(review): the values are presumably a monetary gain/loss per
# customer — confirm the unit and the figures with the business owner.
cost_benefit_matrix = CostBenefitMatrix(true_positive=400,
                                        true_negative=0,
                                        false_positive=-100,
                                        false_negative=-200)


from evalml import AutoMLSearch


%%time

import warnings
# Silence all Python warnings for the remainder of the session.
warnings.simplefilter('ignore')

# Search over candidate pipelines using binary log loss as the objective.
automl = AutoMLSearch(problem_type='binary', objective='log loss binary')
automl.search(df_Xtrain, ser_ytrain)

# Take the best pipeline found by the search and fit it on the training split.
pipe_ll = automl.best_pipeline
pipe_ll.fit(df_Xtrain, ser_ytrain)

Using default limit of max_batches=1.

`X` passed was not a DataTable. EvalML will try to convert the input as a Woodwork DataTable and types will be inferred. To control this behavior, please pass in a Woodwork DataTable instead.
`y` passed was not a DataColumn. EvalML will try to convert the input as a Woodwork DataTable and types will be inferred. To control this behavior, please pass in a Woodwork DataTable instead.
Generating pipelines to search over...
*****************************
* Beginning pipeline search *
*****************************

Optimizing for Log Loss Binary. 
Lower score is better.

Searching up to 1 batches for a total of 9 pipelines. 
Allowed model families: xgboost, linear_model, random_forest, extra_trees, decision_tree, catboost, lightgbm

Batch 1: (1/9) Mode Baseline Binary Classification P... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 9.165
Batch 1: (2/9) Decision Tree Classifier w/ Imputer +... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.579
Batch 1: (3/9) LightGBM Classifier w/ Imputer + Text... Elapsed:00:38
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.440
Batch 1: (4/9) Extra Trees Classifier w/ Imputer + T... Elapsed:01:13
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.430
Batch 1: (5/9) Elastic Net Classifier w/ Imputer + T... Elapsed:01:46
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.579
Batch 1: (6/9) CatBoost Classifier w/ Imputer + Text... Elapsed:02:18
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.550
Batch 1: (7/9) XGBoost Classifier w/ Imputer + Text ... Elapsed:02:50
	Starting cross validation
[14:38:02] WARNING: /Users/runner/miniforge3/conda-bld/xgboost_1607604592557/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[14:38:13] WARNING: /Users/runner/miniforge3/conda-bld/xgboost_1607604592557/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[14:38:25] WARNING: /Users/runner/miniforge3/conda-bld/xgboost_1607604592557/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
	Finished cross validation - mean Log Loss Binary: 0.431
Batch 1: (8/9) Random Forest Classifier w/ Imputer +... Elapsed:03:25
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.421
Batch 1: (9/9) Logistic Regression Classifier w/ Imp... Elapsed:04:01
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.415

Search finished after 04:45            
Best pipeline: Logistic Regression Classifier w/ Imputer + Text Featurization Component + One Hot Encoder + Standard Scaler
Best pipeline Log Loss Binary: 0.415499

GeneratedPipeline(parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'Text Featurization Component':{'text_columns': ['Contract', 'PaymentMethod']}, 'One Hot Encoder':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': None, 'handle_unknown': 'ignore', 'handle_missing': 'error'}, 'Logistic Regression Classifier':{'penalty': 'l2', 'C': 1.0, 'n_jobs': -1, 'multi_class': 'auto', 'solver': 'lbfgs'},})


# Score the log-loss pipeline on the test split with its own objective ...
pipe_score_ll = pipe_ll.score(df_Xtest, ser_ytest, ['log loss binary'])
print (pipe_score_ll)

# ... and with the cost-benefit objective, so it can be compared against the
# pipeline that is optimized for that objective directly.
pipe_score_ll_cbm = pipe_ll.score(df_Xtest, ser_ytest, [cost_benefit_matrix])
print (pipe_score_ll_cbm)

# Calculate total profit across all customers using
# pipeline optimized for Log Loss
# NOTE(review): the score is computed on the test split but scaled by
# len(df_Xtrain) — confirm which customer count is intended here.
total_profit_ll = pipe_score_ll_cbm['Cost Benefit Matrix'] * len(df_Xtrain)
print (f"{total_profit_ll:,.2f}")

OrderedDict([('Log Loss Binary', 0.42941442097922433)])
OrderedDict([('Cost Benefit Matrix', 26.685592618878637)])
150,346.63


%%time
import warnings

# Silence all Python warnings for the remainder of the session.
warnings.simplefilter('ignore')
# Same search as before, but using the custom cost-benefit objective.
automl = AutoMLSearch(problem_type='binary', objective=cost_benefit_matrix)
automl.search(df_Xtrain, ser_ytrain)

# Take the best pipeline found by the search and fit it on the training split.
pipe_cbm = automl.best_pipeline
pipe_cbm.fit(df_Xtrain, ser_ytrain)

Using default limit of max_batches=1.

`X` passed was not a DataTable. EvalML will try to convert the input as a Woodwork DataTable and types will be inferred. To control this behavior, please pass in a Woodwork DataTable instead.
`y` passed was not a DataColumn. EvalML will try to convert the input as a Woodwork DataTable and types will be inferred. To control this behavior, please pass in a Woodwork DataTable instead.
Generating pipelines to search over...
*****************************
* Beginning pipeline search *
*****************************

Optimizing for Cost Benefit Matrix. 
Greater score is better.

Searching up to 1 batches for a total of 9 pipelines. 
Allowed model families: xgboost, linear_model, random_forest, extra_trees, decision_tree, catboost, lightgbm

Batch 1: (1/9) Mode Baseline Binary Classification P... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Cost Benefit Matrix: -53.071
Batch 1: (2/9) Decision Tree Classifier w/ Imputer +... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Cost Benefit Matrix: 17.678
High coefficient of variation (cv >= 0.2) within cross validation scores. Decision Tree Classifier w/ Imputer + Text Featurization Component + One Hot Encoder may not perform as estimated on unseen data.
Batch 1: (3/9) LightGBM Classifier w/ Imputer + Text... Elapsed:00:36
	Starting cross validation
	Finished cross validation - mean Cost Benefit Matrix: 22.009
Batch 1: (4/9) Extra Trees Classifier w/ Imputer + T... Elapsed:01:17
	Starting cross validation
	Finished cross validation - mean Cost Benefit Matrix: 15.602
High coefficient of variation (cv >= 0.2) within cross validation scores. Extra Trees Classifier w/ Imputer + Text Featurization Component + One Hot Encoder may not perform as estimated on unseen data.
Batch 1: (5/9) Elastic Net Classifier w/ Imputer + T... Elapsed:01:58
	Starting cross validation
	Finished cross validation - mean Cost Benefit Matrix: -53.071
Batch 1: (6/9) CatBoost Classifier w/ Imputer + Text... Elapsed:02:30
	Starting cross validation
	Finished cross validation - mean Cost Benefit Matrix: 10.490
High coefficient of variation (cv >= 0.2) within cross validation scores. CatBoost Classifier w/ Imputer + Text Featurization Component may not perform as estimated on unseen data.
Batch 1: (7/9) XGBoost Classifier w/ Imputer + Text ... Elapsed:03:08
	Starting cross validation
[14:45:28] WARNING: /Users/runner/miniforge3/conda-bld/xgboost_1607604592557/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[14:45:41] WARNING: /Users/runner/miniforge3/conda-bld/xgboost_1607604592557/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[14:45:53] WARNING: /Users/runner/miniforge3/conda-bld/xgboost_1607604592557/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
	Finished cross validation - mean Cost Benefit Matrix: 24.299
High coefficient of variation (cv >= 0.2) within cross validation scores. XGBoost Classifier w/ Imputer + Text Featurization Component + One Hot Encoder may not perform as estimated on unseen data.
Batch 1: (8/9) Random Forest Classifier w/ Imputer +... Elapsed:03:46
	Starting cross validation
	Finished cross validation - mean Cost Benefit Matrix: 18.229
Batch 1: (9/9) Logistic Regression Classifier w/ Imp... Elapsed:04:25
	Starting cross validation
	Finished cross validation - mean Cost Benefit Matrix: 26.926

Search finished after 05:04            
Best pipeline: Logistic Regression Classifier w/ Imputer + Text Featurization Component + One Hot Encoder + Standard Scaler
Best pipeline Cost Benefit Matrix: 26.925808
CPU times: user 4min 20s, sys: 48.6 s, total: 5min 8s
Wall time: 5min 11s

GeneratedPipeline(parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'Text Featurization Component':{'text_columns': ['Contract', 'PaymentMethod']}, 'One Hot Encoder':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': None, 'handle_unknown': 'ignore', 'handle_missing': 'error'}, 'Logistic Regression Classifier':{'penalty': 'l2', 'C': 1.0, 'n_jobs': -1, 'multi_class': 'auto', 'solver': 'lbfgs'},})


# Score the cost-benefit pipeline on the test split with the same objective.
pipe_score_cbm = pipe_cbm.score(df_Xtest, ser_ytest, [cost_benefit_matrix])
print (pipe_score_cbm)

# Calculate total profit across all customers using
# pipeline optimized for the Cost Benefit Matrix objective
# NOTE(review): the score is computed on the test split but scaled by
# len(df_Xtrain) — confirm which customer count is intended here.
total_profit_cbm = pipe_score_cbm['Cost Benefit Matrix'] * len(df_Xtrain)
# Print both totals one after the other for a direct comparison.
print (f"{total_profit_cbm:,.2f}")
print (f"{total_profit_ll:,.2f}")

OrderedDict([('Cost Benefit Matrix', 26.685592618878637)])
150,346.63
150,346.63


pipe_cbm

GeneratedPipeline(parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'Text Featurization Component':{'text_columns': ['Contract', 'PaymentMethod']}, 'One Hot Encoder':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': None, 'handle_unknown': 'ignore', 'handle_missing': 'error'}, 'Logistic Regression Classifier':{'penalty': 'l2', 'C': 1.0, 'n_jobs': -1, 'multi_class': 'auto', 'solver': 'lbfgs'},})


show_methods(evalml.model_understanding)


from evalml.model_understanding import graphs as evgraphs
show_methods(evgraphs)


show_methods(pipe_cbm)


# Flat numpy versions of the true labels and of the pipeline's predictions,
# as used by the plotting and metric helpers below.
ytest = np.array(ser_ytest).flatten()
ypreds = np.array(pipe_cbm.predict(df_Xtest)).flatten()
yprobs2d = pipe_cbm.predict_proba(df_Xtest).to_numpy()
# NOTE(review): column 1 is presumably the probability of class 1 (churn) —
# confirm the column order returned by predict_proba.
yprobs = yprobs2d[:,1]


evgraphs.graph_confusion_matrix(ytest, ypreds)


evgraphs.graph_roc_curve(ytest,yprobs)


evgraphs.graph_precision_recall_curve(ytest,yprobs)


show_methods(evgraphs,contains='graph')


evgraphs.graph_partial_dependence(pipe_cbm,df_Xtest,feature='SeniorCitizen')


N = 100
evgraphs.graph_binary_objective_vs_threshold(
    pipe_cbm,df_Xtest.iloc[:N],ytest[:N],objective=cost_benefit_matrix)


evgraphs.graph_permutation_importance(pipe_cbm,df_Xtest.iloc[:N],ytest[:N],
                                      objective=cost_benefit_matrix)


import sklearn.metrics as skmetrics
import scikitplot.metrics as skpmetrics


def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
    """Print, display and save binary-classification metrics for one model.

    Parameters
    ----------
    model_name : str
        Used as the index label of the results table and in the name of the
        CSV file (``model_<model_name>.csv``).
    ytest : array-like
        True binary labels.
    ypreds : array-like
        Predicted hard labels.
    yprobs2d : array-like or None
        Predicted class probabilities. Column 1 is used as the
        positive-class score for the AUC, and the whole array is handed to
        the scikit-plot curve helpers. A 1-D array is used as the score
        directly; with ``None`` the AUC falls back to ``ypreds``.
    show_plots : bool
        When true, draw the precision-recall curve, the ROC curve and the
        confusion matrix.

    Returns
    -------
    None
        Results are printed, displayed and written to a CSV file: in the
        current directory when ``ENV_COLAB`` is true, otherwise in
        ``../outputs`` (created if missing).
    """
    import os
    import numpy as np
    import sklearn.metrics as skmetrics
    import scikitplot.metrics as skpmetrics

    acc       = skmetrics.accuracy_score(ytest,ypreds)
    precision = skmetrics.precision_score(ytest,ypreds)
    recall    = skmetrics.recall_score(ytest,ypreds)
    f1        = skmetrics.f1_score(ytest,ypreds)

    # ROC AUC is a ranking metric: feed it the positive-class probabilities.
    # Thresholded labels collapse the curve to a single operating point.
    if yprobs2d is None:
        auc_scores = ypreds
    else:
        auc_scores = np.asarray(yprobs2d)
        if auc_scores.ndim == 2:
            auc_scores = auc_scores[:, 1]
    auc = skmetrics.roc_auc_score(ytest,auc_scores)

    print(skmetrics.classification_report(ytest,ypreds))
    print(skmetrics.confusion_matrix(ytest,ypreds))

    df_res = pd.DataFrame({'Accuracy':[acc],
                          'Precision': [precision],
                          'Recall': [recall],
                          'F1-score': [f1],
                          'AUC': [auc]},index=[model_name])

    display(df_res.style.format("{:.4f}"))

    # Join directory and file name properly: the bare '.' prefix used to
    # produce a hidden '.model_<name>.csv' on Colab. Only the directory
    # that is actually written to gets created.
    out_dir = '.' if ENV_COLAB else '../outputs'
    os.makedirs(out_dir, exist_ok=True)
    df_res.to_csv(os.path.join(out_dir, f'model_{model_name}.csv'),index=True)

    if show_plots:
        skpmetrics.plot_precision_recall(ytest,yprobs2d) # more focus on minority
        skpmetrics.plot_roc_curve(ytest,yprobs2d) # equal focus on both groups
        skpmetrics.plot_confusion_matrix(ytest,ypreds)

model_eval_bin('evalml',ytest,ypreds,yprobs2d,show_plots=True)

              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.64      0.55      0.59       374

    accuracy                           0.80      1409
   macro avg       0.74      0.72      0.73      1409
weighted avg       0.79      0.80      0.79      1409

[[917 118]
 [167 207]]


# Report the wall-clock time elapsed since `time_start_notebook` was set.
# Round to whole seconds *before* splitting into hr/min/sec so the seconds
# field can never print as "60" (e.g. 119.7 s used to show "1 min 60 secs").
time_taken = time.time() - time_start_notebook
h, rem = divmod(int(round(time_taken)), 60*60)
m, s = divmod(rem, 60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, m, s))

Time taken to run whole notebook: 0 hr 14 min 8 secs

	0	1	2	3
0	AUC	F1Weighted	Precision	cost_benefit_matrix
1	AUCMacro	FraudCost	PrecisionMacro	fraud_cost
2	AUCMicro	LeadScoring	PrecisionMicro	get_all_objective_names
3	AUCWeighted	LogLossBinary	PrecisionWeighted	get_core_objective_names
4	AccuracyBinary	LogLossMulticlass	R2	get_core_objectives
5	AccuracyMulticlass	MAE	Recall	get_non_core_objectives
6	BalancedAccuracyBinary	MCCBinary	RecallMacro	get_objective
7	BalancedAccuracyMulticlass	MCCMulticlass	RecallMicro	lead_scoring
8	BinaryClassificationObjective	MSE	RecallWeighted	multiclass_classification_objective
9	CostBenefitMatrix	MaxError	RegressionObjective	objective_base
10	ExpVariance	MeanSquaredLogError	RootMeanSquaredError	regression_objective
11	F1	MedianAE	RootMeanSquaredLogError	standard_metrics
12	F1Macro	MulticlassClassificationObjective	binary_classification_objective	utils
13	F1Micro	ObjectiveBase

	0	1	2	3
0	LabelBinarizer	get_objective	import_or_raise	sk_partial_dependence
1	ModelFamily	graph_binary_objective_vs_threshold	jupyter_check	sk_permutation_importance
2	NullsInColumnWarning	graph_confusion_matrix	normalize_confusion_matrix	sklearn_auc
3	ProblemTypes	graph_partial_dependence	np	sklearn_confusion_matrix
4	binary_objective_vs_threshold	graph_permutation_importance	partial_dependence	sklearn_precision_recall_curve
5	calculate_permutation_importance	graph_precision_recall_curve	pd	sklearn_roc_curve
6	confusion_matrix	graph_prediction_vs_actual	precision_recall_curve	unique_labels
7	copy	graph_roc_curve	roc_curve	warnings
8	evalml

	0	1	2	3
0	classes_	describe	hyperparameters	predict_proba
1	clone	estimator	input_feature_names	problem_type
2	component_graph	feature_importance	load	random_state
3	compute_estimator_features	fit	model_family	save
4	custom_hyperparameters	get_component	name	score
5	custom_name	graph	parameters	summary
6	default_parameters	graph_feature_importance	predict	threshold

Modelling Customer Churn

Load the libraries

Colab

Useful Scripts

Load the Data

Data Processing

Modelling: evalML

Select the Objective

AutoML Search

Default Objective

Objective: Cost Benefit Matrix

Model Evaluation

Time Taken

	customerID	gender	Partner	Dependents	tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	OnlineBackup	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Churn
0	1621-YNCJH	Female	Yes	No	36	Yes	Yes	Fiber optic	Yes	Yes	Yes	Yes	No	Yes	Two year	Yes	Credit card (automatic)	106.05	3834.4	No
1	7143-BQIBA	Male	No	No	10	Yes	No	DSL	Yes	No	No	Yes	Yes	No	Month-to-month	No	Bank transfer (automatic)	62.25	612.95	No
5632	0862-PRCBS	Female	Yes	Yes	68	Yes	Yes	Fiber optic	No	Yes	No	Yes	Yes	Yes	Two year	Yes	Credit card (automatic)	103.75	7039.45	No
5633	4656-CAURT	Male	No	No	69	Yes	Yes	No	No internet service	No internet service	No internet service	No internet service	No internet service	No internet service	Two year	No	Bank transfer (automatic)	23.95	1713.1	No

	0	1	2	3
0	AutoMLSearch	exceptions	preprocessing	skopt
1	automl	model_family	print_info	tuners
2	data_checks	model_understanding	problem_types	utils
3	demos	objectives	sklearn	warnings
4	evalml	pipelines

	0	1	2	3
0	binary_objective_vs_threshold	explain_predictions_best_worst	graph_precision_recall_curve	partial_dependence
1	calculate_permutation_importance	graph_binary_objective_vs_threshold	graph_prediction_vs_actual	precision_recall_curve
2	confusion_matrix	graph_confusion_matrix	graph_roc_curve	prediction_explanations
3	explain_prediction	graph_partial_dependence	graphs	roc_curve
4	explain_predictions	graph_permutation_importance	normalize_confusion_matrix