import time
time_start_notebook = time.time()
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    # usual imports
    !pip install watermark
    !pip install scikit-plot

    # special
    !pip install featuretools[complete]

    # HPO: hyperband search
    !git clone https://github.com/thuijskens/scikit-hyperband.git
    sys.path.append('scikit-hyperband/hyperband')

    # update modules
    !pip uninstall -y xgboost
    !pip install -U xgboost

    print('Environment: Google Colab')
from hyperband_search import HyperbandSearchCV
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
sns.set()
import joblib
from tqdm import tqdm_notebook as tqdm
import plotly_express as px
from sklearn.preprocessing import OneHotEncoder
import sklearn.metrics as skmetrics
# special
import imblearn
import featuretools as ft
# warnings
import warnings
from sklearn.exceptions import ConvergenceWarning
from scipy.optimize.linesearch import LineSearchWarning
warnings.simplefilter('ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)
warnings.simplefilter('ignore', category=LineSearchWarning)
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%load_ext watermark
%watermark -iv
2020-12-23 18:52:18,691 featuretools - WARNING  Featuretools failed to load plugin nlp_primitives from library nlp_primitives. For a full stack trace, set logging to debug.
numpy          : 1.19.4
pandas         : 1.1.5
seaborn        : 0.11.0
sklearn        : 0.23.2
joblib         : 1.0.0
imblearn       : 0.7.0
autopep8       : 1.5.4
featuretools   : 0.22.0
sys            : 3.8.5 (default, Sep  4 2020, 02:22:02) [Clang 10.0.0 ]
plotly_express : 0.4.1
json           : 2.0.9
matplotlib     : 3.3.3
def show_methods(obj, ncols=4, contains=None):
    """Show the public attributes/methods of an object as a dataframe grid."""
    lst = [i for i in dir(obj) if i[0] != '_']
    if contains is not None:
        lst = [i for i in lst if contains in i]
    df = pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
    return df
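As a quick illustration (not part of the original run), the helper can be used to list, for example, the scoring functions in sklearn.metrics:
# illustrative usage of show_methods; skmetrics is sklearn.metrics imported above
show_methods(skmetrics, ncols=4, contains='score')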
path_data_train = '../data/raw/train.csv'
path_data_test = '../data/raw/test.csv'

if ENV_COLAB:
    path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
    path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv'

df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)
print(df_train.shape)
print(df_test.shape)
df_train.head(2).append(df_train.tail(2))
(5634, 21)
(1409, 21)
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1621-YNCJH | Female | 0 | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.4 | No |
1 | 7143-BQIBA | Male | 0 | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No |
5632 | 0862-PRCBS | Female | 0 | Yes | Yes | 68 | Yes | Yes | Fiber optic | No | Yes | No | Yes | Yes | Yes | Two year | Yes | Credit card (automatic) | 103.75 | 7039.45 | No |
5633 | 4656-CAURT | Male | 0 | No | No | 69 | Yes | Yes | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Bank transfer (automatic) | 23.95 | 1713.1 | No |
ser_test_ids = df_test['customerID']
target_name = 'Churn'
import plotly_express as px
px.histogram(df_train, x=target_name,height=300,width=300)
px.histogram(df_train, x='gender', color=target_name,width=300,height=200)
df_train['customerID'].nunique() == len(df_train)
True
def clean_data(dfx):
    dfx = dfx.copy()

    # keep customerID as the index feature;
    # from the EDA above, gender has no visible effect on churn
    cols_drop = ['gender']
    dfx = dfx.drop(cols_drop, axis=1)

    # TotalCharges contains blank strings; coerce to numeric and impute with 0
    dfx['TotalCharges'] = pd.to_numeric(dfx['TotalCharges'],
                                        errors='coerce').fillna(0)
    return dfx
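Before applying the cleaner, it helps to know how many raw TotalCharges entries are non-numeric; a small sanity-check sketch (not part of the original run):
# illustrative check: blank/non-numeric TotalCharges become NaN under errors='coerce'
# and are then imputed with 0 inside clean_data
n_bad = pd.to_numeric(df_train['TotalCharges'], errors='coerce').isna().sum()
print(f'Non-numeric TotalCharges values in train: {n_bad}')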
df_train = clean_data(df_train)
df_test = clean_data(df_test)
df_Xtrain = df_train.drop(target_name,axis=1)
ser_ytrain = df_train[target_name].map({'No':0, 'Yes':1})
df_Xtest = df_test.drop(target_name,axis=1)
ser_ytest = df_test[target_name].map({'No':0, 'Yes':1})
from featuretools import variable_types as vtypes
show_methods(vtypes)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | Boolean | FilePath | PandasTypes | api |
1 | Categorical | FullName | PhoneNumber | camel_to_snake |
2 | ClassNameDescriptor | IPAddress | SubRegionCode | find_variable_types |
3 | CountryCode | Id | Text | graph_variable_types |
4 | DEFAULT_DTYPE_VALUES | Index | TimeIndex | list_variable_types |
5 | DateOfBirth | LatLong | Timedelta | np |
6 | Datetime | NaturalLanguage | URL | pd |
7 | DatetimeTimeIndex | Numeric | Unknown | utils |
8 | Discrete | NumericTimeIndex | Variable | variable |
9 | EmailAddress | Ordinal | ZIPCode | warnings |
df_Xtrain.head(2)
customerID | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1621-YNCJH | 0 | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.40 |
1 | 7143-BQIBA | 0 | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 |
# df_train[cols_obj].apply(lambda x: pd.Series.unique(x))
cols_obj = df_train.select_dtypes('object').columns.tolist()
df_train[cols_obj].apply(lambda x: pd.Series.nunique(x)).sort_values()
Partner                2
Dependents             2
PhoneService           2
PaperlessBilling       2
Churn                  2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaymentMethod          4
customerID          5634
dtype: int64
# customerID will be used as the index.
# The other object columns have very low cardinality, so either ordinal or one-hot encoding works.
cols_cat = [i for i in cols_obj if i not in ['customerID',target_name]]
print(cols_cat)
['Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
cols_num = df_train.select_dtypes('number').columns.tolist()
cols_num
['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
cols_num = [i for i in cols_num if i not in ['SeniorCitizen']]
cols_cat += ['SeniorCitizen']
features = cols_cat + cols_num
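SeniorCitizen is stored as an integer but is really a 0/1 flag, which is why it is moved from the numeric list to the categorical list above; a quick illustrative check:
# SeniorCitizen only takes the values 0 and 1, so it is treated as categorical
df_train['SeniorCitizen'].value_counts()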
def get_fm(dfx, cols_num=cols_num, cols_cat=cols_cat,
           index='customerID'):
    """Build a feature matrix from a dataframe using featuretools DFS."""
    dic_cat = {i: vtypes.Categorical for i in cols_cat}
    dic_num = {i: vtypes.Numeric for i in cols_num}
    all_variable_types = {**dic_cat, **dic_num}

    es = ft.EntitySet("data")
    es.entity_from_dataframe(entity_id="data",
                             dataframe=dfx,
                             index=index,
                             time_index=None,
                             variable_types=all_variable_types)

    new_entity_id = "SeniorCitizen"
    es.normalize_entity(base_entity_id="data",
                        new_entity_id=new_entity_id,
                        index=new_entity_id)

    # Adding this gave a worse result:
    # new_entity_id = "Dependents"
    # es.normalize_entity(base_entity_id="data",
    #                     new_entity_id=new_entity_id,
    #                     index=new_entity_id)

    trans_primitives = [
        'divide_numeric',  # pairwise ratios of all numeric features (not other types)
    ]

    feature_matrix, features = ft.dfs(entityset=es,
                                      target_entity="data",
                                      trans_primitives=trans_primitives,
                                      drop_exact=[],
                                      verbose=True)

    df_out = feature_matrix
    cols_cat = list(df_out.select_dtypes('object').columns)
    df_out = pd.get_dummies(df_out, columns=cols_cat, drop_first=False)
    return df_out
df_Xtrain_new = get_fm(df_Xtrain)
df_Xtrain_new.head(2)
Built 71 features Elapsed: 00:00 | Progress: 100%|██████████
SeniorCitizen | tenure | MonthlyCharges | TotalCharges | MonthlyCharges / TotalCharges | MonthlyCharges / tenure | TotalCharges / MonthlyCharges | TotalCharges / tenure | tenure / MonthlyCharges | tenure / TotalCharges | SeniorCitizen.COUNT(data) | SeniorCitizen.MAX(data.MonthlyCharges) | SeniorCitizen.MAX(data.TotalCharges) | SeniorCitizen.MAX(data.tenure) | SeniorCitizen.MEAN(data.MonthlyCharges) | SeniorCitizen.MEAN(data.TotalCharges) | SeniorCitizen.MEAN(data.tenure) | SeniorCitizen.MIN(data.MonthlyCharges) | SeniorCitizen.MIN(data.TotalCharges) | SeniorCitizen.MIN(data.tenure) | SeniorCitizen.NUM_UNIQUE(data.Contract) | SeniorCitizen.NUM_UNIQUE(data.Dependents) | SeniorCitizen.NUM_UNIQUE(data.DeviceProtection) | SeniorCitizen.NUM_UNIQUE(data.InternetService) | SeniorCitizen.NUM_UNIQUE(data.MultipleLines) | SeniorCitizen.NUM_UNIQUE(data.OnlineBackup) | SeniorCitizen.NUM_UNIQUE(data.OnlineSecurity) | SeniorCitizen.NUM_UNIQUE(data.PaperlessBilling) | SeniorCitizen.NUM_UNIQUE(data.Partner) | SeniorCitizen.NUM_UNIQUE(data.PaymentMethod) | SeniorCitizen.NUM_UNIQUE(data.PhoneService) | SeniorCitizen.NUM_UNIQUE(data.StreamingMovies) | SeniorCitizen.NUM_UNIQUE(data.StreamingTV) | SeniorCitizen.NUM_UNIQUE(data.TechSupport) | SeniorCitizen.SKEW(data.MonthlyCharges) | SeniorCitizen.SKEW(data.TotalCharges) | SeniorCitizen.SKEW(data.tenure) | SeniorCitizen.STD(data.MonthlyCharges) | SeniorCitizen.STD(data.TotalCharges) | SeniorCitizen.STD(data.tenure) | SeniorCitizen.SUM(data.MonthlyCharges) | SeniorCitizen.SUM(data.TotalCharges) | SeniorCitizen.SUM(data.tenure) | Partner_No | Partner_Yes | Dependents_No | Dependents_Yes | PhoneService_No | PhoneService_Yes | MultipleLines_No | MultipleLines_No phone service | MultipleLines_Yes | InternetService_DSL | InternetService_Fiber optic | InternetService_No | OnlineSecurity_No | OnlineSecurity_No internet service | OnlineSecurity_Yes | OnlineBackup_No | OnlineBackup_No internet service | OnlineBackup_Yes | DeviceProtection_No | DeviceProtection_No internet service | DeviceProtection_Yes | TechSupport_No | TechSupport_No internet service | TechSupport_Yes | StreamingTV_No | StreamingTV_No internet service | StreamingTV_Yes | StreamingMovies_No | StreamingMovies_No internet service | StreamingMovies_Yes | Contract_Month-to-month | Contract_One year | Contract_Two year | PaperlessBilling_No | PaperlessBilling_Yes | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | SeniorCitizen.MODE(data.Contract)_Month-to-month | SeniorCitizen.MODE(data.Dependents)_No | SeniorCitizen.MODE(data.DeviceProtection)_No | SeniorCitizen.MODE(data.InternetService)_Fiber optic | SeniorCitizen.MODE(data.MultipleLines)_No | SeniorCitizen.MODE(data.MultipleLines)_Yes | SeniorCitizen.MODE(data.OnlineBackup)_No | SeniorCitizen.MODE(data.OnlineSecurity)_No | SeniorCitizen.MODE(data.PaperlessBilling)_Yes | SeniorCitizen.MODE(data.Partner)_No | SeniorCitizen.MODE(data.PaymentMethod)_Electronic check | SeniorCitizen.MODE(data.PhoneService)_Yes | SeniorCitizen.MODE(data.StreamingMovies)_No | SeniorCitizen.MODE(data.StreamingMovies)_Yes | SeniorCitizen.MODE(data.StreamingTV)_No | SeniorCitizen.MODE(data.StreamingTV)_Yes | SeniorCitizen.MODE(data.TechSupport)_No | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
customerID | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
1621-YNCJH | 0 | 36 | 106.05 | 3834.40 | 0.027658 | 2.945833 | 36.156530 | 106.511111 | 0.339463 | 0.009389 | 4743 | 118.75 | 8672.45 | 72 | 61.71498 | 2166.606304 | 32.078431 | 18.25 | 0.0 | 0 | 3 | 2 | 3 | 3 | 3 | 3 | 3 | 2 | 2 | 4 | 2 | 3 | 3 | 3 | -0.088151 | 1.037776 | 0.255781 | 30.247656 | 2221.937336 | 24.622527 | 292714.15 | 10276213.7 | 152148 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 |
7143-BQIBA | 0 | 10 | 62.25 | 612.95 | 0.101558 | 6.225000 | 9.846586 | 61.295000 | 0.160643 | 0.016315 | 4743 | 118.75 | 8672.45 | 72 | 61.71498 | 2166.606304 | 32.078431 | 18.25 | 0.0 | 0 | 3 | 2 | 3 | 3 | 3 | 3 | 3 | 2 | 2 | 4 | 2 | 3 | 3 | 3 | -0.088151 | 1.037776 | 0.255781 | 30.247656 | 2221.937336 | 24.622527 | 292714.15 | 10276213.7 | 152148 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 |
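Besides the original numeric columns, the new features are the six pairwise ratios from divide_numeric, the SeniorCitizen aggregations, and the one-hot encoded categoricals. A small inspection sketch (not in the original run):
# illustrative: the divide_numeric features can be picked out by the ' / ' in their names
ratio_cols = [c for c in df_Xtrain_new.columns if ' / ' in c]
print(len(ratio_cols), ratio_cols)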
df_Xtest_new = get_fm(df_Xtest)
df_Xtest_new.head(2)
Built 71 features Elapsed: 00:00 | Progress: 100%|██████████
SeniorCitizen | tenure | MonthlyCharges | TotalCharges | MonthlyCharges / TotalCharges | MonthlyCharges / tenure | TotalCharges / MonthlyCharges | TotalCharges / tenure | tenure / MonthlyCharges | tenure / TotalCharges | SeniorCitizen.COUNT(data) | SeniorCitizen.MAX(data.MonthlyCharges) | SeniorCitizen.MAX(data.TotalCharges) | SeniorCitizen.MAX(data.tenure) | SeniorCitizen.MEAN(data.MonthlyCharges) | SeniorCitizen.MEAN(data.TotalCharges) | SeniorCitizen.MEAN(data.tenure) | SeniorCitizen.MIN(data.MonthlyCharges) | SeniorCitizen.MIN(data.TotalCharges) | SeniorCitizen.MIN(data.tenure) | SeniorCitizen.NUM_UNIQUE(data.Contract) | SeniorCitizen.NUM_UNIQUE(data.Dependents) | SeniorCitizen.NUM_UNIQUE(data.DeviceProtection) | SeniorCitizen.NUM_UNIQUE(data.InternetService) | SeniorCitizen.NUM_UNIQUE(data.MultipleLines) | SeniorCitizen.NUM_UNIQUE(data.OnlineBackup) | SeniorCitizen.NUM_UNIQUE(data.OnlineSecurity) | SeniorCitizen.NUM_UNIQUE(data.PaperlessBilling) | SeniorCitizen.NUM_UNIQUE(data.Partner) | SeniorCitizen.NUM_UNIQUE(data.PaymentMethod) | SeniorCitizen.NUM_UNIQUE(data.PhoneService) | SeniorCitizen.NUM_UNIQUE(data.StreamingMovies) | SeniorCitizen.NUM_UNIQUE(data.StreamingTV) | SeniorCitizen.NUM_UNIQUE(data.TechSupport) | SeniorCitizen.SKEW(data.MonthlyCharges) | SeniorCitizen.SKEW(data.TotalCharges) | SeniorCitizen.SKEW(data.tenure) | SeniorCitizen.STD(data.MonthlyCharges) | SeniorCitizen.STD(data.TotalCharges) | SeniorCitizen.STD(data.tenure) | SeniorCitizen.SUM(data.MonthlyCharges) | SeniorCitizen.SUM(data.TotalCharges) | SeniorCitizen.SUM(data.tenure) | Partner_No | Partner_Yes | Dependents_No | Dependents_Yes | PhoneService_No | PhoneService_Yes | MultipleLines_No | MultipleLines_No phone service | MultipleLines_Yes | InternetService_DSL | InternetService_Fiber optic | InternetService_No | OnlineSecurity_No | OnlineSecurity_No internet service | OnlineSecurity_Yes | OnlineBackup_No | OnlineBackup_No internet service | OnlineBackup_Yes | DeviceProtection_No | DeviceProtection_No internet service | DeviceProtection_Yes | TechSupport_No | TechSupport_No internet service | TechSupport_Yes | StreamingTV_No | StreamingTV_No internet service | StreamingTV_Yes | StreamingMovies_No | StreamingMovies_No internet service | StreamingMovies_Yes | Contract_Month-to-month | Contract_One year | Contract_Two year | PaperlessBilling_No | PaperlessBilling_Yes | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | SeniorCitizen.MODE(data.Contract)_Month-to-month | SeniorCitizen.MODE(data.Dependents)_No | SeniorCitizen.MODE(data.DeviceProtection)_No | SeniorCitizen.MODE(data.InternetService)_Fiber optic | SeniorCitizen.MODE(data.MultipleLines)_No | SeniorCitizen.MODE(data.MultipleLines)_Yes | SeniorCitizen.MODE(data.OnlineBackup)_No | SeniorCitizen.MODE(data.OnlineSecurity)_No | SeniorCitizen.MODE(data.PaperlessBilling)_Yes | SeniorCitizen.MODE(data.Partner)_No | SeniorCitizen.MODE(data.Partner)_Yes | SeniorCitizen.MODE(data.PaymentMethod)_Electronic check | SeniorCitizen.MODE(data.PhoneService)_Yes | SeniorCitizen.MODE(data.StreamingMovies)_No | SeniorCitizen.MODE(data.StreamingMovies)_Yes | SeniorCitizen.MODE(data.StreamingTV)_No | SeniorCitizen.MODE(data.StreamingTV)_Yes | SeniorCitizen.MODE(data.TechSupport)_No | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
customerID | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
1794-HBQTJ | 0 | 1 | 48.6 | 48.6 | 1.000000 | 48.600000 | 1.00000 | 48.600000 | 0.020576 | 0.020576 | 1158 | 118.60 | 8684.80 | 72 | 62.389983 | 2219.692358 | 32.658031 | 18.7 | 0.00 | 0 | 3 | 2 | 3 | 3 | 3 | 3 | 3 | 2 | 2 | 4 | 2 | 3 | 3 | 3 | -0.070659 | 1.072957 | 0.228395 | 30.601714 | 2278.817867 | 24.658823 | 72247.60 | 2570403.75 | 37818 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 1 |
0356-OBMAC | 1 | 56 | 99.9 | 5706.3 | 0.017507 | 1.783929 | 57.12012 | 101.898214 | 0.560561 | 0.009814 | 251 | 117.35 | 8436.25 | 72 | 79.820120 | 2727.039641 | 32.314741 | 19.2 | 19.45 | 1 | 3 | 2 | 3 | 3 | 3 | 3 | 3 | 2 | 2 | 4 | 2 | 3 | 3 | 3 | -0.974773 | 0.663470 | 0.237100 | 22.771121 | 2395.833841 | 24.655882 | 20034.85 | 684486.95 | 8111 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 1 |
def post_process_fm(fm, thr_miss=0.95, thr_corr=0.95):
    """Post-process a feature matrix.

    1. remove duplicated features
    2. remove features with too many missing values
    3. remove zero-variance features
    4. remove highly collinear features
    """
    # Remove duplicated features
    start_features = fm.shape[1]
    fm = fm.iloc[:, ~fm.columns.duplicated()]
    n_dups = start_features - fm.shape[1]
    print(f'There were {n_dups} duplicated features.')

    fm = fm.replace({np.inf: np.nan, -np.inf: np.nan})

    # Remove the id and label columns if present
    # (here customerID is the index and Churn was already dropped, so this is just a guard)
    idname = 'customerID'
    targetname = 'Churn'
    cols_drop_id = [i for i in fm.columns if idname in i]
    cols_drop_target = [i for i in fm.columns if targetname in i]
    cols_drop_id_target = cols_drop_id + cols_drop_target
    print('Dropping ids and label: ', cols_drop_id_target)
    fm = fm.drop(cols_drop_id_target, axis=1)

    # One-hot encoding (if necessary)
    fm = pd.get_dummies(fm)
    n_features_start = fm.shape[1]
    print('Original shape: ', fm.shape)

    # Fraction of missing values per column
    df_miss = pd.DataFrame(fm.isnull().sum())
    df_miss['frac'] = df_miss[0] / fm.shape[0]
    df_miss.sort_values('frac', ascending=False, inplace=True)

    # Columns missing above the threshold
    cols_miss = list(df_miss[df_miss['frac'] > thr_miss].index)
    n_cols_miss = len(cols_miss)

    # Remove missing columns
    fm = fm[[i for i in fm if i not in cols_miss]]
    print('{} missing columns with threshold: {}.'.format(
        n_cols_miss, thr_miss))

    # Zero variance
    df_unq_ct = pd.DataFrame(fm.nunique()).sort_values(0, ascending=True)
    cols_zero_var = list(df_unq_ct[df_unq_ct[0] == 1].index)
    n_cols_zero_var = len(cols_zero_var)

    # Remove zero-variance columns
    fm = fm[[i for i in fm if i not in cols_zero_var]]
    print('{} zero variance columns.'.format(n_cols_zero_var))

    # Correlations
    df_corr = fm.corr()

    # Extract the upper triangle of the correlation matrix
    df_upper = df_corr.where(np.triu(np.ones(df_corr.shape), k=1).astype(bool))

    # Select the features with absolute correlation above the threshold
    cols_drop = [col for col in df_upper.columns
                 if any(df_upper[col].abs() > thr_corr)]
    n_collinear = len(cols_drop)
    fm = fm[[i for i in fm if i not in cols_drop]]
    print('{} collinear columns removed with correlation above {}.'.format(
        n_collinear, thr_corr))

    n_total_cols_removed = n_dups + n_cols_miss + n_cols_zero_var + n_collinear
    print('Total columns removed: ', n_total_cols_removed)
    print('Shape after feature selection: {}.'.format(fm.shape))
    return fm
df_Xtrain_good = post_process_fm(df_Xtrain_new,thr_miss=0.9,thr_corr=0.9)
df_Xtest_good = post_process_fm(df_Xtest_new,thr_miss=0.9,thr_corr=0.9)
There were 0 duplicated features.
Dropping ids and label:  []
Original shape:  (5634, 99)
0 missing columns with threshold: 0.9.
26 zero variance columns.
39 collinear columns removed with correlation above 0.9.
Total columns removed:  65
Shape after feature selection: (5634, 34).

There were 0 duplicated features.
Dropping ids and label:  []
Original shape:  (1409, 100)
0 missing columns with threshold: 0.9.
25 zero variance columns.
41 collinear columns removed with correlation above 0.9.
Total columns removed:  66
Shape after feature selection: (1409, 34).
# if some features are not common to train and test, exclude them.
cols_exclude = np.setdiff1d(df_Xtrain_good.columns, df_Xtest_good.columns)
cols_exclude
array([], dtype=object)
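Because post_process_fm is run on train and test independently, the two matrices can in principle keep different columns (39 vs 41 collinear columns were dropped above). A defensive option, sketched here but not run in this notebook, is to reindex the test matrix to the train columns:
# Optional sketch (not run): force the test features to match the train features,
# filling any missing columns with 0 and dropping test-only columns.
# df_Xtest_good = df_Xtest_good.reindex(columns=df_Xtrain_good.columns, fill_value=0)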
Xtr = df_Xtrain_good.fillna(0)
ytr = np.array(ser_ytrain)
Xtx = df_Xtest_good.fillna(0)
ytx = np.array(ser_ytest)
# Standard scaling gave a worse result here, so it is left commented out.
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# scaler.fit(Xtr)
# Xtr = scaler.transform(Xtr)
# Xtx = scaler.transform(Xtx)
from xgboost import XGBClassifier
model = XGBClassifier(random_state=SEED,subsample=0.9,max_depth=3)
model.fit(Xtr, ytr)
ypreds = model.predict(Xtx)
skmetrics.confusion_matrix(ytx, ypreds)
[18:52:21] WARNING: /Users/runner/miniforge3/conda-bld/xgboost_1607604592557/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
/Users/poudel/opt/miniconda3/envs/ft/lib/python3.8/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
array([[921, 114], [169, 205]])
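The two XGBoost warnings above suggest their own fix; with the 1.3.x release used here, they can presumably be silenced by constructing the model as in this sketch:
# sketch (not run): same model with the deprecation warnings silenced (xgboost >= 1.3)
# model = XGBClassifier(random_state=SEED, subsample=0.9, max_depth=3,
#                       use_label_encoder=False, eval_metric='logloss')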
# XGBClassifier?
from sklearn.linear_model import LogisticRegression
params_fixed = {'dual': False,
'random_state': SEED,
'n_jobs': 1
}
params_best = {'C': 0.42679058013626753, 'max_iter': 1000,
'penalty': 'l2', 'solver': 'lbfgs'}
params = params_fixed
params.update(params_best)
model = LogisticRegression(**params)
model.fit(Xtr, ytr)
ypreds = model.predict(Xtx)
skmetrics.confusion_matrix(ytx, ypreds)
array([[925, 110], [168, 206]])
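The params_best values above presumably come from an earlier tuning run that is not shown in this notebook; as a stand-in, a minimal sketch with scikit-learn's RandomizedSearchCV (rather than the HyperbandSearchCV imported earlier) could look like this:
# sketch only (not run): one way such a C value could be searched
# from scipy.stats import loguniform
# from sklearn.model_selection import RandomizedSearchCV
# search = RandomizedSearchCV(LogisticRegression(max_iter=1000, random_state=SEED),
#                             param_distributions={'C': loguniform(1e-3, 1e2)},
#                             n_iter=20, scoring='roc_auc', cv=5, random_state=SEED)
# search.fit(Xtr, ytr); print(search.best_params_)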
ytest = np.array(ser_ytest)
yprobs2d = model.predict_proba(Xtx)
pred_name = 'featuretools_lr'
path_pred = f'../predictions/{pred_name}.csv'
df_preds_out = pd.DataFrame({'customerID': ser_test_ids})
df_preds_out[f'ypreds_{pred_name}'] = ypreds
df_preds_out[f'yprobs_{pred_name}'] = yprobs2d[:,1]
df_preds_out.to_csv(path_pred,index=False)
df_preds_out.head()
customerID | ypreds_featuretools_lr | yprobs_featuretools_lr | |
---|---|---|---|
0 | 1794-HBQTJ | 0 | 0.445886 |
1 | 0356-OBMAC | 0 | 0.078101 |
2 | 4077-CROMM | 1 | 0.508567 |
3 | 5442-PPTJY | 0 | 0.022152 |
4 | 2333-KWEWW | 0 | 0.019473 |
def model_eval_bin(model_name, ytest, ypreds, yprobs2d, show_plots=True):
    """Print binary classification metrics, save them to csv, and optionally plot."""
    import sklearn.metrics as skmetrics
    import scikitplot.metrics as skpmetrics
    import os

    acc       = skmetrics.accuracy_score(ytest, ypreds)
    precision = skmetrics.precision_score(ytest, ypreds)
    recall    = skmetrics.recall_score(ytest, ypreds)
    f1        = skmetrics.f1_score(ytest, ypreds)
    auc       = skmetrics.roc_auc_score(ytest, ypreds)

    print(skmetrics.classification_report(ytest, ypreds))
    print(skmetrics.confusion_matrix(ytest, ypreds))

    df_res = pd.DataFrame({'Accuracy':  [acc],
                           'Precision': [precision],
                           'Recall':    [recall],
                           'F1-score':  [f1],
                           'AUC':       [auc]}, index=[model_name])
    display(df_res.style.format("{:.4f}"))

    if not os.path.isdir('../outputs'):
        os.makedirs('../outputs')
    o = './' if ENV_COLAB else '../outputs/'
    df_res.to_csv(o + f'model_{model_name}.csv', index=True)

    if show_plots:
        skpmetrics.plot_precision_recall(ytest, yprobs2d)  # more focus on the minority class
        skpmetrics.plot_roc_curve(ytest, yprobs2d)         # equal focus on both classes
        skpmetrics.plot_confusion_matrix(ytest, ypreds)
model_eval_bin('LR',ytest,ypreds,yprobs2d,show_plots=True)
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.65      0.55      0.60       374

    accuracy                           0.80      1409
   macro avg       0.75      0.72      0.73      1409
weighted avg       0.79      0.80      0.80      1409

[[925 110]
 [168 206]]
Accuracy | Precision | Recall | F1-score | AUC | |
---|---|---|---|---|---|
LR | 0.8027 | 0.6519 | 0.5508 | 0.5971 | 0.7223 |
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 0 min 8 secs