References
NOTES:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
import time
time_start_notebook = time.time()
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
# usual imports
!pip install watermark
!pip install scikit-plot
!pip install alibi
print('Environment: Google Colab')
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
import joblib
from tqdm import tqdm, trange
import plotly_express as px
# modelling
import sklearn
import sklearn.metrics as skmetrics
from sklearn.model_selection import StratifiedKFold
# boosting
import xgboost as xgb
import lightgbm as lgb
# settings
sns.set()
# Seed passed as random_state wherever a reproducible split is needed.
SEED = 100
# Use fully-qualified option names: the short forms are resolved by regex
# matching and become ambiguous (OptionError) once pandas ships more than
# one option ending in "max_columns".
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 200)
pd.set_option('plotting.backend', 'matplotlib')  # matplotlib, bokeh, altair, plotly
%matplotlib inline
%load_ext watermark
%watermark -iv
json 2.0.9 sklearn 0.23.2 lightgbm 3.1.1 xgboost 1.2.0 numpy 1.19.4 seaborn 0.11.0 autopep8 1.5.4 pandas 1.1.4 plotly_express 0.4.1 joblib 0.17.0
def show_methods(obj, ncols=4,contains=None):
    """Tabulate the public attribute names of ``obj``.

    Names are taken from ``dir(obj)``; every name starting with an
    underscore is dropped. When ``contains`` is given, only names that
    include that substring are kept. The remaining names are cut into
    ``ncols`` chunks with ``np.array_split`` and laid out one chunk per
    column, shorter columns being padded with empty strings.

    Returns the resulting pandas DataFrame.
    """
    public_names = []
    for attr in dir(obj):
        if attr.startswith('_'):
            continue
        if contains is not None and contains not in attr:
            continue
        public_names.append(attr)

    chunks = np.array_split(public_names, ncols)
    table = pd.DataFrame(chunks).transpose()
    return table.fillna('')
def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
    """Report, save and optionally plot metrics for a binary classifier.

    Parameters
    ----------
    model_name : str
        Used as the index label of the result table and in the CSV name.
    ytest : array-like
        True binary labels.
    ypreds : array-like
        Predicted hard labels.
    yprobs2d : array-like
        Predicted class probabilities, one column per class; column 1 is
        read as the positive-class probability.
    show_plots : bool, default True
        When true, draw the precision-recall curve, ROC curve and
        confusion matrix with scikit-plot.

    Side effects
    ------------
    Prints the classification report and confusion matrix, displays the
    one-row metric table, and writes it to ``model_{model_name}.csv`` in
    the working directory on Colab or in ``../outputs`` otherwise.
    """
    import sklearn.metrics as skmetrics
    import scikitplot.metrics as skpmetrics
    import os

    acc = skmetrics.accuracy_score(ytest,ypreds)
    precision = skmetrics.precision_score(ytest,ypreds)
    recall = skmetrics.recall_score(ytest,ypreds)
    f1 = skmetrics.f1_score(ytest,ypreds)
    # AUC is a ranking metric: score it on the positive-class probability,
    # not on thresholded labels (which collapses the curve to one point).
    auc = skmetrics.roc_auc_score(ytest, np.asarray(yprobs2d)[:, 1])

    print(skmetrics.classification_report(ytest,ypreds))
    print(skmetrics.confusion_matrix(ytest,ypreds))

    df_res = pd.DataFrame({'Accuracy':[acc],
                           'Precision': [precision],
                           'Recall': [recall],
                           'F1-score': [f1],
                           'AUC': [auc]},index=[model_name])
    display(df_res.style.format("{:.4f}"))

    # Join the path properly: a bare '.' prefix used to yield a hidden
    # file named '.model_<name>.csv'. Only create ../outputs when it is
    # the directory actually being written to.
    if ENV_COLAB:
        out_dir = '.'
    else:
        out_dir = '../outputs'
        os.makedirs(out_dir, exist_ok=True)
    df_res.to_csv(os.path.join(out_dir, f'model_{model_name}.csv'),index=True)

    if show_plots:
        skpmetrics.plot_precision_recall(ytest,yprobs2d) # more focus on minority
        skpmetrics.plot_roc(ytest,yprobs2d) # equal focus on both groups
        skpmetrics.plot_confusion_matrix(ytest,ypreds)
def get_profit(y_true, y_pred):
    """Return the business profit implied by a set of binary predictions.

    The payoff is ``400 * TP - 200 * FN - 100 * FP``; true negatives
    contribute nothing.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted binary labels.
    """
    cm = skmetrics.confusion_matrix(y_true,y_pred)
    if cm.shape != (2, 2):
        # Only one class showed up (e.g. a tiny fold or a constant
        # prediction): pin the 0/1 labels so the matrix is still 2x2 and
        # the four-way unpack below cannot fail.
        cm = skmetrics.confusion_matrix(y_true,y_pred,labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    profit = 400*tp - 200*fn - 100*fp
    return profit


# Scorer wrapper so the profit metric can be passed as `scoring=` to
# scikit-learn model-selection utilities; higher profit is better.
scoring = skmetrics.make_scorer(get_profit, greater_is_better=True)
def set_seed(s=100):
    """Seed NumPy's global RNG and, when available, TensorFlow's.

    Parameters
    ----------
    s : int, default 100
        Seed value.
    """
    np.random.seed(s)
    # TensorFlow is never imported at file level, so referencing `tf`
    # directly raised NameError. Import it lazily and skip the TF seed
    # when the package is not installed.
    try:
        import tensorflow as tf
    except ImportError:
        return
    tf.random.set_seed(s)
# Local CSV locations; on Colab the same files are read from GitHub.
path_data_train = '../data/raw/train.csv'
path_data_test = '../data/raw/test.csv'
if ENV_COLAB:
    path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
    path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv'

df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)
print(df_train.shape)
print(df_test.shape)
# Preview the first and last two rows. DataFrame.append was removed in
# pandas 2.0, so stack the two slices with pd.concat instead.
pd.concat([df_train.head(2), df_train.tail(2)])
(5634, 21) (1409, 21)
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1621-YNCJH | Female | 0 | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.4 | No |
1 | 7143-BQIBA | Male | 0 | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No |
5632 | 0862-PRCBS | Female | 0 | Yes | Yes | 68 | Yes | Yes | Fiber optic | No | Yes | No | Yes | Yes | Yes | Two year | Yes | Credit card (automatic) | 103.75 | 7039.45 | No |
5633 | 4656-CAURT | Male | 0 | No | No | 69 | Yes | Yes | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Bank transfer (automatic) | 23.95 | 1713.1 | No |
# Name of the label column.
target_name = 'Churn'
# Make modules under ../src importable (presumably where util.py lives).
sys.path.append('../src')
import util as bp_util
from sklearn.model_selection import train_test_split
# Run the project's clean_data helper on both splits; its implementation
# is not visible in this file.
df_train = bp_util.clean_data(df_train)
df_test = bp_util.clean_data(df_test)
df_train.head(2)
customerID | tenure | MonthlyCharges | TotalCharges | Churn | Contract_TotalCharges_mean | Contract_TotalCharges_mean_diff | PaymentMethod_MonthlyCharges_mean | PaymentMethod_MonthlyCharges_mean_diff | MultipleLines_Ordinal | SeniorCitizen_Not_SenCit | SeniorCitizen_SeniorCitizen | Partner_No_Partner | Partner_Partner | Dependents_Dependents | Dependents_No_Dependents | PaperlessBilling_No_PaperlessBill | PaperlessBilling_PaperlessBill | PhoneService_No_PhoneService | PhoneService_PhoneService | OnlineSecurity_No internet service | OnlineSecurity_No_OnlineSecurity | OnlineSecurity_OnlineSecurity | OnlineBackup_No internet service | OnlineBackup_No_OnlineBackup | OnlineBackup_OnlineBackup | DeviceProtection_DeviceProtection | DeviceProtection_No internet service | DeviceProtection_No_DeviceProtection | TechSupport_No internet service | TechSupport_No_TechSupport | TechSupport_TechSupport | StreamingTV_No internet service | StreamingTV_No_StreamingTV | StreamingTV_StreamingTV | StreamingMovies_No internet service | StreamingMovies_No_StreamingMov | StreamingMovies_StreamingMov | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | Partner_Dependents_No_Partner_Dependents | Partner_Dependents_No_Partner_No_Dependents | Partner_Dependents_Partner_Dependents | Partner_Dependents_Partner_No_Dependents | SeniorCitizen_Dependents_Not_SenCit_Dependents | SeniorCitizen_Dependents_Not_SenCit_No_Dependents | SeniorCitizen_Dependents_SeniorCitizen_Dependents | SeniorCitizen_Dependents_SeniorCitizen_No_Dependents | SeniorCitizen_Partner_Not_SenCit_No_Partner | SeniorCitizen_Partner_Not_SenCit_Partner | SeniorCitizen_Partner_SeniorCitizen_No_Partner | SeniorCitizen_Partner_SeniorCitizen_Partner | SeniorCitizen_Contract_Not_SenCit_Month-to-month | SeniorCitizen_Contract_Not_SenCit_One year | SeniorCitizen_Contract_Not_SenCit_Two year | SeniorCitizen_Contract_SeniorCitizen_Month-to-month | 
SeniorCitizen_Contract_SeniorCitizen_One year | SeniorCitizen_Contract_SeniorCitizen_Two year | SeniorCitizen_TechSupport_Not_SenCit_No internet service | SeniorCitizen_TechSupport_Not_SenCit_No_TechSupport | SeniorCitizen_TechSupport_Not_SenCit_TechSupport | SeniorCitizen_TechSupport_SeniorCitizen_No internet service | SeniorCitizen_TechSupport_SeniorCitizen_No_TechSupport | SeniorCitizen_TechSupport_SeniorCitizen_TechSupport | SeniorCitizen_PaymentMethod_Not_SenCit_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_Not_SenCit_Credit card (automatic) | SeniorCitizen_PaymentMethod_Not_SenCit_Electronic check | SeniorCitizen_PaymentMethod_Not_SenCit_Mailed check | SeniorCitizen_PaymentMethod_SeniorCitizen_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_SeniorCitizen_Credit card (automatic) | SeniorCitizen_PaymentMethod_SeniorCitizen_Electronic check | SeniorCitizen_PaymentMethod_SeniorCitizen_Mailed check | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1621-YNCJH | 36 | 106.05 | 3834.40 | No | 3683.643192 | 150.756808 | 65.801934 | 40.248066 | 2 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 7143-BQIBA | 10 | 62.25 | 612.95 | No | 1370.923131 | -757.973131 | 67.564819 | -5.314819 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# Column names for the row identifier and the label.
index_name = 'customerID'
target_name = 'Churn'
# pop() removes the id column from each frame in place and returns it.
ser_train_ids = df_train.pop(index_name)
ser_test_ids = df_test.pop(index_name)
# Encode the Yes/No label as 1/0 while removing it from the feature frames.
m = {'Yes':1, 'No':0}
ser_ytrain = df_train.pop(target_name).map(m)
ser_ytest = df_test.pop(target_name).map(m)
# These are aliases, not copies: df_Xtrain_full is the same object as df_train.
df_Xtrain_full = df_train
df_Xtest = df_test
ser_ytrain_full = ser_ytrain
# Self-assignment: has no effect.
ser_ytest = ser_ytest
# NumPy views of the full training set and the test set.
Xtrain_full = df_Xtrain_full.to_numpy()
Xtest = df_Xtest.to_numpy()
ytrain_full = np.array(ser_ytrain_full)
ytest = np.array(ser_ytest)
# Hold out 20% of the training data for validation, seeded with SEED.
# NOTE(review): no `stratify=` is passed, so class balance may differ
# between the two parts — confirm this is intended.
# This rebinds ser_ytrain to the 80% training portion.
df_Xtrain,df_Xvalid,ser_ytrain,ser_yvalid = train_test_split(
df_Xtrain_full,ser_ytrain_full,
random_state=SEED,
train_size=0.8
)
# NumPy counterparts of the train/validation parts.
Xtrain = df_Xtrain.to_numpy()
Xvalid = df_Xvalid.to_numpy()
ytrain = np.array(ser_ytrain).flatten()
yvalid = np.array(ser_yvalid).flatten()
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
df_Xtrain.head(2)
tenure | MonthlyCharges | TotalCharges | Contract_TotalCharges_mean | Contract_TotalCharges_mean_diff | PaymentMethod_MonthlyCharges_mean | PaymentMethod_MonthlyCharges_mean_diff | MultipleLines_Ordinal | SeniorCitizen_Not_SenCit | SeniorCitizen_SeniorCitizen | Partner_No_Partner | Partner_Partner | Dependents_Dependents | Dependents_No_Dependents | PaperlessBilling_No_PaperlessBill | PaperlessBilling_PaperlessBill | PhoneService_No_PhoneService | PhoneService_PhoneService | OnlineSecurity_No internet service | OnlineSecurity_No_OnlineSecurity | OnlineSecurity_OnlineSecurity | OnlineBackup_No internet service | OnlineBackup_No_OnlineBackup | OnlineBackup_OnlineBackup | DeviceProtection_DeviceProtection | DeviceProtection_No internet service | DeviceProtection_No_DeviceProtection | TechSupport_No internet service | TechSupport_No_TechSupport | TechSupport_TechSupport | StreamingTV_No internet service | StreamingTV_No_StreamingTV | StreamingTV_StreamingTV | StreamingMovies_No internet service | StreamingMovies_No_StreamingMov | StreamingMovies_StreamingMov | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | Partner_Dependents_No_Partner_Dependents | Partner_Dependents_No_Partner_No_Dependents | Partner_Dependents_Partner_Dependents | Partner_Dependents_Partner_No_Dependents | SeniorCitizen_Dependents_Not_SenCit_Dependents | SeniorCitizen_Dependents_Not_SenCit_No_Dependents | SeniorCitizen_Dependents_SeniorCitizen_Dependents | SeniorCitizen_Dependents_SeniorCitizen_No_Dependents | SeniorCitizen_Partner_Not_SenCit_No_Partner | SeniorCitizen_Partner_Not_SenCit_Partner | SeniorCitizen_Partner_SeniorCitizen_No_Partner | SeniorCitizen_Partner_SeniorCitizen_Partner | SeniorCitizen_Contract_Not_SenCit_Month-to-month | SeniorCitizen_Contract_Not_SenCit_One year | SeniorCitizen_Contract_Not_SenCit_Two year | SeniorCitizen_Contract_SeniorCitizen_Month-to-month | 
SeniorCitizen_Contract_SeniorCitizen_One year | SeniorCitizen_Contract_SeniorCitizen_Two year | SeniorCitizen_TechSupport_Not_SenCit_No internet service | SeniorCitizen_TechSupport_Not_SenCit_No_TechSupport | SeniorCitizen_TechSupport_Not_SenCit_TechSupport | SeniorCitizen_TechSupport_SeniorCitizen_No internet service | SeniorCitizen_TechSupport_SeniorCitizen_No_TechSupport | SeniorCitizen_TechSupport_SeniorCitizen_TechSupport | SeniorCitizen_PaymentMethod_Not_SenCit_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_Not_SenCit_Credit card (automatic) | SeniorCitizen_PaymentMethod_Not_SenCit_Electronic check | SeniorCitizen_PaymentMethod_Not_SenCit_Mailed check | SeniorCitizen_PaymentMethod_SeniorCitizen_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_SeniorCitizen_Credit card (automatic) | SeniorCitizen_PaymentMethod_SeniorCitizen_Electronic check | SeniorCitizen_PaymentMethod_SeniorCitizen_Mailed check | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
973 | 47 | 110.85 | 5275.8 | 3018.965636 | 2256.834364 | 65.801934 | 45.048066 | 2 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
54 | 47 | 58.60 | 2723.4 | 1370.923131 | 1352.476869 | 67.564819 | -8.964819 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# Continuous columns to be power-transformed; all other columns pass through.
cols_scale = ['tenure', 'MonthlyCharges', 'TotalCharges',
              'Contract_TotalCharges_mean',
              'Contract_TotalCharges_mean_diff',
              'PaymentMethod_MonthlyCharges_mean',
              'PaymentMethod_MonthlyCharges_mean_diff']

# PowerTransformer defaults to the Yeo-Johnson method with standardization.
transformer = ColumnTransformer([('yeo_johnson', PowerTransformer(), cols_scale)],
                                remainder='passthrough')
# Fit on the training part only so no test information leaks in.
transformer.fit(df_Xtrain)

# ColumnTransformer emits the transformed columns first and the
# passthrough remainder afterwards. Label the output in that order rather
# than assuming it matches the input order, then restore the original
# column layout. Keeping the row index keeps the frames aligned with
# their label Series.
cols_all_in = df_Xtrain.columns.tolist()
cols_out = cols_scale + [c for c in cols_all_in if c not in cols_scale]

df_Xtrain_scaled = pd.DataFrame(transformer.transform(df_Xtrain),
                                columns=cols_out,
                                index=df_Xtrain.index)[cols_all_in]
df_Xtest_scaled = pd.DataFrame(transformer.transform(df_Xtest),
                               columns=cols_out,
                               index=df_Xtest.index)[cols_all_in]
# Logistic-regression hyper-parameters, reused by every model below.
# NOTE(review): the C value presumably comes from an earlier tuning run — confirm.
params = {'C': 0.42679058013626753, 'max_iter': 1000,
'penalty': 'l2', 'solver': 'lbfgs'}
# Baseline: fit on the untransformed training features and score the
# test-set predictions with the profit metric.
model = LogisticRegression(**params)
model.fit(df_Xtrain, ser_ytrain)
ypreds = model.predict(df_Xtest)
profit = get_profit(ytest,ypreds)
print(f'profit without scaled = ${profit:,d}')
profit without scaled = $36,100
# Same model and hyper-parameters, but trained and evaluated on the
# power-transformed features for comparison with the baseline profit.
model = LogisticRegression(**params)
model.fit(df_Xtrain_scaled, ser_ytrain)
ypreds = model.predict(df_Xtest_scaled)
profit = get_profit(ytest,ypreds)
print(f'profit with scaled = ${profit:,d}')
profit with scaled = $37,200
import interpret
from interpret.data import ClassHistogram
show_methods(interpret)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | NullHandler | init_show_server | set_visualize_provider | status_show_server |
1 | api | logging | show | utils |
2 | data | preserve | show_link | version |
3 | get_show_addr | provider | shutdown_show_server | visual |
4 | get_visualize_provider | set_show_addr |
show_methods(interpret.data)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | ClassHistogram | Marginal | response |
# Build a data explanation of the (untransformed) training features
# against the training labels and render it.
hist = ClassHistogram().explain_data(df_Xtrain, ytrain, name = 'Train Data')
interpret.show(hist)
from interpret import glassbox
show_methods(glassbox)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | ClassificationTree | ExplainableBoostingRegressor | RegressionTree | linear |
1 | DecisionListClassifier | LinearRegression | decisiontree | skoperules |
2 | ExplainableBoostingClassifier | LogisticRegression | ebm |
# Rebind `model` to InterpretML's glassbox logistic regression with the
# same hyper-parameters, fitted on the power-transformed training frame.
model = glassbox.LogisticRegression(**params)
model.fit(df_Xtrain_scaled,ytrain)
<interpret.glassbox.linear.LogisticRegression at 0x7fd9ffdee110>
show_methods(model)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | X_maxs_ | categorical_uniq_ | feature_types | predict |
1 | X_mins_ | explain_global | fit | predict_proba |
2 | available_explanations | explain_local | global_selector | score |
3 | bin_counts_ | explainer_type | kwargs | sk_model_ |
4 | bin_edges_ | feature_names | linear_class |
# Global explanation of the fitted glassbox logistic regression.
model_global = model.explain_global(name='logistic regression')
interpret.show(model_global)

# Local explanations for the first five test rows. The model was fitted on
# the power-transformed features, so it must be given the transformed test
# rows, and the rows must be paired with their own test labels (not the
# first five training labels).
model_local = model.explain_local(
    df_Xtest_scaled.iloc[:5], ytest[:5],
    name='logistic regression')
interpret.show(model_local)
from interpret import perf as in_perf
show_methods(in_perf,5)
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | PR | ROC | RegressionPerf | curve | regression |
# ROC explanation of the glassbox logistic regression. The model was
# fitted on power-transformed features, so evaluate it on the transformed
# test frame rather than on the raw one.
e = in_perf.ROC(model.predict_proba)
e = e.explain_perf(df_Xtest_scaled, ytest,
                   name='Logistic Regression')
interpret.show(e)
# Fully spelled-out constructor call kept for reference; the instance is
# not assigned to anything, so it has no effect on later cells.
# NOTE(review): the values look like the library defaults for the
# installed interpret version — confirm before relying on them.
glassbox.ExplainableBoostingClassifier(
feature_names = None,
feature_types = None,
max_bins = 255,
max_interaction_bins = 32,
binning = 'quantile',
mains = 'all',
interactions = 0,
outer_bags = 16,
inner_bags = 0,
learning_rate = 0.01,
validation_size = 0.15,
early_stopping_rounds = 50,
early_stopping_tolerance = 0.0001,
max_rounds = 5000,
max_leaves = 3,
min_samples_leaf = 2,
n_jobs = -2,
random_state = 42,
)
# glassbox.ExplainableBoostingClassifier?
# Explainable boosting classifier with default settings, trained on the
# same power-transformed features as the logistic regression.
model_gbm = glassbox.ExplainableBoostingClassifier()
model_gbm.fit(df_Xtrain_scaled,ytrain);

# Both models were fitted on transformed features, so their ROC
# explanations must be computed on the transformed test frame too.
lr_perf = (in_perf.ROC(model.predict_proba)
           .explain_perf(df_Xtest_scaled, ytest,
                         name='Logistic Regression')
           )
gbm_perf = (in_perf.ROC(model_gbm.predict_proba)
            .explain_perf(df_Xtest_scaled, ytest,
                          name='Boosting')
            )

lr_global = model.explain_global(name='Logistic Regression')
gbm_global = model_gbm.explain_global(name='Boosting')

# One dashboard with the data histogram plus both models' explanations.
interpret.show([hist,
                lr_global, lr_perf,
                gbm_global, gbm_perf],
               share_tables=True)
from interpret import blackbox
show_methods(blackbox)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | LimeTabular | PartialDependence | lime | sensitivity |
1 | MorrisSensitivity | ShapKernel | partialdependence | shap |
cols_all = df_Xtrain.columns.tolist()
from interpret.blackbox import ShapKernel

# `model` was fitted on the power-transformed features, so both the
# background sample and the explained rows must be in that same feature
# space; feeding the raw arrays would probe the model with inputs on the
# wrong scale.
# A single median row serves as the SHAP background.
background_val = np.median(df_Xtrain_scaled.to_numpy(), axis=0).reshape(1, -1)
shap = ShapKernel(predict_fn=model.predict_proba,
                  data=background_val,
                  feature_names=cols_all)
shap_local = shap.explain_local(df_Xtest_scaled.to_numpy()[:5], ytest[:5],
                                name='SHAP')
interpret.show(shap_local)
from interpret.blackbox import MorrisSensitivity

# Probe the model in the feature space it was trained on: `model` was
# fitted on the power-transformed frame, so the sensitivity analysis must
# sample from the transformed training data rather than the raw array.
sensitivity = MorrisSensitivity(
    predict_fn=model.predict_proba, data=df_Xtrain_scaled.to_numpy())
sensitivity_global = sensitivity.explain_global(
    name="Global Sensitivity")
interpret.show(sensitivity_global)
from interpret.blackbox import PartialDependence

# Partial dependence must be computed over the transformed training data,
# because `model` was fitted on the power-transformed features.
pdp = PartialDependence(
    predict_fn=model.predict_proba, data=df_Xtrain_scaled.to_numpy())
pdp_global = pdp.explain_global(name='Partial Dependence')
interpret.show(pdp_global)

# Combined dashboard of the black-box explanations.
interpret.show([shap_local, sensitivity_global, pdp_global])