References
%%javascript
// Raise the notebook's output auto-scroll threshold to a very large value
// (presumably so that long cell outputs are not collapsed into a scroll box).
IPython.OutputArea.auto_scroll_threshold = 9999;
import time
# Wall-clock timestamp taken when the notebook starts running.
time_start_notebook = time.time()
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
# usual imports
!pip install watermark
!pip install scikit-plot
!pip install alibi
print('Environment: Google Colab')
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
import joblib
from tqdm import tqdm, trange
import plotly_express as px
# modelling
import sklearn
import sklearn.metrics as skmetrics
from sklearn.model_selection import StratifiedKFold
# boosting
import xgboost as xgb
import lightgbm as lgb
# settings
sns.set()
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%matplotlib inline
%load_ext watermark
%watermark -iv
TF version: 2.4.0 Eager execution enabled: False alibi 0.5.5 joblib 0.17.0 pandas 1.1.4 numpy 1.19.4 seaborn 0.11.0 json 2.0.9 xgboost 1.2.0 autopep8 1.5.4 sklearn 0.23.2 lightgbm 3.1.1 plotly_express 0.4.1 tensorflow 2.4.0
def show_methods(obj, ncols=4, contains=None):
    """Return the public attribute names of ``obj`` laid out as a table.

    Parameters
    ----------
    obj : object
        Anything accepted by ``dir``.
    ncols : int, default 4
        Number of columns to spread the names over.
    contains : str or None, default None
        When given, keep only the names that contain this substring.

    Returns
    -------
    pandas.DataFrame
        One name per cell, filled column by column; unused cells hold ''.
    """
    names = []
    for attr in dir(obj):
        # Names starting with an underscore are treated as private.
        if attr[0] == '_':
            continue
        if contains is not None and contains not in attr:
            continue
        names.append(attr)

    # array_split yields ``ncols`` chunks of near-equal length; each chunk
    # becomes a row, so transpose to turn the chunks into columns.
    chunks = np.array_split(names, ncols)
    table = pd.DataFrame(chunks).T
    return table.fillna('')
def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
    """Print, display and save binary-classification metrics for one model.

    Parameters
    ----------
    model_name : str
        Used as the row label of the results table and in the CSV file name.
    ytest : array-like
        True binary labels.
    ypreds : array-like
        Predicted hard labels.
    yprobs2d : array-like, shape (n_samples, 2)
        Predicted class probabilities; column 1 is taken as the positive
        class.
    show_plots : bool, default True
        When true, draw the precision-recall curve, the ROC curve and the
        confusion matrix with scikit-plot.

    Returns
    -------
    None
        Results are printed, displayed and written to ``model_<name>.csv``
        (current directory on Colab, ``../outputs`` otherwise).
    """
    import sklearn.metrics as skmetrics
    import scikitplot.metrics as skpmetrics
    import os

    acc = skmetrics.accuracy_score(ytest, ypreds)
    precision = skmetrics.precision_score(ytest, ypreds)
    recall = skmetrics.recall_score(ytest, ypreds)
    f1 = skmetrics.f1_score(ytest, ypreds)
    # AUC is a ranking metric, so score it on the positive-class probability.
    # Passing hard labels collapses the ROC curve to one point and just
    # reproduces balanced accuracy.
    auc = skmetrics.roc_auc_score(ytest, np.asarray(yprobs2d)[:, 1])

    print(skmetrics.classification_report(ytest, ypreds))
    print(skmetrics.confusion_matrix(ytest, ypreds))

    df_res = pd.DataFrame({'Accuracy': [acc],
                           'Precision': [precision],
                           'Recall': [recall],
                           'F1-score': [f1],
                           'AUC': [auc]}, index=[model_name])

    display(df_res.style.format("{:.4f}"))

    # Only create the directory that is actually written to, and join the
    # path properly: '.' + 'model_x.csv' used to produce a hidden
    # '.model_x.csv' file on Colab.
    out_dir = '.' if ENV_COLAB else '../outputs'
    os.makedirs(out_dir, exist_ok=True)
    df_res.to_csv(os.path.join(out_dir, f'model_{model_name}.csv'), index=True)

    if show_plots:
        skpmetrics.plot_precision_recall(ytest, yprobs2d)  # more focus on minority
        skpmetrics.plot_roc(ytest, yprobs2d)  # equal focus on both groups
        skpmetrics.plot_confusion_matrix(ytest, ypreds)
def get_profit(y_true, y_pred):
    """Return the business profit implied by binary predictions.

    Each true positive earns 400, each false negative costs 200 and each
    false positive costs 100; true negatives contribute nothing.

    Parameters
    ----------
    y_true : array-like
        True labels, encoded as 0 (negative) and 1 (positive).
    y_pred : array-like
        Predicted labels in the same encoding.

    Returns
    -------
    int
        ``400*tp - 200*fn - 100*fp``.
    """
    # labels=[0, 1] pins the matrix to 2x2 even when a fold or prediction
    # vector contains a single class; otherwise the 4-way unpack raises.
    tn, fp, fn, tp = skmetrics.confusion_matrix(
        y_true, y_pred, labels=[0, 1]).ravel()
    profit = 400*tp - 200*fn - 100*fp
    return profit


# Scorer wrapper around get_profit; higher profit is better.
scoring = skmetrics.make_scorer(get_profit, greater_is_better=True)
def set_seed(s=100):
    """Seed NumPy's global RNG and, when available, TensorFlow's.

    Parameters
    ----------
    s : int, default 100
        Seed value passed to both libraries.
    """
    np.random.seed(s)

    # ``tf`` is not imported at file level, so referencing it directly raised
    # NameError. Import it here and skip TensorFlow seeding when the package
    # is not installed.
    try:
        import tensorflow as tf
    except ImportError:
        return
    tf.random.set_seed(s)
# Raw train/test CSVs, relative to the notebook directory.
path_data_train = '../data/raw/train.csv'
path_data_test = '../data/raw/test.csv'

# On Colab there is no local checkout, so read the same files from GitHub.
if ENV_COLAB:
    path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
    path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv'

df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)
print(df_train.shape)
print(df_test.shape)

# Preview the first and last two rows. DataFrame.append was removed in
# pandas 2.0; pd.concat gives the same frame on old and new versions.
pd.concat([df_train.head(2), df_train.tail(2)])
(5634, 21) (1409, 21)
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1621-YNCJH | Female | 0 | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.4 | No |
1 | 7143-BQIBA | Male | 0 | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No |
5632 | 0862-PRCBS | Female | 0 | Yes | Yes | 68 | Yes | Yes | Fiber optic | No | Yes | No | Yes | Yes | Yes | Two year | Yes | Credit card (automatic) | 103.75 | 7039.45 | No |
5633 | 4656-CAURT | Male | 0 | No | No | 69 | Yes | Yes | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Bank transfer (automatic) | 23.95 | 1713.1 | No |
# Name of the label column.
target_name = 'Churn'
# Make ../src importable so the project's util module can be loaded.
sys.path.append('../src')
import util as bp_util
from sklearn.model_selection import train_test_split
# Run the project's cleaning step on both splits.
# NOTE(review): clean_data lives in ../src/util.py and is not shown here;
# judging by the preview below it adds aggregate and one-hot encoded
# columns - confirm against the module.
df_train = bp_util.clean_data(df_train)
df_test = bp_util.clean_data(df_test)
df_train.head(2)
customerID | tenure | MonthlyCharges | TotalCharges | Churn | Contract_TotalCharges_mean | Contract_TotalCharges_mean_diff | PaymentMethod_MonthlyCharges_mean | PaymentMethod_MonthlyCharges_mean_diff | MultipleLines_Ordinal | SeniorCitizen_Not_SenCit | SeniorCitizen_SeniorCitizen | Partner_No_Partner | Partner_Partner | Dependents_Dependents | Dependents_No_Dependents | PaperlessBilling_No_PaperlessBill | PaperlessBilling_PaperlessBill | PhoneService_No_PhoneService | PhoneService_PhoneService | OnlineSecurity_No internet service | OnlineSecurity_No_OnlineSecurity | OnlineSecurity_OnlineSecurity | OnlineBackup_No internet service | OnlineBackup_No_OnlineBackup | OnlineBackup_OnlineBackup | DeviceProtection_DeviceProtection | DeviceProtection_No internet service | DeviceProtection_No_DeviceProtection | TechSupport_No internet service | TechSupport_No_TechSupport | TechSupport_TechSupport | StreamingTV_No internet service | StreamingTV_No_StreamingTV | StreamingTV_StreamingTV | StreamingMovies_No internet service | StreamingMovies_No_StreamingMov | StreamingMovies_StreamingMov | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | Partner_Dependents_No_Partner_Dependents | Partner_Dependents_No_Partner_No_Dependents | Partner_Dependents_Partner_Dependents | Partner_Dependents_Partner_No_Dependents | SeniorCitizen_Dependents_Not_SenCit_Dependents | SeniorCitizen_Dependents_Not_SenCit_No_Dependents | SeniorCitizen_Dependents_SeniorCitizen_Dependents | SeniorCitizen_Dependents_SeniorCitizen_No_Dependents | SeniorCitizen_Partner_Not_SenCit_No_Partner | SeniorCitizen_Partner_Not_SenCit_Partner | SeniorCitizen_Partner_SeniorCitizen_No_Partner | SeniorCitizen_Partner_SeniorCitizen_Partner | SeniorCitizen_Contract_Not_SenCit_Month-to-month | SeniorCitizen_Contract_Not_SenCit_One year | SeniorCitizen_Contract_Not_SenCit_Two year | SeniorCitizen_Contract_SeniorCitizen_Month-to-month | 
SeniorCitizen_Contract_SeniorCitizen_One year | SeniorCitizen_Contract_SeniorCitizen_Two year | SeniorCitizen_TechSupport_Not_SenCit_No internet service | SeniorCitizen_TechSupport_Not_SenCit_No_TechSupport | SeniorCitizen_TechSupport_Not_SenCit_TechSupport | SeniorCitizen_TechSupport_SeniorCitizen_No internet service | SeniorCitizen_TechSupport_SeniorCitizen_No_TechSupport | SeniorCitizen_TechSupport_SeniorCitizen_TechSupport | SeniorCitizen_PaymentMethod_Not_SenCit_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_Not_SenCit_Credit card (automatic) | SeniorCitizen_PaymentMethod_Not_SenCit_Electronic check | SeniorCitizen_PaymentMethod_Not_SenCit_Mailed check | SeniorCitizen_PaymentMethod_SeniorCitizen_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_SeniorCitizen_Credit card (automatic) | SeniorCitizen_PaymentMethod_SeniorCitizen_Electronic check | SeniorCitizen_PaymentMethod_SeniorCitizen_Mailed check | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1621-YNCJH | 36 | 106.05 | 3834.40 | No | 3683.643192 | 150.756808 | 65.801934 | 40.248066 | 2 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 7143-BQIBA | 10 | 62.25 | 612.95 | No | 1370.923131 | -757.973131 | 67.564819 | -5.314819 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
index_name = 'customerID'
target_name = 'Churn'

# Pull the ID column out of both frames (pop removes it in place).
ser_train_ids, ser_test_ids = df_train.pop(index_name), df_test.pop(index_name)

# Pull out the label column and encode Yes/No as 1/0.
m = {'Yes': 1, 'No': 0}
ser_ytrain_full = ser_ytrain = df_train.pop(target_name).map(m)
ser_ytest = df_test.pop(target_name).map(m)

# What remains in the frames is the feature matrix.
df_Xtrain_full, df_Xtest = df_train, df_test

# NumPy views of the full training set and the test set.
Xtrain_full, Xtest = df_Xtrain_full.to_numpy(), df_Xtest.to_numpy()
ytrain_full, ytest = np.array(ser_ytrain_full), np.array(ser_ytest)

# Hold out 20% of the training data for validation.
# Note: ser_ytrain is rebound here to the 80% training part.
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
    df_Xtrain_full, ser_ytrain_full,
    train_size=0.8,
    random_state=SEED,
)

Xtrain, Xvalid = df_Xtrain.to_numpy(), df_Xvalid.to_numpy()
ytrain, yvalid = np.array(ser_ytrain).flatten(), np.array(ser_yvalid).flatten()
model_name = 'lightgbm'
from lightgbm import LGBMClassifier

# Name reported by the custom profit metric below.
metric_profit_name = 'profit'


def metric_profit(y_true, y_prob):
    """Profit metric computed from predicted probabilities.

    Probabilities are rounded to the nearest integer to obtain hard labels,
    then scored with ``get_profit``.

    Returns
    -------
    tuple
        ``(metric_profit_name, profit, True)``; the last item marks the
        metric as greater-is-better. This looks like the shape LightGBM
        expects from a custom eval metric, but the function is not called
        in the visible code - confirm before relying on it.
    """
    labels = np.array(y_true).astype(int)
    hard_preds = np.rint(y_prob)
    return metric_profit_name, get_profit(labels, hard_preds), True
# Hyperparameters for the LightGBM classifier (values kept verbatim).
params = {
    'colsample_bytree': 0.7614216209026772,
    'learning_rate': 0.816821855221229,
    'max_bin': 114,
    'max_depth': 27,
    'min_child_samples': 411,
    'min_child_weight': 2.1524026408064625e-05,
    'min_data_in_bin': 71,
    'min_split_gain': 3.4,
    'n_estimators': 350,
    'num_leaves': 466,
    'reg_alpha': 7.08190801243234e-05,
    'reg_lambda': 0,
    'scale_pos_weight': 7,
    'subsample': 0.571824428670002,
}
params['random_state'] = SEED

model = LGBMClassifier(**params)
# fit() no longer accepts `verbose` in LightGBM >= 4.0, and it only ever
# controlled eval-set logging, which is unused here (no eval_set). Dropping
# it keeps the call working across versions, like the later fit() calls.
model.fit(df_Xtrain_full, ytrain_full)

# Hard labels and class probabilities on the held-out test set.
ypreds = model.predict(df_Xtest)
yprobs2d = model.predict_proba(df_Xtest)

profit = get_profit(ytest, ypreds)
print(f'test profit = ${profit:,d}')
model_eval_bin(model_name, ytest, ypreds, yprobs2d, show_plots=False)
test profit = $83,600 precision recall f1-score support 0 0.95 0.52 0.67 1035 1 0.41 0.93 0.57 374 accuracy 0.63 1409 macro avg 0.68 0.72 0.62 1409 weighted avg 0.81 0.63 0.64 1409 [[537 498] [ 27 347]]
Accuracy | Precision | Recall | F1-score | AUC | |
---|---|---|---|---|---|
lightgbm | 0.6274 | 0.4107 | 0.9278 | 0.5693 | 0.7233 |
import lofo
# List the public names exposed by the lofo package.
show_methods(lofo)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | Dataset | dataset | lofo_importance | plotting |
1 | FLOFOImportance | flofo_importance | plot_importance | utils |
2 | LOFOImportance | infer_defaults |
# Features and target side by side in one frame (target becomes the last column).
X = pd.concat([df_Xtrain_full,ser_ytrain_full],axis=1)
# Peek at the bottom-right corner to check the target column was appended.
X.iloc[-2:,-2:]
SeniorCitizen_PaymentMethod_SeniorCitizen_Mailed check | Churn | |
---|---|---|
5632 | 0 | 0 |
5633 | 0 | 0 |
# for lofo, we need data with target
cols_all = df_Xtrain_full.columns.tolist()
dataset = lofo.Dataset(X, target=target_name,
features=cols_all)
skf = StratifiedKFold(n_splits=5,random_state=SEED, shuffle=True)
scoring = sklearn.metrics.make_scorer(get_profit)
model = lgb.LGBMClassifier(**params)
lofo_imp = lofo.LOFOImportance(dataset, cv=skf, scoring=scoring,
model=model,
n_jobs=-1,
fit_params=None
)
%%time
df_imp = lofo_imp.get_importance()
CPU times: user 1min 18s, sys: 780 ms, total: 1min 18s Wall time: 21.3 s
# Preview the first two rows of the LOFO importance table.
df_imp.head(2)
feature | importance_mean | importance_std | val_imp_0 | val_imp_1 | val_imp_2 | val_imp_3 | val_imp_4 | |
---|---|---|---|---|---|---|---|---|
28 | TechSupport_No_TechSupport | 1300.0 | 1499.333185 | 900 | -400 | 400 | 4000 | 1600 |
66 | SeniorCitizen_PaymentMethod_Not_SenCit_Electronic check | 660.0 | 2541.338230 | 1600 | -2300 | 600 | 4900 | -1500 |
# Plot the LOFO importances; the tall figure is presumably sized to fit every feature.
lofo.plot_importance(df_imp, figsize=(12,30))
# Features whose mean importance lies between -10 and 10, i.e. close to zero.
df_imp[df_imp.importance_mean.between(-10,10)]
feature | importance_mean | importance_std | val_imp_0 | val_imp_1 | val_imp_2 | val_imp_3 | val_imp_4 | |
---|---|---|---|---|---|---|---|---|
46 | SeniorCitizen_Dependents_SeniorCitizen_Dependents | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 |
50 | SeniorCitizen_Partner_SeniorCitizen_No_Partner | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 |
61 | SeniorCitizen_TechSupport_SeniorCitizen_No internet service | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 |
63 | SeniorCitizen_TechSupport_SeniorCitizen_TechSupport | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 |
51 | SeniorCitizen_Partner_SeniorCitizen_Partner | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 |
71 | SeniorCitizen_PaymentMethod_SeniorCitizen_Mailed check | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 |
40 | Partner_Dependents_No_Partner_Dependents | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 |
57 | SeniorCitizen_Contract_SeniorCitizen_Two year | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 |
68 | SeniorCitizen_PaymentMethod_SeniorCitizen_Bank transfer (automatic) | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 |
69 | SeniorCitizen_PaymentMethod_SeniorCitizen_Credit card (automatic) | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 |
70 | SeniorCitizen_PaymentMethod_SeniorCitizen_Electronic check | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 |
56 | SeniorCitizen_Contract_SeniorCitizen_One year | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 |
# Candidate columns to drop: mean LOFO importance between -10 and 10.
cols_bad = df_imp.loc[df_imp['importance_mean'].between(-10, 10), 'feature'].tolist()
# print(cols_bad)

# The computed list is immediately replaced by this hard-coded copy, so the
# selection below does not depend on the LOFO run above.
cols_bad = [
    'SeniorCitizen_Dependents_SeniorCitizen_Dependents',
    'SeniorCitizen_Partner_SeniorCitizen_No_Partner',
    'SeniorCitizen_TechSupport_SeniorCitizen_No internet service',
    'SeniorCitizen_TechSupport_SeniorCitizen_TechSupport',
    'SeniorCitizen_Partner_SeniorCitizen_Partner',
    'SeniorCitizen_PaymentMethod_SeniorCitizen_Mailed check',
    'Partner_Dependents_No_Partner_Dependents',
    'SeniorCitizen_Contract_SeniorCitizen_Two year',
    'SeniorCitizen_PaymentMethod_SeniorCitizen_Bank transfer (automatic)',
    'SeniorCitizen_PaymentMethod_SeniorCitizen_Credit card (automatic)',
    'SeniorCitizen_PaymentMethod_SeniorCitizen_Electronic check',
    'SeniorCitizen_Contract_SeniorCitizen_One year',
]

# Keep every feature that is not flagged, preserving the original order.
cols_selected = [col for col in cols_all if col not in cols_bad]
# Refit on the reduced feature set and report the profit on the test set.
model = lgb.LGBMClassifier(**params)
X2 = df_Xtrain_full[cols_selected]
model.fit(X2,ytrain_full)
ypreds = model.predict(df_Xtest[cols_selected])
print(f"profit excluding bad cols: ${get_profit(ytest,ypreds):,d}")
profit excluding bad cols: $83,500
# Baseline for comparison: refit on all features and report the test-set profit.
model = lgb.LGBMClassifier(**params)
model.fit(df_Xtrain_full,ytrain_full)
ypreds = model.predict(df_Xtest)
print(f"profit keeping bad cols: ${get_profit(ytest,ypreds):,d}")
profit keeping bad cols: $83,600