References
import time
time_start_notebook = time.time()
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
# usual imports
!pip install watermark
!pip install scikit-plot
!pip install catboost
# HPO
!git clone https://github.com/thuijskens/scikit-hyperband.git
sys.path.append('scikit-hyperband/hyperband')
print('Environment: Google Colab')
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
import joblib
from tqdm import tqdm, trange
import plotly_express as px
# modelling
import sklearn.metrics as skmetrics
from sklearn.model_selection import StratifiedKFold
# boosting
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
# settings
sns.set()
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%matplotlib inline
%load_ext watermark
%watermark -iv
seaborn 0.11.0 joblib 0.17.0 xgboost 1.2.0 plotly_express 0.4.1 autopep8 1.5.2 json 2.0.9 pandas 1.1.4 lightgbm 2.3.1 numpy 1.19.4 catboost 0.23.2
def show_methods(obj, ncols=4,contains=None):
lst = [i for i in dir(obj) if i[0]!='_' ]
if contains is not None:
lst = [i for i in lst if contains in i]
df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
return df
def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
import sklearn.metrics as skmetrics
import scikitplot.metrics as skpmetrics
import os
acc = skmetrics.accuracy_score(ytest,ypreds)
precision = skmetrics.precision_score(ytest,ypreds)
recall = skmetrics.recall_score(ytest,ypreds)
f1 = skmetrics.f1_score(ytest,ypreds)
auc = skmetrics.roc_auc_score(ytest,ypreds)
print(skmetrics.classification_report(ytest,ypreds))
print(skmetrics.confusion_matrix(ytest,ypreds))
df_res = pd.DataFrame({'Accuracy':[acc],
'Precision': [precision],
'Recall': [recall],
'F1-score': [f1],
'AUC': [auc]},index=[model_name])
display(df_res.style.format("{:.4f}"))
if not os.path.isdir('../outputs'):
os.makedirs('../outputs')
o = '.' if ENV_COLAB else '../outputs/'
df_res.to_csv(o+f'model_{model_name}.csv',index=True)
if show_plots:
skpmetrics.plot_precision_recall(ytest,yprobs2d) # more focus on minority
skpmetrics.plot_roc_curve(ytest,yprobs2d) # equal focus on both groups
skpmetrics.plot_confusion_matrix(ytest,ypreds)
def get_profit(y_true, y_pred):
tn, fp, fn, tp = skmetrics.confusion_matrix(y_true,y_pred).ravel()
profit = 400*tp - 200*fn - 100*fp
return profit
scoring = skmetrics.make_scorer(get_profit, greater_is_better=True)
path_data_train = '../data/raw/train.csv'
path_data_test = '../data/raw/test.csv'
if ENV_COLAB:
path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv'
df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)
print(df_train.shape)
print(df_test.shape)
df_train.head(2).append(df_train.tail(2))
(5634, 21) (1409, 21)
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1621-YNCJH | Female | 0 | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.4 | No |
1 | 7143-BQIBA | Male | 0 | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No |
5632 | 0862-PRCBS | Female | 0 | Yes | Yes | 68 | Yes | Yes | Fiber optic | No | Yes | No | Yes | Yes | Yes | Two year | Yes | Credit card (automatic) | 103.75 | 7039.45 | No |
5633 | 4656-CAURT | Male | 0 | No | No | 69 | Yes | Yes | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Bank transfer (automatic) | 23.95 | 1713.1 | No |
target_name = 'Churn'
px.histogram(df_train, x=target_name,height=300,width=300)
px.histogram(df_train, x='gender', color=target_name,height=300,width=300)
df_train['TotalCharges'] = pd.to_numeric(df_train['TotalCharges'],errors='coerce').fillna(0)
df_test['TotalCharges'] = pd.to_numeric(df_test['TotalCharges'],errors='coerce').fillna(0)
df_train['SeniorCitizen'] = df_train['SeniorCitizen'].map({0:'No',1:'Yes'})
df_test['SeniorCitizen'] = df_test['SeniorCitizen'].map({0:'No',1:'Yes'})
df_Xtrain = df_train.drop(target_name,axis=1)
df_Xtest = df_test.drop(target_name,axis=1)
ser_ytrain = df_train[target_name].map({'No':0,'Yes':1})
ser_ytest = df_test[target_name].map({'No':0,'Yes':1})
ytrain = np.array(ser_ytrain).flatten()
ytest = np.array(ser_ytest).flatten()
index_name = 'customerID'
ser_train_ids = df_Xtrain.pop(index_name)
ser_test_ids = df_Xtest.pop(index_name)
df_Xtrain.head(2)
gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Female | No | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.40 |
1 | Male | No | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 |
cols_num = list(df_train.select_dtypes('number').columns)
cols_num
['tenure', 'MonthlyCharges', 'TotalCharges']
cols_cat = list(df_train.select_dtypes('object').columns)
# gender is no good predictor as seen in EDA
cols_exclude = ['customerID','gender','TotalCharges'] + [target_name]
cols_cat = [ i for i in cols_cat if i not in cols_exclude ] + ['SeniorCitizen']
print(cols_cat)
['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'SeniorCitizen']
cols_num = ['TotalCharges','tenure', 'MonthlyCharges']
cols_num_old = cols_num
cols_cat_old = cols_cat
def combine_two_features(dfx,A,B):
dfx = dfx.copy()
assert len(A) == len(B)
for a,b in zip(A,B):
dfx[a+'_'+b] = dfx[a] + '_' + dfx[b]
return dfx
combineA = ['Partner']
combineB = ['Dependents']
combineA = combineA + ['SeniorCitizen']*5
combineB = combineB + ['Dependents','Partner','Contract',
'TechSupport','PaymentMethod']
cols_cat_new = [f'{a}_{b}' for a,b in zip(combineA,combineB)]
cols_cat = list(set(cols_cat + cols_cat_new))
print(cols_cat_new)
# print(cols_cat)
df_Xtrain = combine_two_features(df_Xtrain,combineA,combineB)
df_Xtest = combine_two_features(df_Xtest,combineA,combineB)
['Partner_Dependents', 'SeniorCitizen_Dependents', 'SeniorCitizen_Partner', 'SeniorCitizen_Contract', 'SeniorCitizen_TechSupport', 'SeniorCitizen_PaymentMethod']
def create_groupby_features(dfx,cat,num,agg):
dfx = dfx.copy()
for c in cat:
for n in num:
for a in agg:
name = f"{c}_{n}_{a}"
dfx[name] = df_train.groupby(c)[n].transform(a)
return dfx
# Using more features gave me worse AUC.
# cols_grpcat = ['Contract','PaymentMethod']
# cols_grpnum = ['TotalCharges','MonthlyCharges']
# cols_grpagg = ['mean', 'max', 'min']
cols_grpcat = ['Contract']
cols_grpnum = ['TotalCharges']
cols_grpagg = ['mean']
cols_num_new = [f'{c}_{n}_{a}'
for c in cols_grpcat
for n in cols_grpnum
for a in cols_grpagg]
cols_num = list(set(cols_num + cols_num_new))
print(cols_num_new)
# print(cols_num)
df_Xtrain = create_groupby_features(df_Xtrain,cols_grpcat, cols_grpnum, cols_grpagg)
df_Xtest = create_groupby_features(df_Xtest,cols_grpcat, cols_grpnum, cols_grpagg)
['Contract_TotalCharges_mean']
df_Xtrain.head(2)
gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Partner_Dependents | SeniorCitizen_Dependents | SeniorCitizen_Partner | SeniorCitizen_Contract | SeniorCitizen_TechSupport | SeniorCitizen_PaymentMethod | Contract_TotalCharges_mean | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Female | No | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.40 | Yes_No | No_No | No_Yes | No_Two year | No_Yes | No_Credit card (automatic) | 3683.643192 |
1 | Male | No | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No_No | No_No | No_No | No_Month-to-month | No_Yes | No_Bank transfer (automatic) | 1370.923131 |
cols_drop = ['gender']
df_Xtrain = df_Xtrain.drop(cols_drop,axis=1)
df_Xtest = df_Xtest.drop(cols_drop,axis=1)
all_features = df_Xtrain.columns.tolist()
cols_cat_idx = [all_features.index(i)
for i in cols_cat]
# make sure no nans
df_Xtrain.isna().sum().sum(), df_Xtest.isna().sum().sum()
(0, 0)
df_Xtrain_full = df_Xtrain.copy()
ser_ytrain_full = ser_ytrain.copy()
ytrain_full = np.array(ser_ytrain_full).flatten()
# one hot encode
df_Xtrain_full = pd.get_dummies(df_Xtrain_full,columns=cols_cat)
df_Xtest = pd.get_dummies(df_Xtest,columns=cols_cat)
df_Xtrain_full.head()
tenure | MonthlyCharges | TotalCharges | Contract_TotalCharges_mean | SeniorCitizen_Partner_No_No | SeniorCitizen_Partner_No_Yes | SeniorCitizen_Partner_Yes_No | SeniorCitizen_Partner_Yes_Yes | SeniorCitizen_TechSupport_No_No | SeniorCitizen_TechSupport_No_No internet service | SeniorCitizen_TechSupport_No_Yes | SeniorCitizen_TechSupport_Yes_No | SeniorCitizen_TechSupport_Yes_No internet service | SeniorCitizen_TechSupport_Yes_Yes | Partner_No | Partner_Yes | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | InternetService_DSL | InternetService_Fiber optic | InternetService_No | Partner_Dependents_No_No | Partner_Dependents_No_Yes | Partner_Dependents_Yes_No | Partner_Dependents_Yes_Yes | MultipleLines_No | MultipleLines_No phone service | MultipleLines_Yes | TechSupport_No | TechSupport_No internet service | TechSupport_Yes | OnlineSecurity_No | OnlineSecurity_No internet service | OnlineSecurity_Yes | SeniorCitizen_PaymentMethod_No_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_No_Credit card (automatic) | SeniorCitizen_PaymentMethod_No_Electronic check | SeniorCitizen_PaymentMethod_No_Mailed check | SeniorCitizen_PaymentMethod_Yes_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_Yes_Credit card (automatic) | SeniorCitizen_PaymentMethod_Yes_Electronic check | SeniorCitizen_PaymentMethod_Yes_Mailed check | SeniorCitizen_Dependents_No_No | SeniorCitizen_Dependents_No_Yes | SeniorCitizen_Dependents_Yes_No | SeniorCitizen_Dependents_Yes_Yes | DeviceProtection_No | DeviceProtection_No internet service | DeviceProtection_Yes | StreamingMovies_No | StreamingMovies_No internet service | StreamingMovies_Yes | Dependents_No | Dependents_Yes | PaperlessBilling_No | PaperlessBilling_Yes | SeniorCitizen_Contract_No_Month-to-month | SeniorCitizen_Contract_No_One year | SeniorCitizen_Contract_No_Two year | SeniorCitizen_Contract_Yes_Month-to-month | SeniorCitizen_Contract_Yes_One year | SeniorCitizen_Contract_Yes_Two year | PhoneService_No | PhoneService_Yes | Contract_Month-to-month | Contract_One year | Contract_Two year | SeniorCitizen_No | SeniorCitizen_Yes | StreamingTV_No | StreamingTV_No internet service | StreamingTV_Yes | OnlineBackup_No | OnlineBackup_No internet service | OnlineBackup_Yes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 36 | 106.05 | 3834.40 | 3683.643192 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
1 | 10 | 62.25 | 612.95 | 1370.923131 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
2 | 25 | 19.15 | 477.60 | 1370.923131 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
3 | 7 | 20.00 | 137.60 | 1370.923131 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
4 | 24 | 20.30 | 459.95 | 1370.923131 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
# check if all nunique >= 2
df_Xtrain_full.apply(pd.Series.nunique).nsmallest(5)
SeniorCitizen_Partner_No_No 2 SeniorCitizen_Partner_No_Yes 2 SeniorCitizen_Partner_Yes_No 2 SeniorCitizen_Partner_Yes_Yes 2 SeniorCitizen_TechSupport_No_No 2 dtype: int64
# check if all nunique >= 2
df_Xtest.apply(pd.Series.nunique).nsmallest(5)
SeniorCitizen_Partner_No_No 2 SeniorCitizen_Partner_No_Yes 2 SeniorCitizen_Partner_Yes_No 2 SeniorCitizen_Partner_Yes_Yes 2 SeniorCitizen_TechSupport_No_No 2 dtype: int64
# check if all are numbers
df_Xtrain_full.sum().sum(), df_Xtest.sum().sum()
(26266765.14999999, 6655873.894557063)
# check for nans
df_Xtrain_full.isna().sum().sum(), df_Xtest.isna().sum().sum()
(0, 0)
from sklearn.model_selection import train_test_split
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
df_Xtrain_full, ser_ytrain_full,
test_size=0.2,
random_state=SEED,
stratify=ser_ytrain_full)
Xtrain_full = df_Xtrain_full.to_numpy()
Xtrain = df_Xtrain.to_numpy()
Xvalid = df_Xvalid.to_numpy()
ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()
print(f"df_train : {df_train.shape}\n")
print(f"df_Xtrain : {df_Xtrain.shape}")
print(f"ser_ytrain : {ser_ytrain.shape}\n")
print(f"df_Xvalid : {df_Xvalid.shape}")
print(f"ser_yvalid : {ser_yvalid.shape}\n")
print(f"df_test : {df_test.shape}")
print(f"ser_ytest : This does not exist.")
df_Xtrain.head(2)
df_train : (5634, 21) df_Xtrain : (4507, 77) ser_ytrain : (4507,) df_Xvalid : (1127, 77) ser_yvalid : (1127,) df_test : (1409, 21) ser_ytest : This does not exist.
tenure | MonthlyCharges | TotalCharges | Contract_TotalCharges_mean | SeniorCitizen_Partner_No_No | SeniorCitizen_Partner_No_Yes | SeniorCitizen_Partner_Yes_No | SeniorCitizen_Partner_Yes_Yes | SeniorCitizen_TechSupport_No_No | SeniorCitizen_TechSupport_No_No internet service | SeniorCitizen_TechSupport_No_Yes | SeniorCitizen_TechSupport_Yes_No | SeniorCitizen_TechSupport_Yes_No internet service | SeniorCitizen_TechSupport_Yes_Yes | Partner_No | Partner_Yes | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | InternetService_DSL | InternetService_Fiber optic | InternetService_No | Partner_Dependents_No_No | Partner_Dependents_No_Yes | Partner_Dependents_Yes_No | Partner_Dependents_Yes_Yes | MultipleLines_No | MultipleLines_No phone service | MultipleLines_Yes | TechSupport_No | TechSupport_No internet service | TechSupport_Yes | OnlineSecurity_No | OnlineSecurity_No internet service | OnlineSecurity_Yes | SeniorCitizen_PaymentMethod_No_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_No_Credit card (automatic) | SeniorCitizen_PaymentMethod_No_Electronic check | SeniorCitizen_PaymentMethod_No_Mailed check | SeniorCitizen_PaymentMethod_Yes_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_Yes_Credit card (automatic) | SeniorCitizen_PaymentMethod_Yes_Electronic check | SeniorCitizen_PaymentMethod_Yes_Mailed check | SeniorCitizen_Dependents_No_No | SeniorCitizen_Dependents_No_Yes | SeniorCitizen_Dependents_Yes_No | SeniorCitizen_Dependents_Yes_Yes | DeviceProtection_No | DeviceProtection_No internet service | DeviceProtection_Yes | StreamingMovies_No | StreamingMovies_No internet service | StreamingMovies_Yes | Dependents_No | Dependents_Yes | PaperlessBilling_No | PaperlessBilling_Yes | SeniorCitizen_Contract_No_Month-to-month | SeniorCitizen_Contract_No_One year | SeniorCitizen_Contract_No_Two year | SeniorCitizen_Contract_Yes_Month-to-month | SeniorCitizen_Contract_Yes_One year | SeniorCitizen_Contract_Yes_Two year | PhoneService_No | PhoneService_Yes | Contract_Month-to-month | Contract_One year | Contract_Two year | SeniorCitizen_No | SeniorCitizen_Yes | StreamingTV_No | StreamingTV_No internet service | StreamingTV_Yes | OnlineBackup_No | OnlineBackup_No internet service | OnlineBackup_Yes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4555 | 16 | 19.75 | 294.90 | 1370.923131 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
3379 | 72 | 64.70 | 4746.05 | 3683.643192 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
lgb.LGBMClassifier(
boosting_type = 'gbdt',
num_leaves = 31,
max_depth = -1,
learning_rate = 0.1,
n_estimators = 100,
subsample_for_bin = 200000,
objective = None,
class_weight = None,
min_split_gain = 0.0,
min_child_weight = 0.001,
min_child_samples = 20,
subsample = 1.0,
subsample_freq = 0,
colsample_bytree = 1.0,
reg_alpha = 0.0,
reg_lambda = 0.0,
random_state = None,
n_jobs = -1,
silent = True,
importance_type = 'split',
**kwargs,
)
model.fit(
sample_weight = None,
init_score = None,
eval_set = None,
eval_names = None,
eval_sample_weight = None,
eval_class_weight = None,
eval_init_score = None,
eval_metric = None,
early_stopping_rounds = None,
verbose = True,
feature_name = 'auto',
categorical_feature = 'auto',
callbacks = None
)
model_name = 'lightgbm'
hpo_name = 'optuna'
from lightgbm import LGBMClassifier
model = LGBMClassifier(random_state=SEED,n_estimators=1000)
model.fit(df_Xtrain,ytrain,
eval_set=(df_Xvalid, ser_yvalid),
early_stopping_rounds=20,
verbose=0,
)
ypreds = model.predict(df_Xtest)
yprobs2d = model.predict_proba(df_Xtest)
profit = get_profit(ytest,ypreds)
print(f'test profit = ${profit:,d}')
model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=False)
test profit = $-21,200 precision recall f1-score support 0 0.78 0.93 0.85 1035 1 0.59 0.27 0.37 374 accuracy 0.76 1409 macro avg 0.69 0.60 0.61 1409 weighted avg 0.73 0.76 0.72 1409 [[965 70] [273 101]]
Accuracy | Precision | Recall | F1-score | AUC | |
---|---|---|---|---|---|
lightgbm | 0.7566 | 0.5906 | 0.2701 | 0.3706 | 0.6012 |
show_methods(model)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | best_iteration_ | fit | n_estimators | reg_alpha |
1 | best_score_ | get_params | n_features_ | reg_lambda |
2 | booster_ | importance_type | n_jobs | score |
3 | boosting_type | learning_rate | num_leaves | set_params |
4 | class_weight | max_depth | objective | silent |
5 | classes_ | min_child_samples | objective_ | subsample |
6 | colsample_bytree | min_child_weight | predict | subsample_for_bin |
7 | evals_result_ | min_split_gain | predict_proba | subsample_freq |
8 | feature_importances_ | n_classes_ | random_state |
e = model.evals_result_
out = """
{'valid_0': OrderedDict([('binary_logloss',
[0.5495546455956963,
0.5287290543184567,
...]
"""
k0 = list(e.keys())[0]
k1 = list(e[k0].keys())[0]
print(e[k0][k1][:2])
#n_used = len(e['valid_0']['binary_logloss']) # only these trees are used
n_used = len(e[k0][k1])
print('early stop used: ', n_used)
[0.5495546455956963, 0.5287290543184567] early stop used: 48
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING) # use INFO to see progress
from optuna.pruners import SuccessiveHalvingPruner
show_methods(optuna)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | Any | delete_study | load_study | structs |
1 | Study | distributions | logging | study |
2 | TYPE_CHECKING | exceptions | multi_objective | trial |
3 | Trial | get_all_study_summaries | progress_bar | type_checking |
4 | TrialPruned | importance | pruners | types |
5 | create_study | importlib | samplers | version |
6 | create_trial | integration | storages | visualization |
7 | dashboard |
show_methods(optuna.trial.Trial)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | datetime_start | report | suggest_categorical | suggest_loguniform |
1 | distributions | set_system_attr | suggest_discrete_uniform | suggest_uniform |
2 | number | set_user_attr | suggest_float | system_attrs |
3 | params | should_prune | suggest_int | user_attrs |
params_optuna_study = dict(
direction='maximize',
sampler=optuna.samplers.TPESampler(seed=SEED),
study_name=f'{model_name}_{hpo_name}',
storage='sqlite:///' + model_name + f'_{hpo_name}_churn.db',
load_if_exists=True,
pruner=optuna.pruners.SuccessiveHalvingPruner(min_resource=100)
)
study = optuna.create_study(**params_optuna_study)
n_studies = len(study.trials)
print(f'Number of finished trials: {n_studies}')
Number of finished trials: 0
path_early_stop_dict = f'../artifacts/{model_name}_{hpo_name}_early_stop_dict.joblib'
if n_studies == 0:
early_stop_dict = {}
else:
early_stop_dict = joblib.load(path_early_stop_dict)
print('last study early stopping rounds\n'+'='*35)
print(early_stop_dict[n_studies-1])
def objective(trial): # this is slow but more stable.
global early_stop_dict
params_lgb_optuna = {
'n_estimators': trial.suggest_int('n_estimators', 100,5000),
'learning_rate': trial.suggest_loguniform('learning_rate', 0.01,1.0),
'max_depth': trial.suggest_int('max_depth', 3, 16),
'scale_pos_weight': trial.suggest_categorical('scale_pos_weight', [2,3,4,5,7,8,9,10]),
'reg_alpha' : trial.suggest_uniform('reg_alpha', 0.01, 1),
'reg_lambda': trial.suggest_uniform('reg_lambda', 0.01, 1),
'subsample' : trial.suggest_uniform('subsample', 0.5, 1),
'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1),
}
# skf is more time-consuming but more stable.
skf = StratifiedKFold(n_splits=5,random_state=SEED,shuffle=True)
scores = []
lst_early_rounds = []
for idx_tr, idx_vd in skf.split(df_Xtrain_full, ser_ytrain_full):
Xtr,Xvd = df_Xtrain_full.iloc[idx_tr], df_Xtrain_full.iloc[idx_vd]
ytr,yvd = ser_ytrain_full[idx_tr], ser_ytrain_full.iloc[idx_vd]
model = lgb.LGBMClassifier(random_state=SEED,**params_lgb_optuna)
model.fit(Xtr, ytr,
eval_set=[(Xvd, yvd)],
verbose=0,
early_stopping_rounds=100)
# save early stopping dictionary
e = model.evals_result_
k0 = list(e.keys())[0]
k1 = list(e[k0].keys())[0]
n_used = len(e[k0][k1]) # only these trees are used
lst_early_rounds.append(n_used)
ypreds = model.predict(Xvd)
ypreds = np.rint(ypreds)
#score_ = skmetrics.roc_auc_score(ser_yvalid.to_numpy().ravel(),ypreds)
score_ = get_profit(yvd.to_numpy().ravel(),ypreds)
scores.append(score_)
#==============================================================
score = np.mean(scores) # sometimes we can also use np.max
# counter to update early stopping dict
early_stop_dict[objective.i] = lst_early_rounds
joblib.dump(early_stop_dict, path_early_stop_dict)
objective.i +=1
return score
hasattr(objective,'i')
False
%%time
# NOTE: there is inherent non-determinism in optuna hyperparameter selection
# we may not get the same hyperparameters when run twice.
if not hasattr(objective,'i'):
objective.i = len(study.trials)
N_TRIALS = 1 # make it large
study.optimize(objective, n_trials=N_TRIALS,timeout=600)
print(f'Number of finished trials: {len(study.trials)}')
params_best = study.best_trial.params
model = LGBMClassifier(**params_best,verbose=0,random_state=SEED)
model.fit(df_Xtrain_full,ytrain_full)
ypreds = model.predict(df_Xtest)
profit = get_profit(ytest,ypreds)
print(f"profit = ${profit:,d}")
Number of finished trials: 1 profit = $18,100 CPU times: user 57 s, sys: 2.7 s, total: 59.7 s Wall time: 24.9 s
show_methods(study)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | add_trial | enqueue_trial | set_system_attr | system_attrs |
1 | best_params | get_trials | set_user_attr | trials |
2 | best_trial | optimize | stop | trials_dataframe |
3 | best_value | pruner | study_name | user_attrs |
4 | direction | sampler |
# study.get_trials()
# FrozenTrial starting from number=0 ,1, ...
study.best_trial
FrozenTrial(number=0, value=44220.0, datetime_start=datetime.datetime(2020, 12, 30, 13, 58, 53, 719337), datetime_complete=datetime.datetime(2020, 12, 30, 13, 58, 57, 943567), params={'colsample_bytree': 0.7263699514296151, 'learning_rate': 0.112918456991973, 'max_depth': 13, 'n_estimators': 4027, 'reg_alpha': 0.1651439692851411, 'reg_lambda': 0.1946025113317489, 'scale_pos_weight': 9, 'subsample': 0.6050538712212918}, distributions={'colsample_bytree': UniformDistribution(high=1, low=0.5), 'learning_rate': LogUniformDistribution(high=1.0, low=0.01), 'max_depth': IntUniformDistribution(high=16, low=3, step=1), 'n_estimators': IntUniformDistribution(high=5000, low=100, step=1), 'reg_alpha': UniformDistribution(high=1, low=0.01), 'reg_lambda': UniformDistribution(high=1, low=0.01), 'scale_pos_weight': CategoricalDistribution(choices=(2, 3, 4, 5, 7, 8, 9, 10)), 'subsample': UniformDistribution(high=1, low=0.5)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=1, state=TrialState.COMPLETE)
lst_best_n_estimators = early_stop_dict[study.best_trial.number]
best_n_estimators = np.max(early_stop_dict[study.best_trial.number])
lst_best_n_estimators, best_n_estimators
([104, 104, 104, 104, 104], 104)
%%time
# Resume from last time
N_TRIALS = 100 # make it large
study = optuna.create_study(**params_optuna_study)
study.optimize(objective,
n_trials=N_TRIALS,
timeout=60*10, # 60 means 1 minutes
show_progress_bar=True)
params_best = study.best_trial.params
print(f'Number of finished trials: {len(study.trials)}')
print(params_best)
print()
model = LGBMClassifier(**params_best,verbose=0,random_state=SEED)
model.fit(df_Xtrain,ytrain)
vdpreds = model.predict(df_Xvalid)
profit = get_profit(yvalid,vdpreds)
print(f"validation profit = ${profit:,d}")
# test
model = LGBMClassifier(**params_best,verbose=0,random_state=SEED)
model.fit(df_Xtrain_full,ytrain_full)
ypreds = model.predict(df_Xtest)
profit = get_profit(ytest,ypreds)
print(f"test profit = ${profit:,d}")
out = """
Number of finished trials: 101
{'colsample_bytree': 0.8340780751883098, 'learning_rate': 0.6075408640727997, 'max_depth': 10, 'n_estimators': 3553, 'reg_alpha': 0.6850804855842425, 'reg_lambda': 0.7721552432132202, 'scale_pos_weight': 7, 'subsample': 0.9578772811050695}
validation profit = $29,400
test profit = $22,200
-------------------------------------------------------------------------
Number of finished trials: 202
{'colsample_bytree': 0.8340780751883098, 'learning_rate': 0.6075408640727997, 'max_depth': 10, 'n_estimators': 3553, 'reg_alpha': 0.6850804855842425, 'reg_lambda': 0.7721552432132202, 'scale_pos_weight': 7, 'subsample': 0.9578772811050695}
validation profit = $32,400
test profit = $18,200
When I run more iterations, validation improved but test decreased.
We should never look at test set because we don't have ytest labels.
We should check standard deviation of valid splits, and do more iterations.
"""
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/optuna/progress_bar.py:46: ExperimentalWarning: Progress bar is experimental (supported from v1.2.0). The interface can change in the future.
Number of finished trials: 101 {'colsample_bytree': 0.7313627571689273, 'learning_rate': 0.5060823593539764, 'max_depth': 15, 'n_estimators': 2914, 'reg_alpha': 0.06466622093877408, 'reg_lambda': 0.32005752267806997, 'scale_pos_weight': 9, 'subsample': 0.9034618429683745} validation profit = $22,100 test profit = $17,500 CPU times: user 16min 7s, sys: 50 s, total: 16min 57s Wall time: 8min 8s
show_methods(optuna.visualization)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | is_available | plot_edf | plot_optimization_history | plot_param_importances |
1 | plot_contour | plot_intermediate_values | plot_parallel_coordinate | plot_slice |
optuna.visualization.plot_optimization_history(study)
%%time
# optuna.visualization.plot_param_importances(study)
CPU times: user 3 µs, sys: 1 µs, total: 4 µs Wall time: 5.96 µs
fig = optuna.visualization.plot_parallel_coordinate(study)
fig['layout']['width'] = 800
fig.show()
optuna.visualization.plot_slice(study,
params=['learning_rate','max_depth'])
# optuna.visualization.plot_contour(study,params=['learning_rate','max_depth'])
profit = get_profit(ytest,ypreds)
print(f"test profit = ${profit:,d}")
model_eval_bin(f'{model_name}+{hpo_name}',ytest,ypreds,yprobs2d,show_plots=True)
test profit = $17,500 precision recall f1-score support 0 0.82 0.84 0.83 1035 1 0.53 0.48 0.50 374 accuracy 0.75 1409 macro avg 0.67 0.66 0.67 1409 weighted avg 0.74 0.75 0.74 1409 [[872 163] [193 181]]
Accuracy | Precision | Recall | F1-score | AUC | |
---|---|---|---|---|---|
lightgbm+optuna | 0.7473 | 0.5262 | 0.4840 | 0.5042 | 0.6632 |
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/utils/deprecation.py:86: FutureWarning: Function plot_roc_curve is deprecated; This will be removed in v0.5.0. Please use scikitplot.metrics.plot_roc instead.
import shap
shap.initjs()
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(df_Xtest)
Setting feature_perturbation = "tree_path_dependent" because no background data was given. LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
df_Xtest.head(2)
tenure | MonthlyCharges | TotalCharges | Contract_TotalCharges_mean | SeniorCitizen_Partner_No_No | SeniorCitizen_Partner_No_Yes | SeniorCitizen_Partner_Yes_No | SeniorCitizen_Partner_Yes_Yes | SeniorCitizen_TechSupport_No_No | SeniorCitizen_TechSupport_No_No internet service | SeniorCitizen_TechSupport_No_Yes | SeniorCitizen_TechSupport_Yes_No | SeniorCitizen_TechSupport_Yes_No internet service | SeniorCitizen_TechSupport_Yes_Yes | Partner_No | Partner_Yes | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | InternetService_DSL | InternetService_Fiber optic | InternetService_No | Partner_Dependents_No_No | Partner_Dependents_No_Yes | Partner_Dependents_Yes_No | Partner_Dependents_Yes_Yes | MultipleLines_No | MultipleLines_No phone service | MultipleLines_Yes | TechSupport_No | TechSupport_No internet service | TechSupport_Yes | OnlineSecurity_No | OnlineSecurity_No internet service | OnlineSecurity_Yes | SeniorCitizen_PaymentMethod_No_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_No_Credit card (automatic) | SeniorCitizen_PaymentMethod_No_Electronic check | SeniorCitizen_PaymentMethod_No_Mailed check | SeniorCitizen_PaymentMethod_Yes_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_Yes_Credit card (automatic) | SeniorCitizen_PaymentMethod_Yes_Electronic check | SeniorCitizen_PaymentMethod_Yes_Mailed check | SeniorCitizen_Dependents_No_No | SeniorCitizen_Dependents_No_Yes | SeniorCitizen_Dependents_Yes_No | SeniorCitizen_Dependents_Yes_Yes | DeviceProtection_No | DeviceProtection_No internet service | DeviceProtection_Yes | StreamingMovies_No | StreamingMovies_No internet service | StreamingMovies_Yes | Dependents_No | Dependents_Yes | PaperlessBilling_No | PaperlessBilling_Yes | SeniorCitizen_Contract_No_Month-to-month | SeniorCitizen_Contract_No_One year | SeniorCitizen_Contract_No_Two year | SeniorCitizen_Contract_Yes_Month-to-month | SeniorCitizen_Contract_Yes_One year | SeniorCitizen_Contract_Yes_Two year | PhoneService_No | PhoneService_Yes | Contract_Month-to-month | Contract_One year | Contract_Two year | SeniorCitizen_No | SeniorCitizen_Yes | StreamingTV_No | StreamingTV_No internet service | StreamingTV_Yes | OnlineBackup_No | OnlineBackup_No internet service | OnlineBackup_Yes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 48.6 | 48.6 | 3683.643192 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
1 | 56 | 99.9 | 5706.3 | 1370.923131 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
# Look only first row of test data
# use matplotlib=True to avoid Javascript
idx = 0
shap.force_plot(explainer.expected_value,
shap_values[idx,:],
df_Xtest.iloc[idx,:],
matplotlib=False,
text_rotation=90)
# for this row, the predicted label is ...
# red features makes it higher
# blue features makes it smaller.
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-62-0f954fe3c3ee> in <module> 3 idx = 0 4 shap.force_plot(explainer.expected_value, ----> 5 shap_values[idx,:], 6 df_Xtest.iloc[idx,:], 7 matplotlib=False, TypeError: list indices must be integers or slices, not tuple
shap.summary_plot(shap_values, df_Xtest)
shap.summary_plot(shap_values, df_Xtest, plot_type='bar')
shap.dependence_plot(ind='TotalCharges', interaction_index='tenure',
shap_values=shap_values,
features=df_Xtest,
display_features=df_Xtest)
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))