References
import time
time_start_notebook = time.time()
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
# usual imports
!pip install watermark
!pip install scikit-plot
# HPO
!git clone https://github.com/thuijskens/scikit-hyperband.git
sys.path.append('scikit-hyperband/hyperband')
# update modules
!pip uninstall xgboost
!pip install -U xgboost
print('Environment: Google Colab')
sys.path.append('/Users/poudel/Dropbox/a00_Resources/hyperband')
from hyperband_search import HyperbandSearchCV
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
import joblib
from tqdm import tqdm_notebook as tqdm
import plotly_express as px
# modelling
from sklearn.preprocessing import OneHotEncoder
import imblearn
from imblearn.over_sampling import SMOTE
import sklearn.metrics as skmetrics
# pipeline
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
# boosting
from lightgbm import LGBMClassifier
# settings
sns.set()
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%matplotlib inline
%load_ext watermark
%watermark -iv
joblib 0.17.0 imblearn 0.7.0 json 2.0.9 numpy 1.19.4 pandas 1.1.4 seaborn 0.11.0 plotly_express 0.4.1 autopep8 1.5.2
def show_methods(obj, ncols=4,contains=None):
lst = [i for i in dir(obj) if i[0]!='_' ]
if contains is not None:
lst = [i for i in lst if contains in i]
df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
return df
def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
import sklearn.metrics as skmetrics
import scikitplot.metrics as skpmetrics
import os
acc = skmetrics.accuracy_score(ytest,ypreds)
precision = skmetrics.precision_score(ytest,ypreds)
recall = skmetrics.recall_score(ytest,ypreds)
f1 = skmetrics.f1_score(ytest,ypreds)
auc = skmetrics.roc_auc_score(ytest,ypreds)
print(skmetrics.classification_report(ytest,ypreds))
print(skmetrics.confusion_matrix(ytest,ypreds))
df_res = pd.DataFrame({'Accuracy':[acc],
'Precision': [precision],
'Recall': [recall],
'F1-score': [f1],
'AUC': [auc]},index=[model_name])
display(df_res.style.format("{:.4f}"))
if not os.path.isdir('../outputs'):
os.makedirs('../outputs')
o = '.' if ENV_COLAB else '../outputs/'
df_res.to_csv(o+f'model_{model_name}.csv',index=True)
if show_plots:
skpmetrics.plot_precision_recall(ytest,yprobs2d) # more focus on minority
skpmetrics.plot_roc_curve(ytest,yprobs2d) # equal focus on both groups
skpmetrics.plot_confusion_matrix(ytest,ypreds)
class FrequencyEncoder:
def __init__(self, cols):
self.cols = cols
self.counts_dict = None
def fit(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
counts_dict = {}
for col in self.cols:
values, counts = np.unique(X[col], return_counts=True)
counts_dict[col] = dict(zip(values, counts))
self.counts_dict = counts_dict
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
counts_dict_test = {}
res = []
for col in self.cols:
values, counts = np.unique(X[col], return_counts=True)
counts_dict_test[col] = dict(zip(values, counts))
# if value is in "train" keys - replace "test" counts with "train" counts
for k in [key for key in counts_dict_test[col].keys() if key in self.counts_dict[col].keys()]:
counts_dict_test[col][k] = self.counts_dict[col][k]
res.append(X[col].map(counts_dict_test[col]).values.reshape(-1, 1))
res = np.hstack(res)
X[self.cols] = res
return X
def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
self.fit(X, y)
X = self.transform(X)
return X
def get_profit(y_true, y_pred):
tn, fp, fn, tp = skmetrics.confusion_matrix(y_true,y_pred).ravel()
profit = 400*tp - 200*fn - 100*fp
return profit
path_data_train = '../data/raw/train.csv'
path_data_test = '../data/raw/test.csv'
if ENV_COLAB:
path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv'
df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)
print(df_train.shape)
print(df_test.shape)
df_train.head(2).append(df_train.tail(2))
(5634, 21) (1409, 21)
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1621-YNCJH | Female | 0 | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.4 | No |
1 | 7143-BQIBA | Male | 0 | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No |
5632 | 0862-PRCBS | Female | 0 | Yes | Yes | 68 | Yes | Yes | Fiber optic | No | Yes | No | Yes | Yes | Yes | Two year | Yes | Credit card (automatic) | 103.75 | 7039.45 | No |
5633 | 4656-CAURT | Male | 0 | No | No | 69 | Yes | Yes | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Bank transfer (automatic) | 23.95 | 1713.1 | No |
target_name = 'Churn'
import plotly_express as px
px.histogram(df_train, x=target_name,height=300,width=300)
px.histogram(df_train, x='gender', color=target_name,height=300,width=300)
df_train['TotalCharges'] = pd.to_numeric(df_train['TotalCharges'],errors='coerce').fillna(0)
df_test['TotalCharges'] = pd.to_numeric(df_test['TotalCharges'],errors='coerce').fillna(0)
df_train['SeniorCitizen'] = df_train['SeniorCitizen'].map({0:'No',1:'Yes'})
df_test['SeniorCitizen'] = df_test['SeniorCitizen'].map({0:'No',1:'Yes'})
df_Xtrain = df_train.drop(target_name,axis=1)
df_Xtest = df_test.drop(target_name,axis=1)
ser_ytrain = df_train[target_name].map({'No':0,'Yes':1})
ser_ytest = df_test[target_name].map({'No':0,'Yes':1})
ytrain = np.array(ser_ytrain).flatten()
ytest = np.array(ser_ytest).flatten()
df_Xtrain.head(2)
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1621-YNCJH | Female | No | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.40 |
1 | 7143-BQIBA | Male | No | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 |
cols_num = list(df_train.select_dtypes('number').columns)
cols_num
['tenure', 'MonthlyCharges', 'TotalCharges']
cols_cat = list(df_train.select_dtypes('object').columns)
# gender is no good predictor as seen in EDA
cols_exclude = ['customerID','gender','TotalCharges'] + [target_name]
cols_cat = [ i for i in cols_cat if i not in cols_exclude ] + ['SeniorCitizen']
print(cols_cat)
['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'SeniorCitizen']
cols_num = ['TotalCharges','tenure', 'MonthlyCharges']
cols_num_old = cols_num
cols_cat_old = cols_cat
def combine_two_features(dfx,A,B):
dfx = dfx.copy()
assert len(A) == len(B)
for a,b in zip(A,B):
dfx[a+'_'+b] = dfx[a] + '_' + dfx[b]
return dfx
combineA = ['Partner']
combineB = ['Dependents']
combineA = combineA + ['SeniorCitizen']*5
combineB = combineB + ['Dependents','Partner','Contract',
'TechSupport','PaymentMethod']
cols_cat_new = [f'{a}_{b}' for a,b in zip(combineA,combineB)]
cols_cat = list(set(cols_cat + cols_cat_new))
print(cols_cat_new)
# print(cols_cat)
df_Xtrain = combine_two_features(df_Xtrain,combineA,combineB)
df_Xtest = combine_two_features(df_Xtest,combineA,combineB)
['Partner_Dependents', 'SeniorCitizen_Dependents', 'SeniorCitizen_Partner', 'SeniorCitizen_Contract', 'SeniorCitizen_TechSupport', 'SeniorCitizen_PaymentMethod']
def create_groupby_features(dfx,cat,num,agg):
dfx = dfx.copy()
for c in cat:
for n in num:
for a in agg:
name = f"{c}_{n}_{a}"
dfx[name] = df_train.groupby(c)[n].transform(a)
return dfx
# Using more features gave me worse AUC.
# cols_grpcat = ['Contract','PaymentMethod']
# cols_grpnum = ['TotalCharges','MonthlyCharges']
# cols_grpagg = ['mean', 'max', 'min']
cols_grpcat = ['Contract']
cols_grpnum = ['TotalCharges']
cols_grpagg = ['mean']
cols_num_new = [f'{c}_{n}_{a}'
for c in cols_grpcat
for n in cols_grpnum
for a in cols_grpagg]
cols_num = list(set(cols_num + cols_num_new))
print(cols_num_new)
# print(cols_num)
df_Xtrain = create_groupby_features(df_Xtrain,cols_grpcat, cols_grpnum, cols_grpagg)
df_Xtest = create_groupby_features(df_Xtest,cols_grpcat, cols_grpnum, cols_grpagg)
['Contract_TotalCharges_mean']
ColumnTransformer(
transformers,
*,
remainder='drop',
sparse_threshold=0.3,
n_jobs=None,
transformer_weights=None,
verbose=False,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PowerTransformer
pipe_num = Pipeline([
# standard scaling
# ('scaler', StandardScaler())
# we can use yeo-johnson scaling for not-normal data instead of standard
('yeo_johnson', PowerTransformer())
])
pipe_cat = Pipeline([
('ohe', OneHotEncoder(handle_unknown='ignore'))
])
pipe_cat_freq = Pipeline([
('freq_enc', FrequencyEncoder(cols=cols_cat)),
])
preprocessor = ColumnTransformer(
transformers=[
('num', pipe_num, cols_num),
('cat', pipe_cat, cols_cat)
],
remainder='drop'
)
from sklearn import set_config
set_config(display='diagram')
preprocessor
ColumnTransformer(transformers=[('num', Pipeline(steps=[('yeo_johnson', PowerTransformer())]), ['Contract_TotalCharges_mean', 'tenure', 'MonthlyCharges', 'TotalCharges']), ('cat', Pipeline(steps=[('ohe', OneHotEncoder(handle_unknown='ignore'))]), ['PaperlessBilling', 'SeniorCitizen_Contract', 'MultipleLines', 'SeniorCitizen', 'PhoneService', 'DeviceProtection', 'OnlineSecurity', 'StreamingMovies', 'SeniorCitizen_PaymentMethod', 'OnlineBackup', 'SeniorCitizen_Partner', 'Dependents', 'Partner_Dependents', 'Partner', 'StreamingTV', 'SeniorCitizen_Dependents', 'Contract', 'SeniorCitizen_TechSupport', 'TechSupport', 'InternetService', 'PaymentMethod'])])
['Contract_TotalCharges_mean', 'tenure', 'MonthlyCharges', 'TotalCharges']
PowerTransformer()
['PaperlessBilling', 'SeniorCitizen_Contract', 'MultipleLines', 'SeniorCitizen', 'PhoneService', 'DeviceProtection', 'OnlineSecurity', 'StreamingMovies', 'SeniorCitizen_PaymentMethod', 'OnlineBackup', 'SeniorCitizen_Partner', 'Dependents', 'Partner_Dependents', 'Partner', 'StreamingTV', 'SeniorCitizen_Dependents', 'Contract', 'SeniorCitizen_TechSupport', 'TechSupport', 'InternetService', 'PaymentMethod']
OneHotEncoder(handle_unknown='ignore')
from sklearn import set_config
set_config(display='text')
print(df_Xtrain.shape, df_Xtest.shape)
df_Xtrain.head(2)
(5634, 27) (1409, 27)
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Partner_Dependents | SeniorCitizen_Dependents | SeniorCitizen_Partner | SeniorCitizen_Contract | SeniorCitizen_TechSupport | SeniorCitizen_PaymentMethod | Contract_TotalCharges_mean | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1621-YNCJH | Female | No | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.40 | Yes_No | No_No | No_Yes | No_Two year | No_Yes | No_Credit card (automatic) | 3683.643192 |
1 | 7143-BQIBA | Male | No | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No_No | No_No | No_No | No_Month-to-month | No_Yes | No_Bank transfer (automatic) | 1370.923131 |
preprocessor.fit(df_Xtrain)
ColumnTransformer(transformers=[('num', Pipeline(steps=[('yeo_johnson', PowerTransformer())]), ['Contract_TotalCharges_mean', 'tenure', 'MonthlyCharges', 'TotalCharges']), ('cat', Pipeline(steps=[('ohe', OneHotEncoder(handle_unknown='ignore'))]), ['PaperlessBilling', 'SeniorCitizen_Contract', 'MultipleLines', 'SeniorCitizen', 'PhoneService', 'DeviceProtection', 'OnlineSecurity', 'StreamingMovies', 'SeniorCitizen_PaymentMethod', 'OnlineBackup', 'SeniorCitizen_Partner', 'Dependents', 'Partner_Dependents', 'Partner', 'StreamingTV', 'SeniorCitizen_Dependents', 'Contract', 'SeniorCitizen_TechSupport', 'TechSupport', 'InternetService', 'PaymentMethod'])])
Xtrain = preprocessor.transform(df_Xtrain)
Xtest = preprocessor.transform(df_Xtest)
Xtrain.shape, Xtest.shape
((5634, 77), (1409, 77))
lgb.LGBMClassifier(
boosting_type = 'gbdt',
num_leaves = 31,
max_depth = -1,
learning_rate = 0.1,
n_estimators = 100,
subsample_for_bin = 200000,
objective = None,
class_weight = None,
min_split_gain = 0.0,
min_child_weight = 0.001,
min_child_samples = 20,
subsample = 1.0,
subsample_freq = 0,
colsample_bytree = 1.0,
reg_alpha = 0.0,
reg_lambda = 0.0,
random_state = None,
n_jobs = -1,
silent = True,
importance_type = 'split',
**kwargs,
)
model_name = 'lightgbm'
hpo_name = 'hyperband'
from lightgbm import LGBMClassifier
model = LGBMClassifier(random_state=SEED)
model.fit(Xtrain,ytrain)
ypreds = model.predict(Xtest)
yprobs2d = model.predict_proba(Xtest)
model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=False)
profit = get_profit(ytest,ypreds)
print(f"test profit = ${profit:,d}")
precision recall f1-score support 0 0.78 0.92 0.85 1035 1 0.57 0.28 0.37 374 accuracy 0.75 1409 macro avg 0.67 0.60 0.61 1409 weighted avg 0.72 0.75 0.72 1409 [[956 79] [271 103]]
Accuracy | Precision | Recall | F1-score | AUC | |
---|---|---|---|---|---|
lightgbm | 0.7516 | 0.5659 | 0.2754 | 0.3705 | 0.5995 |
test profit = $-20,900
%%time
import scipy.stats as stats
from sklearn import metrics
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
import warnings
from sklearn.exceptions import ConvergenceWarning
from scipy.optimize.linesearch import LineSearchWarning
warnings.simplefilter('ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)
warnings.simplefilter('ignore', category=LineSearchWarning)
# Define our model
params_fixed = dict(boosting_type = 'gbdt',random_state= SEED,n_jobs=1)
params_hyp = {
'max_depth' : stats.randint(3,12),
'learning_rate' : stats.loguniform(0.01, 1.0),
'n_estimators' : stats.randint(100, 1000),
'subsample' : [0.6, 0.7, 0.8, 0.9, 1.0],
'reg_alpha' : stats.loguniform(0.01, 1.0),
'reg_lambda' : stats.loguniform(0.01, 1.0),
'scale_pos_weight' : [1,2,3,4,5]
}
model = LGBMClassifier(**params_fixed)
# scoring ='roc_auc'
scoring = metrics.make_scorer(get_profit,greater_is_better=True)
# Perform Hyperparameter Tuning
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=SEED)
grid = HyperbandSearchCV(model,params_hyp,
resource_param = 'n_estimators',
min_iter = 100,
max_iter = 5000, # use 1k or 2k
cv = cv,
scoring = scoring,
refit = True,
verbose = 0,
random_state = SEED
)
# grid.fit(Xtrain, ytrain)
# print('Best parameters: ', grid.best_params_)
# params_best = grid.best_params_
# params = params_fixed
# params.update(params_best)
# print(params)
out = """
this gives profit = $51,300
params_best = {'boosting_type': 'gbdt', 'random_state': 100,
'n_jobs': 1, 'learning_rate': 0.026206211651810026,
'max_depth': 3, 'n_estimators': 333,
'reg_alpha': 0.016472282465672092,
'reg_lambda': 0.027503944643483897,
'scale_pos_weight': 4, 'subsample': 1.0}
--------------------------------
Best parameters:
Wall time: 29min 45s
params_best = {'boosting_type': 'gbdt', 'random_state': 100,
'n_jobs': 1,'learning_rate': 0.026206211651810026,
'max_depth': 3,'n_estimators': 185,
'reg_alpha': 0.016472282465672092,
'reg_lambda': 0.027503944643483897,
'scale_pos_weight': 4,
'subsample': 1.0}
this gives $48k
"""
Best parameters: {'learning_rate': 0.026206211651810026, 'max_depth': 3, 'n_estimators': 185, 'reg_alpha': 0.016472282465672092, 'reg_lambda': 0.027503944643483897, 'scale_pos_weight': 4, 'subsample': 1.0} {'boosting_type': 'gbdt', 'random_state': 100, 'n_jobs': 1, 'learning_rate': 0.026206211651810026, 'max_depth': 3, 'n_estimators': 185, 'reg_alpha': 0.016472282465672092, 'reg_lambda': 0.027503944643483897, 'scale_pos_weight': 4, 'subsample': 1.0} CPU times: user 26min 55s, sys: 6.89 s, total: 27min 2s Wall time: 29min 45s
params_best = {'boosting_type': 'gbdt', 'random_state': 100,
'n_jobs': 1, 'learning_rate': 0.026206211651810026,
'max_depth': 3, 'n_estimators': 333,
'reg_alpha': 0.016472282465672092,
'reg_lambda': 0.027503944643483897,
'scale_pos_weight': 4, 'subsample': 1.0}
model = LGBMClassifier(**params_best)
model.fit(Xtrain,ytrain)
ypreds = model.predict(Xtest)
yprobs2d = model.predict_proba(Xtest)
model_eval_bin(f'{model_name}+{hpo_name}',ytest,ypreds,yprobs2d,show_plots=False)
profit = get_profit(ytest,ypreds)
print(f"test profit = ${profit:,d}")
precision recall f1-score support 0 0.87 0.71 0.78 1035 1 0.47 0.70 0.56 374 accuracy 0.71 1409 macro avg 0.67 0.70 0.67 1409 weighted avg 0.76 0.71 0.72 1409 [[736 299] [114 260]]
Accuracy | Precision | Recall | F1-score | AUC | |
---|---|---|---|---|---|
lightgbm+hyperband | 0.7069 | 0.4651 | 0.6952 | 0.5573 | 0.7031 |
test profit = $51,300
model_eval_bin(f"{model_name}+{hpo_name}",ytest,ypreds,yprobs2d,show_plots=True)
profit = get_profit(ytest,ypreds)
print(f"test profit = ${profit:,d}")
precision recall f1-score support 0 0.87 0.71 0.78 1035 1 0.47 0.70 0.56 374 accuracy 0.71 1409 macro avg 0.67 0.70 0.67 1409 weighted avg 0.76 0.71 0.72 1409 [[736 299] [114 260]]
Accuracy | Precision | Recall | F1-score | AUC | |
---|---|---|---|---|---|
lightgbm+hyperband | 0.7069 | 0.4651 | 0.6952 | 0.5573 | 0.7031 |
test profit = $51,300
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 0 min 5 secs