import time
time_start_notebook = time.time()
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    # usual imports
    !pip install watermark
    !pip install scikit-plot
    !pip install catboost
    # HPO
    !git clone https://github.com/thuijskens/scikit-hyperband.git
    sys.path.append('scikit-hyperband/hyperband')
    print('Environment: Google Colab')
sys.path.append("/Users/poudel/Dropbox/a00_Resources/hyperband")
try:
    from search import HyperbandSearchCV
    print('File found: search.py')
except:
    print('File not found: search.py')
try:
    from hyperband_search import HyperbandSearchCV
    print('File found: hyperband_search.py')
except:
    print('File not found: hyperband_search.py')
File not found: search.py
File found: hyperband_search.py
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
import joblib
from tqdm import tqdm, trange
import plotly_express as px
# modelling
import sklearn.metrics as skmetrics
from sklearn.model_selection import StratifiedKFold
# boosting
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
# settings
sns.set()
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%matplotlib inline
%load_ext watermark
%watermark -iv
joblib          0.17.0
numpy           1.19.4
pandas          1.1.4
plotly_express  0.4.1
seaborn         0.11.0
lightgbm        2.3.1
catboost        0.23.2
autopep8        1.5.2
xgboost         1.2.0
json            2.0.9
def show_methods(obj, ncols=4, contains=None):
    lst = [i for i in dir(obj) if i[0] != '_']
    if contains is not None:
        lst = [i for i in lst if contains in i]
    df = pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
    return df
def model_eval_bin(model_name, ytest, ypreds, yprobs2d, show_plots=True):
    import sklearn.metrics as skmetrics
    import scikitplot.metrics as skpmetrics
    import os

    acc       = skmetrics.accuracy_score(ytest, ypreds)
    precision = skmetrics.precision_score(ytest, ypreds)
    recall    = skmetrics.recall_score(ytest, ypreds)
    f1        = skmetrics.f1_score(ytest, ypreds)
    auc       = skmetrics.roc_auc_score(ytest, ypreds)

    print(skmetrics.classification_report(ytest, ypreds))
    print(skmetrics.confusion_matrix(ytest, ypreds))

    df_res = pd.DataFrame({'Accuracy' : [acc],
                           'Precision': [precision],
                           'Recall'   : [recall],
                           'F1-score' : [f1],
                           'AUC'      : [auc]},
                          index=[model_name])
    display(df_res.style.format("{:.4f}"))

    if not os.path.isdir('../outputs'):
        os.makedirs('../outputs')
    o = './' if ENV_COLAB else '../outputs/'  # './' (not '.') to avoid a hidden file
    df_res.to_csv(o + f'model_{model_name}.csv', index=True)

    if show_plots:
        skpmetrics.plot_precision_recall(ytest, yprobs2d)  # more focus on the minority class
        skpmetrics.plot_roc_curve(ytest, yprobs2d)         # equal focus on both classes
        skpmetrics.plot_confusion_matrix(ytest, ypreds)
def get_profit(y_true, y_pred):
    tn, fp, fn, tp = skmetrics.confusion_matrix(y_true, y_pred).ravel()
    profit = 400*tp - 200*fn - 100*fp
    return profit
scoring = skmetrics.make_scorer(get_profit, greater_is_better=True)
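As a quick sanity check of the profit function, a minimal sketch on made-up toy labels (one tp, one fn, one fp, one tn, so profit = 400 - 200 - 100 = 100):
# toy example (illustrative only): 1 tp, 1 fn, 1 fp, 1 tn
y_true_toy = np.array([1, 1, 0, 0])
y_pred_toy = np.array([1, 0, 1, 0])
print(get_profit(y_true_toy, y_pred_toy))  # 400*1 - 200*1 - 100*1 = 100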
path_data_train = '../data/raw/train.csv'
path_data_test = '../data/raw/test.csv'
if ENV_COLAB:
path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv'
df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)
print(df_train.shape)
print(df_test.shape)
df_train.head(2).append(df_train.tail(2))
(5634, 21)
(1409, 21)
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1621-YNCJH | Female | 0 | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.4 | No |
1 | 7143-BQIBA | Male | 0 | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No |
5632 | 0862-PRCBS | Female | 0 | Yes | Yes | 68 | Yes | Yes | Fiber optic | No | Yes | No | Yes | Yes | Yes | Two year | Yes | Credit card (automatic) | 103.75 | 7039.45 | No |
5633 | 4656-CAURT | Male | 0 | No | No | 69 | Yes | Yes | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Bank transfer (automatic) | 23.95 | 1713.1 | No |
target_name = 'Churn'
px.histogram(df_train, x=target_name,height=300,width=300)
px.histogram(df_train, x='gender', color=target_name,height=300,width=300)
df_train['TotalCharges'] = pd.to_numeric(df_train['TotalCharges'],errors='coerce').fillna(0)
df_test['TotalCharges'] = pd.to_numeric(df_test['TotalCharges'],errors='coerce').fillna(0)
df_train['SeniorCitizen'] = df_train['SeniorCitizen'].map({0:'No',1:'Yes'})
df_test['SeniorCitizen'] = df_test['SeniorCitizen'].map({0:'No',1:'Yes'})
df_Xtrain = df_train.drop(target_name,axis=1)
df_Xtest = df_test.drop(target_name,axis=1)
ser_ytrain = df_train[target_name].map({'No':0,'Yes':1})
ser_ytest = df_test[target_name].map({'No':0,'Yes':1})
ytrain = np.array(ser_ytrain).flatten()
ytest = np.array(ser_ytest).flatten()
index_name = 'customerID'
ser_train_ids = df_Xtrain.pop(index_name)
ser_test_ids = df_Xtest.pop(index_name)
df_Xtrain.head(2)
gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Female | No | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.40 |
1 | Male | No | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 |
cols_num = list(df_train.select_dtypes('number').columns)
cols_num
['tenure', 'MonthlyCharges', 'TotalCharges']
cols_cat = list(df_train.select_dtypes('object').columns)
# gender is not a useful predictor (seen in EDA)
cols_exclude = ['customerID','gender','TotalCharges'] + [target_name]
# NOTE: 'SeniorCitizen' is already object dtype after the map above, so it
# appears twice here; the duplicate is removed later by list(set(...)).
cols_cat = [i for i in cols_cat if i not in cols_exclude] + ['SeniorCitizen']
print(cols_cat)
['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'SeniorCitizen']
cols_num = ['TotalCharges','tenure', 'MonthlyCharges']
cols_num_old = cols_num
cols_cat_old = cols_cat
def combine_two_features(dfx, A, B):
    dfx = dfx.copy()
    assert len(A) == len(B)
    for a, b in zip(A, B):
        dfx[a + '_' + b] = dfx[a] + '_' + dfx[b]
    return dfx
combineA = ['Partner']
combineB = ['Dependents']
combineA = combineA + ['SeniorCitizen']*5
combineB = combineB + ['Dependents','Partner','Contract',
'TechSupport','PaymentMethod']
cols_cat_new = [f'{a}_{b}' for a,b in zip(combineA,combineB)]
cols_cat = list(set(cols_cat + cols_cat_new))
print(cols_cat_new)
# print(cols_cat)
df_Xtrain = combine_two_features(df_Xtrain,combineA,combineB)
df_Xtest = combine_two_features(df_Xtest,combineA,combineB)
['Partner_Dependents', 'SeniorCitizen_Dependents', 'SeniorCitizen_Partner', 'SeniorCitizen_Contract', 'SeniorCitizen_TechSupport', 'SeniorCitizen_PaymentMethod']
def create_groupby_features(dfx, cat, num, agg):
    dfx = dfx.copy()
    for c in cat:
        for n in num:
            for a in agg:
                name = f"{c}_{n}_{a}"
                # use train-set group statistics, keyed by category value;
                # a bare df_train.groupby(c)[n].transform(a) aligns on
                # df_train's index and mismatches rows when dfx is df_test.
                mapping = df_train.groupby(c)[n].agg(a)
                dfx[name] = dfx[c].map(mapping)
    return dfx
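The .map here matters: a bare df_train.groupby(c)[n].transform(a) returns a Series aligned on df_train's index, so assigning it to df_test would silently pair test rows with train rows. A minimal sketch of the pitfall on made-up toy frames:
# toy frames (made up) showing the index-alignment pitfall
toy_tr = pd.DataFrame({'g': ['a','b'], 'x': [10.0, 20.0]})  # index 0,1
toy_te = pd.DataFrame({'g': ['b','a']})                     # index 0,1
bad  = toy_tr.groupby('g')['x'].transform('mean')        # aligned to toy_tr index
good = toy_te['g'].map(toy_tr.groupby('g')['x'].mean())  # keyed by group value
print(bad.tolist())   # [10.0, 20.0] -> wrong rows if assigned to toy_te
print(good.tolist())  # [20.0, 10.0] -> correct train means for toy_te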
# Using more features gave me worse AUC.
# cols_grpcat = ['Contract','PaymentMethod']
# cols_grpnum = ['TotalCharges','MonthlyCharges']
# cols_grpagg = ['mean', 'max', 'min']
cols_grpcat = ['Contract']
cols_grpnum = ['TotalCharges']
cols_grpagg = ['mean']
cols_num_new = [f'{c}_{n}_{a}'
                for c in cols_grpcat
                for n in cols_grpnum
                for a in cols_grpagg]
cols_num = list(set(cols_num + cols_num_new))
print(cols_num_new)
# print(cols_num)
df_Xtrain = create_groupby_features(df_Xtrain,cols_grpcat, cols_grpnum, cols_grpagg)
df_Xtest = create_groupby_features(df_Xtest,cols_grpcat, cols_grpnum, cols_grpagg)
['Contract_TotalCharges_mean']
df_Xtrain.head(2)
gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Partner_Dependents | SeniorCitizen_Dependents | SeniorCitizen_Partner | SeniorCitizen_Contract | SeniorCitizen_TechSupport | SeniorCitizen_PaymentMethod | Contract_TotalCharges_mean | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Female | No | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.40 | Yes_No | No_No | No_Yes | No_Two year | No_Yes | No_Credit card (automatic) | 3683.643192 |
1 | Male | No | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No_No | No_No | No_No | No_Month-to-month | No_Yes | No_Bank transfer (automatic) | 1370.923131 |
cols_drop = ['gender']
df_Xtrain = df_Xtrain.drop(cols_drop,axis=1)
df_Xtest = df_Xtest.drop(cols_drop,axis=1)
all_features = df_Xtrain.columns.tolist()
cols_cat_idx = [all_features.index(i) for i in cols_cat]
# make sure no nans
df_Xtrain.isna().sum().sum(), df_Xtest.isna().sum().sum()
(0, 0)
df_Xtrain_full = df_Xtrain.copy()
ser_ytrain_full = ser_ytrain.copy()
ytrain_full = np.array(ser_ytrain_full).flatten()
from sklearn.model_selection import train_test_split
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
    df_Xtrain_full, ser_ytrain_full,
    test_size=0.2,
    random_state=SEED,
    stratify=ser_ytrain_full)
ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()
print(f"df_train : {df_train.shape}\n")
print(f"df_Xtrain : {df_Xtrain.shape}")
print(f"ser_ytrain : {ser_ytrain.shape}\n")
print(f"df_Xvalid : {df_Xvalid.shape}")
print(f"ser_yvalid : {ser_yvalid.shape}\n")
print(f"df_test : {df_test.shape}")
print(f"ser_ytest : This does not exist.")
df_Xtrain.head(2)
df_train : (5634, 21)

df_Xtrain : (4507, 25)
ser_ytrain : (4507,)

df_Xvalid : (1127, 25)
ser_yvalid : (1127,)

df_test : (1409, 21)
ser_ytest : This does not exist.
SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Partner_Dependents | SeniorCitizen_Dependents | SeniorCitizen_Partner | SeniorCitizen_Contract | SeniorCitizen_TechSupport | SeniorCitizen_PaymentMethod | Contract_TotalCharges_mean | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4555 | No | No | No | 16 | Yes | No | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Month-to-month | No | Credit card (automatic) | 19.75 | 294.90 | No_No | No_No | No_No | No_Month-to-month | No_No internet service | No_Credit card (automatic) | 1370.923131 |
3379 | No | Yes | No | 72 | No | No phone service | DSL | Yes | Yes | Yes | Yes | Yes | Yes | Two year | Yes | Electronic check | 64.70 | 4746.05 | Yes_No | No_No | No_Yes | No_Two year | No_Yes | No_Electronic check | 3683.643192 |
Regression objectives:
MAE, MAPE, Poisson, Quantile, RMSE, Huber, Tweedie, SMAPE, R2, MSLE, etc.

Binary classification objectives:
Logloss, CrossEntropy, Precision, Recall, F1, BalancedAccuracy

Multiclass objectives:
MultiClass, MultiClassOneVsAll, Precision, Recall, F1, TotalF1, MCC,
Accuracy, HingeLoss, ZeroOneLoss, Kappa, WKappa, AUC
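These objective and metric names are passed to CatBoost as strings; a minimal sketch (the parameter choices here are illustrative, not tuned):
# loss_function is optimized; eval_metric and custom_metric are only monitored
clf = CatBoostClassifier(loss_function='Logloss',
                         eval_metric='AUC',
                         custom_metric=['Precision','Recall'],
                         verbose=False)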
#============================================================
catboost.CatBoostClassifier(
    iterations            = None,  # n_estimators, num_trees, num_boost_round
    learning_rate         = None,  # eta
    depth                 = None,  # max_depth
    l2_leaf_reg           = None,  # reg_lambda
    scale_pos_weight      = None,
    random_seed           = None,  # random_state
    use_best_model        = None,
    verbose               = None,  # verbose_eval
    silent                = None,
    logging_level         = None,  # Silent Verbose Info Debug
    ignored_features      = None,
    cat_features          = None,  # indices or names
    text_features         = None,
    one_hot_max_size      = None,
    objective             = None,  # loss_function
    custom_loss           = None,
    custom_metric         = None,
    eval_metric           = None,
    score_function        = None,  # Cosine L2 NewtonCosine NewtonL2
    subsample             = None,
    colsample_bylevel     = None,
    early_stopping_rounds = None,
    grow_policy           = None,
    classes_count         = None,
    class_weights         = None,  # list or dict, e.g. {0:1.0, 1:0.5};
                                   # set 1 for class zero, then
                                   # weight = sum_neg/sum_pos for class one.
                                   # Do not use this parameter together with
                                   # auto_class_weights and scale_pos_weight.
    auto_class_weights    = None,
    class_names           = None,
    save_snapshot         = None,
    snapshot_file         = None,
    snapshot_interval     = None
)
#===========================================================
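A quick way to see which of the aliases above were actually set is to read the params back; a small sketch (as far as I know, get_params returns the explicitly passed parameters):
clf = CatBoostClassifier(n_estimators=200, random_seed=SEED)  # aliases of iterations, random_state
print(clf.get_params())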
from catboost.utils import eval_metric
from math import log
labels = [1, 0, 1]
probabilities = [0.4, 0.1, 0.9]
# In binary classification it is necessary to apply the logit function
# to the probabilities to get approxes.
logit = lambda x: log(x / (1 - x))
approxes = list(map(logit, probabilities))
accuracy = eval_metric(labels, approxes, 'Accuracy')
#======================================================
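With these toy values a 0.5 threshold predicts [0, 0, 1] against labels [1, 0, 1], so the expected accuracy is 2/3 (my arithmetic, not an output of the original run):
print(accuracy)  # expected ~0.667: predictions [0, 0, 1] vs labels [1, 0, 1]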
class LoglossMetric(object):
    def get_final_error(self, error, weight):
        return error / (weight + 1e-38)

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, target, weight):
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]
        error_sum = 0.0
        weight_sum = 0.0
        for i in range(len(approx)):
            e = np.exp(approx[i])
            p = e / (1 + e)
            w = 1.0 if weight is None else weight[i]
            weight_sum += w
            error_sum += -w * (target[i] * np.log(p) + (1 - target[i]) * np.log(1 - p))
        return error_sum, weight_sum

model = CatBoostClassifier(eval_metric=LoglossMetric())
CatBoost classifier fit
catboost.CatBoostClassifier.fit(X, y,
    cat_features          = None,
    text_features         = None,
    sample_weight         = None,
    baseline              = None,
    use_best_model        = None,
    eval_set              = None,
    verbose               = None,
    logging_level         = None,
    plot                  = False,
    column_description    = None,
    verbose_eval          = None,
    metric_period         = None,
    silent                = None,
    early_stopping_rounds = None,
    save_snapshot         = None,
    snapshot_file         = None,
    snapshot_interval     = None,
    init_model            = None
)
# https://stackoverflow.com/questions/65462220/how-to-create-custom-eval-metric-for-catboost
from sklearn.metrics import confusion_matrix
from scipy.special import expit

class ProfitMetric:
    @classmethod
    def get_profit(cls, y_true, log_odd):
        y_true = y_true.astype(int)
        # catboost gives outputs as raw log-odds; to get labels:
        # logit(p) = log(p/(1-p)) and expit(x) = 1/(1+exp(-x)),
        # then threshold the probabilities at 0.5.
        # (a bare .astype(int) would truncate every probability to 0)
        y_pred = (expit(log_odd) > 0.5).astype(int)
        # print("ACCURACY:", (y_pred == y_true).mean())
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        profit = 400*tp - 200*fn - 100*fp
        return profit

    def is_max_optimal(self):
        return True  # greater is better

    def evaluate(self, approxes, target, weight):
        # for binary classification, len(approxes) == 1
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])
        y_true = np.array(target).astype(int)
        log_odd = approxes[0]
        score = self.get_profit(y_true, log_odd)
        output_weight = 1  # weight is not used
        return score, output_weight

    def get_final_error(self, error, weight):
        return error
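A small consistency check of the custom metric, assuming the calling convention above (approxes is a list holding one array of raw log-odds): on toy data it should agree with the get_profit helper defined earlier.
# toy log-odds (made up): positive -> p > 0.5 -> predicted class 1
toy_target = [1, 1, 0, 0]
toy_approx = [[2.0, -2.0, 2.0, -2.0]]  # decodes to predictions [1, 0, 1, 0]
score, _ = ProfitMetric().evaluate(toy_approx, toy_target, None)
print(score)                                           # 100
print(get_profit(np.array(toy_target), [1, 0, 1, 0]))  # 100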
from catboost import CatBoostClassifier
# CatBoostClassifier?
import catboost
show_methods(catboost)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | CatBoost | EFstrType | Pool | to_regressor |
1 | CatBoostClassifier | FeaturesData | core | train |
2 | CatBoostError | MetricVisualizer | cv | version |
3 | CatBoostRegressor | MultiRegressionCustomMetric | sum_models | widget |
4 | CatboostError | MultiRegressionCustomObjective | to_classifier |
show_methods(catboost.CatBoostClassifier)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | best_iteration_ | get_best_iteration | get_test_evals | random_seed_ |
1 | best_score_ | get_best_score | get_text_feature_indices | randomized_search |
2 | calc_feature_statistics | get_borders | get_tree_leaf_counts | save_borders |
3 | calc_leaf_indexes | get_cat_feature_indices | grid_search | save_model |
4 | classes_ | get_evals_result | is_fitted | score |
5 | compare | get_feature_importance | iterate_leaf_indexes | set_feature_names |
6 | copy | get_leaf_values | learning_rate_ | set_leaf_values |
7 | create_metric_calcer | get_leaf_weights | load_model | set_params |
8 | drop_unused_features | get_metadata | plot_partial_dependence | set_scale_and_bias |
9 | eval_metrics | get_object_importance | plot_predictions | shrink |
10 | evals_result_ | get_param | plot_tree | staged_predict |
11 | feature_importances_ | get_params | predict | staged_predict_log_proba |
12 | feature_names_ | get_scale_and_bias | predict_log_proba | staged_predict_proba |
13 | fit | get_test_eval | predict_proba | tree_count_ |
14 | get_all_params |
# catboost.CatBoostClassifier.fit?
catboost.CatBoostClassifier().fit
<bound method CatBoostClassifier.fit of <catboost.core.CatBoostClassifier object at 0x7fe2b4b19a90>>
df_Xtrain.head()
SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Partner_Dependents | SeniorCitizen_Dependents | SeniorCitizen_Partner | SeniorCitizen_Contract | SeniorCitizen_TechSupport | SeniorCitizen_PaymentMethod | Contract_TotalCharges_mean | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4555 | No | No | No | 16 | Yes | No | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Month-to-month | No | Credit card (automatic) | 19.75 | 294.90 | No_No | No_No | No_No | No_Month-to-month | No_No internet service | No_Credit card (automatic) | 1370.923131 |
3379 | No | Yes | No | 72 | No | No phone service | DSL | Yes | Yes | Yes | Yes | Yes | Yes | Two year | Yes | Electronic check | 64.70 | 4746.05 | Yes_No | No_No | No_Yes | No_Two year | No_Yes | No_Electronic check | 3683.643192 |
1713 | No | Yes | No | 67 | Yes | Yes | Fiber optic | No | Yes | Yes | Yes | Yes | Yes | One year | Yes | Credit card (automatic) | 109.70 | 7344.45 | Yes_No | No_No | No_Yes | No_One year | No_Yes | No_Credit card (automatic) | 3018.965636 |
2399 | Yes | Yes | No | 47 | Yes | Yes | Fiber optic | No | No | Yes | No | Yes | Yes | Month-to-month | No | Electronic check | 99.70 | 4747.20 | Yes_No | Yes_No | Yes_Yes | Yes_Month-to-month | Yes_No | Yes_Electronic check | 1370.923131 |
1096 | No | Yes | No | 46 | No | No phone service | DSL | Yes | No | Yes | Yes | No | No | Two year | No | Credit card (automatic) | 40.40 | 1842.70 | Yes_No | No_No | No_Yes | No_Two year | No_Yes | No_Credit card (automatic) | 3683.643192 |
print('cat_features: ', sorted(cols_cat_idx))
cat_features: [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23]
model = CatBoostClassifier(
    n_estimators=1000,
    random_state=SEED,
    cat_features=cols_cat_idx,
    scale_pos_weight=4,
    eval_metric=ProfitMetric()
)

model.fit(df_Xtrain, ser_ytrain, plot=True, verbose=False,
          eval_set=(df_Xvalid, ser_yvalid),
          use_best_model=True,
          early_stopping_rounds=50
          )
<catboost.core.CatBoostClassifier at 0x7fe2b4f913d0>
vdpreds = model.predict(df_Xvalid)
vdprobs2d = model.predict_proba(df_Xvalid)
vdpreds = (vdprobs2d[:, 1] > 0.5).astype(int)  # threshold at 0.5, same as model.predict
yvalid = np.array(ser_yvalid)

print(confusion_matrix(yvalid, vdpreds))
profit = get_profit(yvalid, vdpreds)
print(f'validation profit = ${profit:,d}')
[[828   0]
 [299   0]]
validation profit = $-59,800
ypreds = model.predict(df_Xtest)
yprobs2d = model.predict_proba(df_Xtest)
model_eval_bin('catboost',ytest,ypreds,yprobs2d,show_plots=False)
profit = get_profit(ytest,ypreds)
print(f'test profit = ${profit:,d}')
              precision    recall  f1-score   support

           0       0.74      0.47      0.57      1035
           1       0.27      0.56      0.37       374

    accuracy                           0.49      1409
   macro avg       0.51      0.51      0.47      1409
weighted avg       0.62      0.49      0.52      1409

[[483 552]
 [166 208]]
Accuracy | Precision | Recall | F1-score | AUC | |
---|---|---|---|---|---|
catboost | 0.4904 | 0.2737 | 0.5561 | 0.3668 | 0.5114 |
test profit = $-5,200
history = model.get_evals_result()
print(history.keys())
dict_keys(['learn', 'validation'])
metric_name_ = list(history['learn'].keys())[0]
metric_name_
'Logloss'
len(history['learn'][metric_name_]) # out of 1000 possible trees, early stopping kept only 51.
51
print(f" training profit = {np.mean(history['learn']['ProfitMetric']):,.0f}")
print(f" validation profit = {np.mean(history['validation']['ProfitMetric']):,.0f}")
 training profit = -239,200
 validation profit = -59,800
We should generally optimize model complexity first, then tune convergence.

Parameters:

- n_estimators: the number of boosted trees.
- learning_rate: step-size shrinkage used to prevent overfitting; range is (0,1].
  Note: catboost automatically selects a learning rate if none is given.
- depth: how deeply each tree is allowed to grow during any boosting round.
- subsample: fraction of samples used per tree; a low value can lead to underfitting.
- colsample_bylevel: fraction of features used in each split selection; this helps
  control overfitting. Values range over (0,1].

WARNING: there is no colsample_bytree parameter in catboost; use colsample_bylevel.

For optuna:

study.optimize(
    n_trials = None,
    timeout = None,
    n_jobs = 1,
    catch = (),
    callbacks = None,
    gc_after_trial = False,
    show_progress_bar = False
)
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING) # use INFO to see progress
from optuna.pruners import SuccessiveHalvingPruner
show_methods(optuna)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | Any | delete_study | load_study | structs |
1 | Study | distributions | logging | study |
2 | TYPE_CHECKING | exceptions | multi_objective | trial |
3 | Trial | get_all_study_summaries | progress_bar | type_checking |
4 | TrialPruned | importance | pruners | types |
5 | create_study | importlib | samplers | version |
6 | create_trial | integration | storages | visualization |
7 | dashboard |
params_optuna_study = dict(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
    study_name='catboost_optuna',
    storage='sqlite:///catboost_optuna_churn.db',
    load_if_exists=True,
    pruner=optuna.pruners.SuccessiveHalvingPruner(min_resource=100)
)
study = optuna.create_study(**params_optuna_study)
n_studies = len(study.trials)
print(f'Number of finished trials: {n_studies}')
Number of finished trials: 57
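Because the study is persisted to SQLite, it can also be reopened explicitly in a later session; a minimal sketch using the same study name and storage URL as above:
study_resumed = optuna.load_study(study_name='catboost_optuna',
                                  storage='sqlite:///catboost_optuna_churn.db')
print(len(study_resumed.trials))  # same trial history as `study`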
path_early_stop_dict = '../artifacts/catboost_optuna_early_stop_dict.joblib'

if n_studies == 0:
    early_stop_dict = {}
else:
    early_stop_dict = joblib.load(path_early_stop_dict)
    print('last study early stopping rounds\n' + '='*35)
    print(early_stop_dict[n_studies-1])
last study early stopping rounds
===================================
[146, 146, 146, 146, 146]
def objective_no_skf(trial):
    global early_stop_dict
    params_cat_optuna = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        #'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        # usually catboost automatically selects a good learning rate.
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'scale_pos_weight': trial.suggest_int('scale_pos_weight', 2, 20),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.01, 1),
        # sampling rows and columns
        # catboost has no colsample_bytree, only colsample_bylevel
        'subsample': trial.suggest_uniform('subsample', 0.6, 1),
        'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 0.6, 1),
        'used_ram_limit': '3gb'
    }

    # fit the model
    model = CatBoostClassifier(random_state=SEED,
                               cat_features=cols_cat_idx,
                               **params_cat_optuna)
    model.fit(df_Xtrain, ser_ytrain,
              eval_set=[(df_Xvalid, ser_yvalid)],
              use_best_model=True,
              verbose=0,
              early_stopping_rounds=100)

    # save early stopping dictionary
    history = model.get_evals_result()
    metric_name_ = list(history['learn'].keys())[0]
    n_rounds = len(history['learn'][metric_name_])
    early_stop_dict[objective.i] = n_rounds
    joblib.dump(early_stop_dict, path_early_stop_dict)

    ypreds = model.predict(df_Xvalid)
    ypreds = np.rint(ypreds)
    #score = skmetrics.roc_auc_score(ser_yvalid.to_numpy().ravel(), ypreds)
    score = get_profit(ser_yvalid.to_numpy().ravel(), ypreds)

    # counter to update early stopping dict
    objective.i += 1
    return score
def objective(trial):  # this is slower but more stable.
    global early_stop_dict
    params_cat_optuna = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'scale_pos_weight': trial.suggest_categorical('scale_pos_weight', [2,3,4,5]),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.01, 1),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1),
        'used_ram_limit': '3gb'
    }

    # skf is more time-consuming but more stable.
    skf = StratifiedKFold(n_splits=5, random_state=SEED, shuffle=True)
    scores = []
    lst_early_rounds = []
    for idx_tr, idx_vd in skf.split(df_Xtrain_full, ser_ytrain_full):
        Xtr, Xvd = df_Xtrain_full.iloc[idx_tr], df_Xtrain_full.iloc[idx_vd]
        ytr, yvd = ser_ytrain_full.iloc[idx_tr], ser_ytrain_full.iloc[idx_vd]

        model = CatBoostClassifier(random_state=SEED,
                                   cat_features=cols_cat_idx,
                                   **params_cat_optuna)
        model.fit(Xtr, ytr,
                  eval_set=[(Xvd, yvd)],
                  use_best_model=False,
                  verbose=0,
                  early_stopping_rounds=100)

        # save early stopping rounds for this fold
        history = model.get_evals_result()
        metric_name_ = list(history['learn'].keys())[0]
        lst_early_rounds.append(len(history['learn'][metric_name_]))

        ypreds = model.predict(Xvd)
        ypreds = np.rint(ypreds)
        #score_ = skmetrics.roc_auc_score(yvd.to_numpy().ravel(), ypreds)
        score_ = get_profit(yvd.to_numpy().ravel(), ypreds)
        scores.append(score_)
    #==============================================================
    score = np.mean(scores)  # sometimes we can also use np.max

    # counter to update early stopping dict
    early_stop_dict[objective.i] = lst_early_rounds
    joblib.dump(early_stop_dict, path_early_stop_dict)
    objective.i += 1
    return score
hasattr(objective,'i')
False
%%time
# NOTE: there is inherent non-determinism in optuna hyperparameter selection;
# we may not get the same hyperparameters when run twice.
if not hasattr(objective, 'i'):
    objective.i = len(study.trials)

N_TRIALS = 1  # make it large
study.optimize(objective, n_trials=N_TRIALS, timeout=600)
CPU times: user 53.1 s, sys: 2.58 s, total: 55.7 s Wall time: 18.4 s
show_methods(study)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | add_trial | enqueue_trial | set_system_attr | system_attrs |
1 | best_params | get_trials | set_user_attr | trials |
2 | best_trial | optimize | stop | trials_dataframe |
3 | best_value | pruner | study_name | user_attrs |
4 | direction | sampler |
# study.get_trials()
# FrozenTrial starting from number=0 ,1, ...
study.best_trial
FrozenTrial(number=50, value=68060.0, datetime_start=datetime.datetime(2020, 12, 30, 10, 8, 59, 396461), datetime_complete=datetime.datetime(2020, 12, 30, 10, 9, 35, 502818), params={'learning_rate': 0.01188013991897388, 'max_depth': 3, 'n_estimators': 1719, 'reg_lambda': 0.2746130196964879, 'scale_pos_weight': 5, 'subsample': 0.9449631695153495}, distributions={'learning_rate': LogUniformDistribution(high=1.0, low=0.01), 'max_depth': IntUniformDistribution(high=12, low=3, step=1), 'n_estimators': IntUniformDistribution(high=2000, low=100, step=1), 'reg_lambda': UniformDistribution(high=1, low=0.01), 'scale_pos_weight': CategoricalDistribution(choices=(2, 3, 4, 5)), 'subsample': UniformDistribution(high=1, low=0.6)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=51, state=TrialState.COMPLETE)
lst_best_n_estimators = early_stop_dict[study.best_trial.number]
best_n_estimators = np.max(early_stop_dict[study.best_trial.number])
lst_best_n_estimators, best_n_estimators
([1077, 926, 1214, 1053, 1078], 1214)
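One hedged option (not what the run below does): when refitting on the full training data without an eval set, early stopping cannot trigger, so we could override the sampled n_estimators with the early-stopped round count from the best trial, e.g.:
# sketch: cap n_estimators at the largest early-stopped round count
params_refit = dict(params_best, n_estimators=int(best_n_estimators))
# model = CatBoostClassifier(**params_refit, cat_features=cols_cat_idx,
#                            verbose=False, random_state=SEED)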
%%time
# Resume from last time
N_TRIALS = 10  # make it large
study = optuna.create_study(**params_optuna_study)
study.optimize(objective,
               n_trials=N_TRIALS,
               timeout=600,
               show_progress_bar=True)
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/optuna/progress_bar.py:46: ExperimentalWarning: Progress bar is experimental (supported from v1.2.0). The interface can change in the future.
CPU times: user 12min 28s, sys: 42.6 s, total: 13min 11s Wall time: 4min 34s
# %%time
# # Resume from last time
# N_TRIALS = 10 # make it large eg. 10, 20, 100
# for _ in trange(N_TRIALS):
# study = optuna.create_study(**params_optuna_study)
# study.optimize(objective, n_trials=1)
show_methods(optuna.visualization)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | is_available | plot_edf | plot_optimization_history | plot_param_importances |
1 | plot_contour | plot_intermediate_values | plot_parallel_coordinate | plot_slice |
optuna.visualization.plot_optimization_history(study)
optuna.visualization.plot_param_importances(study)
fig = optuna.visualization.plot_parallel_coordinate(study)
fig['layout']['width'] = 800
fig.show()
optuna.visualization.plot_slice(study,
params=['learning_rate','max_depth']
)
optuna.visualization.plot_contour(study,params=['learning_rate','max_depth'])
print(f'Number of finished trials: {len(study.trials)}')
# best trial
best_trial = study.best_trial
# best params (note: even if we fix random_state, params changes each time.)
params_best = study.best_trial.params
params_best
Number of finished trials: 68
{'learning_rate': 0.01188013991897388, 'max_depth': 3, 'n_estimators': 1719, 'reg_lambda': 0.2746130196964879, 'scale_pos_weight': 5, 'subsample': 0.9449631695153495}
notes = """
{'learning_rate': 0.01147263253168174,
 'max_depth': 4,
 'n_estimators': 1511,
 'reg_lambda': 0.18891811085825794,
 'scale_pos_weight': 5,
 'subsample': 0.811440063402815} # gives $84,500
"""

# 1. Each run can give different best params; optuna is not deterministic.
# 2. For some reason, with 21 rounds I got $84k but with 31 rounds $82k.
#    This happens because such hyperparams overfit a single validation set and
#    do not generalize to other validation folds. Therefore, use the skf objective.

# this gives profit = $84,800
# I got this without using skf in the optuna objective
# params_best = {'colsample_bylevel': 0.8028576058235254,
#                'max_depth': 3,
#                'n_estimators': 1258,
#                'reg_lambda': 0.9775888228189168,
#                'scale_pos_weight': 10,
#                'subsample': 0.6068141443822124}
model = CatBoostClassifier(**params_best, cat_features=cols_cat_idx,
                           verbose=False, random_state=SEED)
model.fit(df_Xtrain_full,ytrain_full)
ypreds = model.predict(df_Xtest)
yprobs2d = model.predict_proba(df_Xtest)
model_eval_bin('catboost+optuna',ytest,ypreds,yprobs2d,show_plots=False)
profit = get_profit(ytest,ypreds)
print(f"profit = ${profit:,d}")
              precision    recall  f1-score   support

           0       0.94      0.63      0.75      1035
           1       0.46      0.89      0.61       374

    accuracy                           0.70      1409
   macro avg       0.70      0.76      0.68      1409
weighted avg       0.81      0.70      0.71      1409

[[648 387]
 [ 42 332]]
Accuracy | Precision | Recall | F1-score | AUC | |
---|---|---|---|---|---|
catboost+optuna | 0.6955 | 0.4618 | 0.8877 | 0.6075 | 0.7569 |
profit = $85,700
model_eval_bin('catboost+optuna',ytest,ypreds,yprobs2d,show_plots=True)
              precision    recall  f1-score   support

           0       0.94      0.63      0.75      1035
           1       0.46      0.89      0.61       374

    accuracy                           0.70      1409
   macro avg       0.70      0.76      0.68      1409
weighted avg       0.81      0.70      0.71      1409

[[648 387]
 [ 42 332]]
Accuracy | Precision | Recall | F1-score | AUC | |
---|---|---|---|---|---|
catboost+optuna | 0.6955 | 0.4618 | 0.8877 | 0.6075 | 0.7569 |
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/utils/deprecation.py:86: FutureWarning: Function plot_roc_curve is deprecated; This will be removed in v0.5.0. Please use scikitplot.metrics.plot_roc instead.
import shap
shap.initjs()
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(df_Xtest)
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
df_Xtest.head(2)
SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Partner_Dependents | SeniorCitizen_Dependents | SeniorCitizen_Partner | SeniorCitizen_Contract | SeniorCitizen_TechSupport | SeniorCitizen_PaymentMethod | Contract_TotalCharges_mean | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | No | No | No | 1 | Yes | No | DSL | Yes | No | No | No | No | No | Month-to-month | Yes | Credit card (automatic) | 48.6 | 48.6 | No_No | No_No | No_No | No_Month-to-month | No_No | No_Credit card (automatic) | 1370.923131 |
1 | Yes | No | No | 56 | Yes | Yes | Fiber optic | No | Yes | Yes | Yes | Yes | No | Two year | Yes | Bank transfer (automatic) | 99.9 | 5706.3 | No_No | Yes_No | Yes_No | Yes_Two year | Yes_Yes | Yes_Bank transfer (automatic) | 3683.643192 |
# Look only at the first row of test data.
# Use matplotlib=True to avoid Javascript.
idx = 0
shap.force_plot(explainer.expected_value,
                shap_values[idx, :],
                df_Xtest.iloc[idx, :],
                matplotlib=False,
                text_rotation=90)

# for this row, the predicted label is ...
# red features push the prediction higher,
# blue features push it lower.
shap.summary_plot(shap_values, df_Xtest)
shap.summary_plot(shap_values, df_Xtest, plot_type='bar')
shap.dependence_plot(ind='TotalCharges', interaction_index='tenure',
                     shap_values=shap_values,
                     features=df_Xtest,
                     display_features=df_Xtest)
time_taken = time.time() - time_start_notebook
h, m = divmod(time_taken, 60*60)
print('Time taken to run whole notebook: {:.0f} hr '
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m, 60)))
Time taken to run whole notebook: 0 hr 5 min 23 secs