References
import time
time_start_notebook = time.time()
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
time_colab_start = time.time()
if ENV_COLAB:
# usual imports
!pip install watermark
!pip install scikit-plot
# HPO for pycaret
!pip install tune-sklearn ray[tune] # search_library = 'sklearn_tune' dont work even if we install this
!pip install optuna # hyperopt is already in colab
# gpu version of lightgbm for pycaret
!pip uninstall lightgbm -y
!pip install lightgbm --install-option=--gpu --install-option="--opencl-include-dir=/usr/local/cuda/include/" --install-option="--opencl-library=/usr/local/cuda/lib64/libOpenCL.so"
# regular pycaret without gpu
!pip install pycaret-nightly[full]
# ipywidget
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension
from pycaret.utils import enable_colab
enable_colab()
print('Environment: Google Colab')
time_colab = time.time() - time_colab_start
h,m = divmod(time_colab,60*60)
print('Time taken by colab: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken by colab: 0 hr 0 min 0 secs
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
sns.set()
import joblib
from tqdm import tqdm_notebook as tqdm
# special
import pycaret
# settings
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%matplotlib inline
%load_ext watermark
%watermark -iv
sys : 3.8.5 (default, Sep 4 2020, 02:22:02) [Clang 10.0.0 ] pandas : 1.1.5 numpy : 1.19.4 autopep8 : 1.5.4 seaborn : 0.11.0 pycaret : 2.2.2 matplotlib: 3.2.2 joblib : 1.0.0 json : 2.0.9
def show_methods(obj, ncols=4,contains=None):
lst = [i for i in dir(obj) if i[0]!='_' ]
if contains is not None:
lst = [i for i in lst if contains in i]
df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
return df
def compare_new_models(name,desc,mean_row,ofile,
df_eval=None,sort='Recall',show=True):
"""Create dataframe from output of pycaret new model.
Parameters
-----------
name: str
Name of the model. eg. xgboost
desc: str
Description of the model. e.g tuned,calibrated
mean_arr: np.ndarray
The mean row.
e.g.
df_res = pyc.pull()
mean_row = df_res.loc['Mean']
ofile: str
Output file name. e.g. 'pycaret_df_eval_lr.csv'
df_eval: Pandas Dataframe
Template pandas dataframe
sort: str
One of following string: Accuracy, AUC, Recall, Precision, F1, Kappa
Returns:
Pandas Dataframe.
"""
if not isinstance(df_eval, pd.DataFrame):
df_eval = pd.DataFrame({'Model': [],
'Description':[],
'Accuracy':[],
'AUC':[],
'Recall':[],
'Precision':[],
'F1':[],
'Kappa':[],
'MCC': [],
'LogLoss': []
})
acc,auc,rec,pre,f1,kap,mcc,logloss = mean_row
row = [name,desc,acc,auc,rec,pre,f1,kap,mcc,logloss]
df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates()\
.sort_values(sort,ascending=False)
df_eval.index = range(len(df_eval))
df_style = (df_eval.style.apply(lambda ser:
['background: tomato'
if ser.name == sort else ''
for _ in ser]))
if show:
display(df_style)
# save the data
df_eval.to_csv(ofile)
return df_eval
path_data_train = '../data/processed/train_cleaned.csv'
path_data_test = '../data/processed/test_cleaned.csv'
if ENV_COLAB:
path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/processed/train_cleaned.csv'
path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/processed/test_cleaned.csv'
df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)
print(df_train.shape)
print(df_test.shape)
df_train.head(2).append(df_train.tail(2))
(5634, 39) (1409, 39)
Gender | SeniorCitizen | Partner | Dependents | Tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | Contract_Month-to-month | NoSeniorCitizen_Contract_Month-to-month | PaymentMethod0_Contract_Month-to-month0 | InternetService_Fiber optic | StreamingTV_NoInternetService | No_OB_DP_TS | TotalServices | SenCit_Dependents | Partner_Dependents | SenCit_Partner | SenCit_Contract | SenCit_TechSupport | SenCit_PayMeth | Contract_mean_totCharges | Contract_totCharges_diff | PayMeth_mean_monthCharges | PayMeth_monthCharges_diff | Tenure_cat | MonthlyCharges_cat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 0 | 36 | 1 | 2 | 1 | 2 | 2 | 2 | 2 | 0 | 2 | 2 | 1 | 1 | 106.05 | 3834.40 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 2 | 0 | 1 | 1 | 2 | 2 | 1 | 3683.643192 | 150.756808 | 66.703657 | 39.346343 | 3 | 0 |
1 | 1 | 0 | 0 | 0 | 10 | 1 | 0 | 0 | 2 | 0 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 62.25 | 612.95 | 0 | 0 | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 2 | 1 | 1370.923131 | -757.973131 | 66.703657 | -4.453657 | 0 | 0 |
5632 | 0 | 0 | 1 | 1 | 68 | 1 | 2 | 1 | 0 | 2 | 0 | 2 | 2 | 2 | 2 | 1 | 1 | 103.75 | 7039.45 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 2 | 1 | 2 | 1 | 2 | 2 | 1 | 3683.643192 | 3355.806808 | 66.703657 | 37.046343 | 5 | 0 |
5633 | 1 | 0 | 0 | 0 | 69 | 1 | 2 | 2 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 0 | 1 | 23.95 | 1713.10 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 7 | 0 | 0 | 0 | 2 | 1 | 1 | 3683.643192 | -1970.543192 | 66.703657 | -42.753657 | 5 | 0 |
path_data_test_raw = ('https://raw.githubusercontent.com/'
'bhishanpdl/Datasets/master/Projects/'
'Telco_Customer_Churn/raw/test.csv')
df_test_raw1 = pd.read_csv(path_data_test_raw,usecols=['customerID'])
df_test_raw1.head(2)
customerID | |
---|---|
0 | 1794-HBQTJ |
1 | 0356-OBMAC |
ser_test_ids = df_test_raw1['customerID']
target_name = 'Churn'
# check for nans
df_train.isna().sum().sum()
0
import pycaret
import pycaret.classification as pyc
pyc.setup(
data: pandas.core.frame.DataFrame,
target: str,
train_size = 0.7,
test_data = None,
preprocess = True,
imputation_type = 'simple',
iterative_imputation_iters = 5,
categorical_features = None,
categorical_imputation = 'constant',
categorical_iterative_imputer = 'lightgbm',
ordinal_features = None,
high_cardinality_features = None,
high_cardinality_method = 'frequency',
numeric_features = None,
numeric_imputation = 'mean',
numeric_iterative_imputer = 'lightgbm',
date_features = None,
ignore_features = None,
normalize = False,
normalize_method = 'zscore',
transformation = False,
transformation_method = 'yeo-johnson',
handle_unknown_categorical = True,
unknown_categorical_method = 'least_frequent',
pca = False,
pca_method = 'linear',
pca_components = None,
ignore_low_variance = False,
combine_rare_levels = False,
rare_level_threshold = 0.1,
bin_numeric_features = None,
remove_outliers = False,
outliers_threshold = 0.05,
remove_multicollinearity = False,
multicollinearity_threshold = 0.9,
remove_perfect_collinearity = True,
create_clusters = False,
cluster_iter = 20,
polynomial_features = False,
polynomial_degree = 2,
trigonometry_features = False,
polynomial_threshold = 0.1,
group_features = None,
group_names = None,
feature_selection = False,
feature_selection_threshold = 0.8,
feature_selection_method = 'classic',
feature_interaction = False,
feature_ratio = False,
interaction_threshold = 0.01,
fix_imbalance = False,
fix_imbalance_method = None,
data_split_shuffle = True,
data_split_stratify = False,
fold_strategy = 'stratifiedkfold',
fold = 10,
fold_shuffle = False,
fold_groups = None,
n_jobs = -1,
use_gpu = False,
custom_pipeline = None,
html = True,
session_id = None,
log_experiment = False,
experiment_name = None,
log_plots = False,
log_profile = False,
log_data = False,
silent = False,
verbose = True,
profile = False,
profile_kwargs = None,
)
df_train.shape
(5634, 39)
show_methods(pyc)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | Any | calibrate_model | interpret_model | pull |
1 | Dict | compare_models | io | pycaret |
2 | Display | create_model | is_in_colab | remove_metric |
3 | List | deploy_model | load_config | save_config |
4 | MLUsecase | enable_colab | load_model | save_model |
5 | Optional | ensemble_model | models | set_config |
6 | Tuple | evaluate_model | np | setup |
7 | Union | finalize_model | optimize_threshold | stack_models |
8 | add_metric | get_config | pd | traceback |
9 | automl | get_logs | plot_model | tune_model |
10 | blend_models | get_metrics | predict_model | warnings |
USE_GPU = False
if ENV_COLAB:
USE_GPU = True
print(USE_GPU)
False
exp = pyc.setup(df_train,target_name,
train_size=0.8,
session_id=SEED,
use_gpu=USE_GPU,
preprocess = True,
categorical_features = None,
ordinal_features = None,
high_cardinality_features = None,
numeric_features = None,
date_features = None,
ignore_features = None,
normalize = False,
data_split_stratify = True,
silent=True,
profile=False,
log_experiment=False,
polynomial_features=True,
# fix_imbalance=True, # gives attribute error
)
# use silent = True to check inferred datatypes
# then assign numeric and categorical features yourself.
#
# if sampling = False, 100% of data is used and its too slow
# if sampling = True, we need to enter number eg. 0.3 ourself.
"""
Here, we have data < 25k rows, so I have chosen not to use sampling.
""";
Description | Value | |
---|---|---|
0 | session_id | 100 |
1 | Target | Churn |
2 | Target Type | Binary |
3 | Label Encoded | 0: 0, 1: 1 |
4 | Original Data | (5634, 39) |
5 | Missing Values | False |
6 | Numeric Features | 6 |
7 | Categorical Features | 32 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (4507, 58) |
12 | Transformed Test Set | (1127, 58) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | True |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | False |
20 | Experiment Name | clf-default-name |
21 | USI | 07bf |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | mean |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | constant |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | False |
30 | Normalize Method | None |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Clustering | False |
45 | Clustering Iteration | None |
46 | Polynomial Features | True |
47 | Polynomial Degree | 2 |
48 | Trignometry Features | False |
49 | Polynomial Threshold | 0.100000 |
50 | Group Features | False |
51 | Feature Selection | False |
52 | Features Selection Threshold | None |
53 | Feature Interaction | False |
54 | Feature Ratio | False |
55 | Interaction Threshold | None |
56 | Fix Imbalance | False |
57 | Fix Imbalance Method | SMOTE |
pyc.models(internal=True)[['Name', 'GPU Enabled']]
# google colab does not support cuml and thereby sklearn models
# we need to run blazingsql notebooks (not colab) to use cuml models.
Name | GPU Enabled | |
---|---|---|
ID | ||
lr | Logistic Regression | False |
knn | K Neighbors Classifier | False |
nb | Naive Bayes | False |
dt | Decision Tree Classifier | False |
svm | SVM - Linear Kernel | False |
rbfsvm | SVM - Radial Kernel | False |
gpc | Gaussian Process Classifier | False |
mlp | MLP Classifier | False |
ridge | Ridge Classifier | False |
rf | Random Forest Classifier | False |
qda | Quadratic Discriminant Analysis | False |
ada | Ada Boost Classifier | False |
gbc | Gradient Boosting Classifier | False |
lda | Linear Discriminant Analysis | False |
et | Extra Trees Classifier | False |
xgboost | Extreme Gradient Boosting | False |
lightgbm | Light Gradient Boosting Machine | False |
catboost | CatBoost Classifier | False |
Bagging | Bagging Classifier | False |
Stacking | Stacking Classifier | False |
Voting | Voting Classifier | False |
CalibratedCV | Calibrated Classifier CV | False |
Comparing All Models
accuracy
pyc.compare_models(
include = None,
exclude = None,
fold = None,
round = 4,
cross_validation = True,
sort = 'Accuracy',
n_select = 1,
budget_time = None,
turbo = True,
errors = 'ignore',
fit_kwargs = None,
groups = None,
verbose = True,
)
pyc.get_metrics().index
Index(['acc', 'auc', 'recall', 'precision', 'f1', 'kappa', 'mcc'], dtype='object', name='ID')
# add Log Loss metric in pycaret
from sklearn.metrics import log_loss
pyc.add_metric('logloss', 'LogLoss', log_loss, greater_is_better=False)
Name LogLoss Display Name LogLoss Score Function <function log_loss at 0x7fb620bd0e50> Scorer make_scorer(log_loss, greater_is_better=False) Target pred Args {} Greater is Better False Multiclass True Custom True Name: logloss, dtype: object
best = pyc.compare_models(sort = 'AUC',fold=5)
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | LogLoss | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|---|
gbc | Gradient Boosting Classifier | 0.8103 | 0.8474 | 0.5377 | 0.6816 | 0.6009 | 0.4787 | 0.4847 | 6.5520 | 0.4120 |
lr | Logistic Regression | 0.8083 | 0.8465 | 0.5552 | 0.6691 | 0.6061 | 0.4809 | 0.4851 | 6.6210 | 0.6320 |
ada | Ada Boost Classifier | 0.8141 | 0.8462 | 0.5577 | 0.6838 | 0.6142 | 0.4934 | 0.4980 | 6.4218 | 0.1400 |
lda | Linear Discriminant Analysis | 0.8110 | 0.8458 | 0.5552 | 0.6762 | 0.6095 | 0.4864 | 0.4907 | 6.5291 | 0.0320 |
catboost | CatBoost Classifier | 0.8003 | 0.8451 | 0.5268 | 0.6553 | 0.5836 | 0.4543 | 0.4593 | 6.8969 | 3.5660 |
lightgbm | Light Gradient Boosting Machine | 0.7941 | 0.8326 | 0.5301 | 0.6366 | 0.5776 | 0.4431 | 0.4469 | 7.1115 | 0.6360 |
xgboost | Extreme Gradient Boosting | 0.7870 | 0.8268 | 0.5193 | 0.6179 | 0.5641 | 0.4246 | 0.4275 | 7.3568 | 0.9380 |
rf | Random Forest Classifier | 0.7932 | 0.8265 | 0.4958 | 0.6442 | 0.5602 | 0.4280 | 0.4344 | 7.1420 | 0.2680 |
nb | Naive Bayes | 0.7131 | 0.8232 | 0.8244 | 0.4782 | 0.6048 | 0.4044 | 0.4414 | 9.9088 | 0.0200 |
et | Extra Trees Classifier | 0.7772 | 0.8053 | 0.5025 | 0.5962 | 0.5451 | 0.3990 | 0.4018 | 7.6938 | 0.2600 |
knn | K Neighbors Classifier | 0.7737 | 0.7819 | 0.4975 | 0.5902 | 0.5389 | 0.3905 | 0.3937 | 7.8165 | 0.0400 |
dt | Decision Tree Classifier | 0.7444 | 0.6836 | 0.5485 | 0.5177 | 0.5325 | 0.3569 | 0.3572 | 8.8281 | 0.0280 |
qda | Quadratic Discriminant Analysis | 0.6772 | 0.5962 | 0.4238 | 0.3970 | 0.4066 | 0.1870 | 0.1884 | 11.1502 | 0.0200 |
svm | SVM - Linear Kernel | 0.6716 | 0.0000 | 0.6169 | 0.4371 | 0.4613 | 0.2623 | 0.3013 | 11.3422 | 0.0400 |
ridge | Ridge Classifier | 0.8103 | 0.0000 | 0.5084 | 0.6972 | 0.5877 | 0.4683 | 0.4784 | 6.5521 | 0.0180 |
Estimator Abbreviated String Original Implementation
--------- ------------------ -------------------------------
Logistic Regression 'lr' linear_model.LogisticRegression
K Nearest Neighbour 'knn' neighbors.KNeighborsClassifier
Naives Bayes 'nb' naive_bayes.GaussianNB
Decision Tree 'dt' tree.DecisionTreeClassifier
SVM (Linear) 'svm' linear_model.SGDClassifier
SVM (RBF) 'rbfsvm' svm.SVC
Gaussian Process 'gpc' gaussian_process.GPC
Multi Level Perceptron 'mlp' neural_network.MLPClassifier
Ridge Classifier 'ridge' linear_model.RidgeClassifier
Random Forest 'rf' ensemble.RandomForestClassifier
Quadratic Disc. Analysis 'qda' discriminant_analysis.QDA
AdaBoost 'ada' ensemble.AdaBoostClassifier
Gradient Boosting 'gbc' ensemble.GradientBoostingClassifier
Linear Disc. Analysis 'lda' discriminant_analysis.LDA
Extra Trees Classifier 'et' ensemble.ExtraTreesClassifier
Extreme Gradient Boosting 'xgboost' xgboost.readthedocs.io
Light Gradient Boosting 'lightgbm' github.com/microsoft/LightGBM
CatBoost Classifier 'catboost' https://catboost.ai
pyc.create_model(
estimator,
fold = None,
round = 4,
cross_validation = True,
fit_kwargs = None,
groups = None,
verbose = True,
**kwargs,
)
model_name = 'lda'
path_df_eval = 'pycaret_df_eval_lda.csv'
model = pyc.create_model(model_name,verbose=False)
mean_row = pyc.pull().loc['Mean'].values
df_eval = compare_new_models(model_name,'default',mean_row,
path_df_eval,sort='AUC',df_eval=None)
Model | Description | Accuracy | AUC | Recall | Precision | F1 | Kappa | MCC | LogLoss | |
---|---|---|---|---|---|---|---|---|---|---|
0 | lda | default | 0.810300 | 0.845700 | 0.552700 | 0.675200 | 0.607200 | 0.483900 | 0.488500 | 6.552000 |
pyc.tune_model(
estimator,
fold = None,
round = 4,
n_iter = 10,
custom_grid = None,
optimize = 'Accuracy',
custom_scorer = None,
search_library = 'scikit-learn', # 'scikit-optimize', 'tune-sklearn','optuna'
search_algorithm = None, # 'scikit-learn', 'scikit-optimize', 'tune-sklearn', 'optuna'
early_stopping = False, # 'asha','hyperband','median'
early_stopping_max_iters = 10,
choose_better = False,
fit_kwargs = None,
groups = None,
return_tuner = False,
verbose = True,
tuner_verbose = True,
**kwargs,
)
%%time
# if model takes long time, comment this and save model later (keep it joblib.keep)
# takes long time: xgb lda
# takes short time: lr nb
model_tuned2 = pyc.tune_model(model,n_iter=500,search_library='optuna',
fold=5,optimize='AUC',verbose=False)
mean_row = pyc.pull().loc['Mean']
desc = 'tuned,optuna,n_iter=500'
df_eval = compare_new_models(model_name,desc,
mean_row,path_df_eval,sort='AUC',df_eval=df_eval)
Model | Description | Accuracy | AUC | Recall | Precision | F1 | Kappa | MCC | LogLoss | |
---|---|---|---|---|---|---|---|---|---|---|
0 | lda | default | 0.810300 | 0.845700 | 0.552700 | 0.675200 | 0.607200 | 0.483900 | 0.488500 | 6.552000 |
1 | lda | tuned,optuna,n_iter=500 | 0.793700 | 0.823400 | 0.517600 | 0.638000 | 0.571200 | 0.437300 | 0.441700 | 7.126800 |
CPU times: user 2min 57s, sys: 27.9 s, total: 3min 25s Wall time: 1min 41s
%%time
# if model takes long time, comment this and save model later (keep it joblib.keep)
# takes long time: xgb lda
# takes short time: lr nb
model_tuned_F1 = pyc.tune_model(model,n_iter=500,
search_library='optuna',
fold=5,optimize='F1',verbose=False)
mean_row = pyc.pull().loc['Mean']
desc = 'tuned,optuna,n_iter=500,optimize=F1'
df_eval = compare_new_models(model_name,desc,
mean_row,path_df_eval,sort='AUC',df_eval=df_eval)
# Wall time: 4min 36s
Model | Description | Accuracy | AUC | Recall | Precision | F1 | Kappa | MCC | LogLoss | |
---|---|---|---|---|---|---|---|---|---|---|
0 | lda | default | 0.810300 | 0.845700 | 0.552700 | 0.675200 | 0.607200 | 0.483900 | 0.488500 | 6.552000 |
1 | lda | tuned,optuna,n_iter=500 | 0.793700 | 0.823400 | 0.517600 | 0.638000 | 0.571200 | 0.437300 | 0.441700 | 7.126800 |
2 | lda | tuned,optuna,n_iter=500,optimize=F1 | 0.763700 | 0.817900 | 0.615400 | 0.550200 | 0.580700 | 0.417000 | 0.418500 | 8.161400 |
CPU times: user 3min 16s, sys: 17.3 s, total: 3min 33s Wall time: 2min 9s
# look at df_eval and find best model
model_best = model_tuned2
model_best_F1 = model_tuned_F1
odir = '.' if ENV_COLAB else '../models/'
path_model_best = odir + 'pycaret_model_best_' + model_name + '.joblib'
path_model_best_F1 = odir+'pycaret_model_best_F1_' + model_name + '.joblib'
joblib.dump(model_best, path_model_best)
joblib.dump(model_best_F1, path_model_best_F1)
model_best = joblib.load(path_model_best)
model_best_F1 = joblib.load(path_model_best_F1)
# after selecting best model, delete unwanted models
import gc
try: del model_tuned1
except: pass
gc.collect()
148
Abbreviated String Name
------------------ -------
* 'auc' - Area Under the Curve
* 'threshold' - Discrimination Threshold
* 'pr' - Precision Recall Curve
* 'confusion_matrix' - Confusion Matrix
* 'error' - Class Prediction Error
* 'class_report' - Classification Report
* 'boundary' - Decision Boundary
* 'rfe' - Recursive Feature Selection
* 'learning' - Learning Curve
* 'manifold' - Manifold Learning
* 'calibration' - Calibration Curve
* 'vc' - Validation Curve
* 'dimension' - Dimension Learning
* 'feature' - Feature Importance
* 'feature_all' - Feature Importance (All)
* 'parameter' - Model Hyperparameter
* 'lift' - Lift Curve
* 'gain' - Gain Chart
* 'tree' - Decision Tree
pyc.plot_model(
estimator,
plot = 'auc',
scale = 1,
save = False,
fold = None,
fit_kwargs = None,
groups = None,
use_train_data = False, # if true, train data will be used for not test
verbose = True,
)
pyc.evaluate_model(
estimator,
fold = None,
fit_kwargs = None,
groups = None,
use_train_data = False,
)
# AUC-ROC plot
pyc.plot_model(model_best, plot = 'auc')
# confusion matrix
pyc.plot_model(model_best, plot = 'confusion_matrix')
# evaluate model (click on buttons)
pyc.evaluate_model(model_best)
pyc.finalize_model(
estimator,
fit_kwargs = None,
group = None,
model_only = True,
)
Docstring:
This function trains a given estimator on the entire dataset including the
holdout set.
# pyc.finalize_model?
model_final = pyc.finalize_model(model_best)
print(model_final)
LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=0.0001015188180155063, solver='eigen', store_covariance=False, tol=0.0001)
df_preds = pyc.predict_model(model_final,df_test)
df_preds.iloc[-5:,-5:]
PayMeth_monthCharges_diff | Tenure_cat | MonthlyCharges_cat | Label | Score | |
---|---|---|---|---|---|
1404 | 16.585135 | 0 | 0 | 1 | 0.6637 |
1405 | -19.764865 | 3 | 0 | 0 | 0.9429 |
1406 | 6.085135 | 1 | 0 | 0 | 0.5546 |
1407 | 30.735135 | 1 | 0 | 1 | 0.7503 |
1408 | -2.787658 | 1 | 0 | 0 | 0.6080 |
ytest = df_preds[target_name].to_numpy().ravel()
yprobs = df_preds['Score'].to_numpy().ravel()
ypreds = df_preds['Label'].to_numpy().ravel()
yprobs2d = np.c_[1-yprobs,yprobs]
pred_name = 'pycaret_lda'
path_pred = f'../predictions/{pred_name}.csv'
df_preds_out = pd.DataFrame({'customerID': ser_test_ids})
df_preds_out[f'ypreds_{pred_name}'] = ypreds
df_preds_out[f'yprobs_{pred_name}'] = yprobs2d[:,1]
df_preds_out.to_csv(path_pred,index=False)
df_preds_out.head()
customerID | ypreds_pycaret_lda | yprobs_pycaret_lda | |
---|---|---|---|
0 | 1794-HBQTJ | 0 | 0.6444 |
1 | 0356-OBMAC | 0 | 0.9264 |
2 | 4077-CROMM | 1 | 0.6536 |
3 | 5442-PPTJY | 0 | 0.9534 |
4 | 2333-KWEWW | 0 | 0.9574 |
def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
import sklearn.metrics as skmetrics
import scikitplot.metrics as skpmetrics
import os
acc = skmetrics.accuracy_score(ytest,ypreds)
precision = skmetrics.precision_score(ytest,ypreds)
recall = skmetrics.recall_score(ytest,ypreds)
f1 = skmetrics.f1_score(ytest,ypreds)
auc = skmetrics.roc_auc_score(ytest,ypreds)
print(skmetrics.classification_report(ytest,ypreds))
print(skmetrics.confusion_matrix(ytest,ypreds))
df_res = pd.DataFrame({'Accuracy':[acc],
'Precision': [precision],
'Recall': [recall],
'F1-score': [f1],
'AUC': [auc]},index=[model_name])
display(df_res.style.format("{:.4f}"))
if not os.path.isdir('../outputs'):
os.makedirs('../outputs')
o = '.' if ENV_COLAB else '../outputs/'
df_res.to_csv(o+f'model_{model_name}.csv',index=True)
skpmetrics.plot_precision_recall(ytest,yprobs2d) # more focus on minority
skpmetrics.plot_roc_curve(ytest,yprobs2d) # equal focus on both groups
skpmetrics.plot_confusion_matrix(ytest,ypreds)
model_eval_bin('pycaret_'+model_name,ytest,ypreds,yprobs2d,show_plots=True)
precision recall f1-score support 0 0.83 0.88 0.86 1035 1 0.61 0.51 0.56 374 accuracy 0.78 1409 macro avg 0.72 0.70 0.71 1409 weighted avg 0.77 0.78 0.78 1409 [[910 125] [182 192]]
Accuracy | Precision | Recall | F1-score | AUC | |
---|---|---|---|---|---|
pycaret_lda | 0.7821 | 0.6057 | 0.5134 | 0.5557 | 0.6963 |
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 10 min 2 secs