References
import time
time_start_notebook = time.time()
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
# usual imports
!pip install watermark
!pip install scikit-plot
# HPO for pycaret
!pip install tune-sklearn
!pip install optuna # hyperopt is already in colab
# gpu version of lightgbm for pycaret
!pip uninstall lightgbm -y
!pip install lightgbm --install-option=--gpu --install-option="--opencl-include-dir=/usr/local/cuda/include/" --install-option="--opencl-library=/usr/local/cuda/lib64/libOpenCL.so"
# regular pycaret without gpu
!pip install pycaret-nightly[full]
# ipywidget
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension
from pycaret.utils import enable_colab
enable_colab()
print('Environment: Google Colab')
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
sns.set()
import joblib
from tqdm import tqdm_notebook as tqdm
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
# special
import pycaret
%load_ext watermark
%watermark -iv
joblib : 1.0.0 pandas : 1.1.5 numpy : 1.19.4 seaborn : 0.11.0 pycaret : 2.2.2 autopep8 : 1.5.4 sys : 3.8.5 (default, Sep 4 2020, 02:22:02) [Clang 10.0.0 ] json : 2.0.9 matplotlib: 3.2.2
def show_methods(obj, ncols=4,contains=None):
lst = [i for i in dir(obj) if i[0]!='_' ]
if contains is not None:
lst = [i for i in lst if contains in i]
df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
return df
def compare_new_models(name,desc,mean_row,ofile,
df_eval=None,sort='Recall',show=True):
"""Create dataframe from output of pycaret new model.
Parameters
-----------
name: str
Name of the model. eg. xgboost
desc: str
Description of the model. e.g tuned,calibrated
mean_arr: np.ndarray
The mean row.
e.g.
df_res = pyc.pull()
mean_row = df_res.loc['Mean']
ofile: str
Output file name. e.g. 'pycaret_df_eval_lr.csv'
df_eval: Pandas Dataframe
Template pandas dataframe
sort: str
One of following string: Accuracy, AUC, Recall, Precision, F1, Kappa
Returns:
Pandas Dataframe.
"""
if not isinstance(df_eval, pd.DataFrame):
df_eval = pd.DataFrame({'Model': [],
'Description':[],
'Accuracy':[],
'AUC':[],
'Recall':[],
'Precision':[],
'F1':[],
'Kappa':[],
'MCC': [],
'LogLoss': []
})
acc,auc,rec,pre,f1,kap,mcc,logloss = mean_row
row = [name,desc,acc,auc,rec,pre,f1,kap,mcc,logloss]
df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates()\
.sort_values(sort,ascending=False)
df_eval.index = range(len(df_eval))
df_style = (df_eval.style.apply(lambda ser:
['background: tomato'
if ser.name == sort else ''
for _ in ser]))
if show:
display(df_style)
# save the data
df_eval.to_csv(ofile,index=False)
return df_eval
path_data_train = '../data/processed/train_cleaned.csv'
path_data_test = '../data/processed/test_cleaned.csv'
if ENV_COLAB:
path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/processed/train_cleaned.csv'
path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/processed/test_cleaned.csv'
df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)
print(df_train.shape)
print(df_test.shape)
df_train.head(2).append(df_train.tail(2))
(5634, 39) (1409, 39)
Gender | SeniorCitizen | Partner | Dependents | Tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | Contract_Month-to-month | NoSeniorCitizen_Contract_Month-to-month | PaymentMethod0_Contract_Month-to-month0 | InternetService_Fiber optic | StreamingTV_NoInternetService | No_OB_DP_TS | TotalServices | SenCit_Dependents | Partner_Dependents | SenCit_Partner | SenCit_Contract | SenCit_TechSupport | SenCit_PayMeth | Contract_mean_totCharges | Contract_totCharges_diff | PayMeth_mean_monthCharges | PayMeth_monthCharges_diff | Tenure_cat | MonthlyCharges_cat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 0 | 36 | 1 | 2 | 1 | 2 | 2 | 2 | 2 | 0 | 2 | 2 | 1 | 1 | 106.05 | 3834.40 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 2 | 0 | 1 | 1 | 2 | 2 | 1 | 3683.643192 | 150.756808 | 66.703657 | 39.346343 | 3 | 0 |
1 | 1 | 0 | 0 | 0 | 10 | 1 | 0 | 0 | 2 | 0 | 0 | 2 | 2 | 0 | 0 | 0 | 1 | 62.25 | 612.95 | 0 | 0 | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 2 | 1 | 1370.923131 | -757.973131 | 66.703657 | -4.453657 | 0 | 0 |
5632 | 0 | 0 | 1 | 1 | 68 | 1 | 2 | 1 | 0 | 2 | 0 | 2 | 2 | 2 | 2 | 1 | 1 | 103.75 | 7039.45 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 2 | 1 | 2 | 1 | 2 | 2 | 1 | 3683.643192 | 3355.806808 | 66.703657 | 37.046343 | 5 | 0 |
5633 | 1 | 0 | 0 | 0 | 69 | 1 | 2 | 2 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 0 | 1 | 23.95 | 1713.10 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 7 | 0 | 0 | 0 | 2 | 1 | 1 | 3683.643192 | -1970.543192 | 66.703657 | -42.753657 | 5 | 0 |
path_data_test_raw = ('https://raw.githubusercontent.com/'
'bhishanpdl/Datasets/master/Projects/'
'Telco_Customer_Churn/raw/test.csv')
df_test_raw1 = pd.read_csv(path_data_test_raw,usecols=['customerID'])
df_test_raw1.head(2)
customerID | |
---|---|
0 | 1794-HBQTJ |
1 | 0356-OBMAC |
ser_test_ids = df_test_raw1['customerID']
target_name = 'Churn'
# check for nans
df_train.isna().sum().sum()
0
import pycaret
import pycaret.classification as pyc
pyc.setup(
data: pandas.core.frame.DataFrame,
target: str,
train_size = 0.7,
test_data = None,
preprocess = True,
imputation_type = 'simple',
iterative_imputation_iters = 5,
categorical_features = None,
categorical_imputation = 'constant',
categorical_iterative_imputer = 'lightgbm',
ordinal_features = None,
high_cardinality_features = None,
high_cardinality_method = 'frequency',
numeric_features = None,
numeric_imputation = 'mean',
numeric_iterative_imputer = 'lightgbm',
date_features = None,
ignore_features = None,
normalize = False,
normalize_method = 'zscore',
transformation = False,
transformation_method = 'yeo-johnson',
handle_unknown_categorical = True,
unknown_categorical_method = 'least_frequent',
pca = False,
pca_method = 'linear',
pca_components = None,
ignore_low_variance = False,
combine_rare_levels = False,
rare_level_threshold = 0.1,
bin_numeric_features = None,
remove_outliers = False,
outliers_threshold = 0.05,
remove_multicollinearity = False,
multicollinearity_threshold = 0.9,
remove_perfect_collinearity = True,
create_clusters = False,
cluster_iter = 20,
polynomial_features = False,
polynomial_degree = 2,
trigonometry_features = False,
polynomial_threshold = 0.1,
group_features = None,
group_names = None,
feature_selection = False,
feature_selection_threshold = 0.8,
feature_selection_method = 'classic',
feature_interaction = False,
feature_ratio = False,
interaction_threshold = 0.01,
fix_imbalance = False,
fix_imbalance_method = None,
data_split_shuffle = True,
data_split_stratify = False,
fold_strategy = 'stratifiedkfold',
fold = 10,
fold_shuffle = False,
fold_groups = None,
n_jobs = -1,
use_gpu = False,
custom_pipeline = None,
html = True,
session_id = None,
log_experiment = False,
experiment_name = None,
log_plots = False,
log_profile = False,
log_data = False,
silent = False,
verbose = True,
profile = False,
profile_kwargs = None,
)
USE_GPU = False
if ENV_COLAB:
USE_GPU = True
# %%capture
exp = pyc.setup(df_train,target_name,
train_size=0.8,
session_id=SEED,
use_gpu=USE_GPU,
preprocess = True,
categorical_features = None,
ordinal_features = None,
high_cardinality_features = None,
numeric_features = None,
date_features = None,
ignore_features = None,
normalize = False,
data_split_stratify = True,
silent=True,
profile=False
)
# use silent = True to check inferred datatypes
# then assign numeric and categorical features yourself.
#
# if sampling = False, 100% of data is used and its too slow
# if sampling = True, we need to enter number eg. 0.3 ourself.
"""
Here, we have data < 25k rows, so I have chosen not to use sampling.
""";
Description | Value | |
---|---|---|
0 | session_id | 100 |
1 | Target | Churn |
2 | Target Type | Binary |
3 | Label Encoded | 0: 0, 1: 1 |
4 | Original Data | (5634, 39) |
5 | Missing Values | False |
6 | Numeric Features | 6 |
7 | Categorical Features | 32 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (4507, 60) |
12 | Transformed Test Set | (1127, 60) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | True |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | False |
20 | Experiment Name | clf-default-name |
21 | USI | 5b54 |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | mean |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | constant |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | False |
30 | Normalize Method | None |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Clustering | False |
45 | Clustering Iteration | None |
46 | Polynomial Features | False |
47 | Polynomial Degree | None |
48 | Trignometry Features | False |
49 | Polynomial Threshold | None |
50 | Group Features | False |
51 | Feature Selection | False |
52 | Features Selection Threshold | None |
53 | Feature Interaction | False |
54 | Feature Ratio | False |
55 | Interaction Threshold | None |
56 | Fix Imbalance | False |
57 | Fix Imbalance Method | SMOTE |
pyc.models(internal=True)[['Name', 'GPU Enabled']]
# google colab does not support cuml and thereby sklearn models
# we need to run blazingsql notebooks (not colab) to use cuml models.
Name | GPU Enabled | |
---|---|---|
ID | ||
lr | Logistic Regression | False |
knn | K Neighbors Classifier | False |
nb | Naive Bayes | False |
dt | Decision Tree Classifier | False |
svm | SVM - Linear Kernel | False |
rbfsvm | SVM - Radial Kernel | False |
gpc | Gaussian Process Classifier | False |
mlp | MLP Classifier | False |
ridge | Ridge Classifier | False |
rf | Random Forest Classifier | False |
qda | Quadratic Discriminant Analysis | False |
ada | Ada Boost Classifier | False |
gbc | Gradient Boosting Classifier | False |
lda | Linear Discriminant Analysis | False |
et | Extra Trees Classifier | False |
xgboost | Extreme Gradient Boosting | False |
lightgbm | Light Gradient Boosting Machine | False |
catboost | CatBoost Classifier | False |
Bagging | Bagging Classifier | False |
Stacking | Stacking Classifier | False |
Voting | Voting Classifier | False |
CalibratedCV | Calibrated Classifier CV | False |
Comparing All Models
accuracy
pyc.compare_models(
include = None,
exclude = None,
fold = None,
round = 4,
cross_validation = True,
sort = 'Accuracy',
n_select = 1,
budget_time = None,
turbo = True,
errors = 'ignore',
fit_kwargs = None,
groups = None,
verbose = True,
)
# pyc.compare_models?
pyc.get_metrics().index
Index(['acc', 'auc', 'recall', 'precision', 'f1', 'kappa', 'mcc'], dtype='object', name='ID')
# add Log Loss metric in pycaret
from sklearn.metrics import log_loss
pyc.add_metric('logloss', 'LogLoss', log_loss, greater_is_better=False)
Name LogLoss Display Name LogLoss Score Function <function log_loss at 0x7ffce25d8b80> Scorer make_scorer(log_loss, greater_is_better=False) Target pred Args {} Greater is Better False Multiclass True Custom True Name: logloss, dtype: object
Estimator Abbreviated String Original Implementation
--------- ------------------ -------------------------------
Logistic Regression 'lr' linear_model.LogisticRegression
K Nearest Neighbour 'knn' neighbors.KNeighborsClassifier
Naives Bayes 'nb' naive_bayes.GaussianNB
Decision Tree 'dt' tree.DecisionTreeClassifier
SVM (Linear) 'svm' linear_model.SGDClassifier
SVM (RBF) 'rbfsvm' svm.SVC
Gaussian Process 'gpc' gaussian_process.GPC
Multi Level Perceptron 'mlp' neural_network.MLPClassifier
Ridge Classifier 'ridge' linear_model.RidgeClassifier
Random Forest 'rf' ensemble.RandomForestClassifier
Quadratic Disc. Analysis 'qda' discriminant_analysis.QDA
AdaBoost 'ada' ensemble.AdaBoostClassifier
Gradient Boosting 'gbc' ensemble.GradientBoostingClassifier
Linear Disc. Analysis 'lda' discriminant_analysis.LDA
Extra Trees Classifier 'et' ensemble.ExtraTreesClassifier
Extreme Gradient Boosting 'xgboost' xgboost.readthedocs.io
Light Gradient Boosting 'lightgbm' github.com/microsoft/LightGBM
CatBoost Classifier 'catboost' https://catboost.ai
pyc.create_model(
estimator,
fold = None,
round = 4,
cross_validation = True,
fit_kwargs = None,
groups = None,
verbose = True,
**kwargs,
)
model_name = 'xgboost'
path_df_eval = 'pycaret_df_eval_xgb.csv'
model = pyc.create_model(model_name,verbose=False)
mean_row = pyc.pull().loc['Mean'].values
df_eval = compare_new_models(model_name,'default',
mean_row,path_df_eval,sort='Recall',df_eval=None)
Model | Description | Accuracy | AUC | Recall | Precision | F1 | Kappa | MCC | LogLoss | |
---|---|---|---|---|---|---|---|---|---|---|
0 | xgboost | default | 0.788600 | 0.826500 | 0.518500 | 0.621700 | 0.565100 | 0.427000 | 0.430300 | 7.303000 |
pyc.tune_model(
estimator,
fold = None,
round = 4,
n_iter = 10,
custom_grid = None,
optimize = 'Accuracy',
custom_scorer = None,
search_library = 'scikit-learn', # 'scikit-optimize', 'tune-sklearn','optuna'
search_algorithm = None, # 'scikit-learn', 'scikit-optimize', 'tune-sklearn', 'optuna'
early_stopping = False, # 'asha','hyperband','median'
early_stopping_max_iters = 10,
choose_better = False,
fit_kwargs = None,
groups = None,
return_tuner = False,
verbose = True,
tuner_verbose = True,
**kwargs,
)
# %%capture
# model_tuned1 = pyc.tune_model(model,n_iter=100,search_library='tune-sklearn',
# fold=5,optimize='Recall',verbose=False)
# mean_row = pyc.pull().loc['Mean']
# desc = 'tuned,tune-sklearn,n_iter=100'
# df_eval = compare_new_models(model_name,desc,mean_row,path_df_eval,
# sort='Recall',df_eval=df_eval)
# %%capture
# model_tuned2 = pyc.tune_model(model,n_iter=500,search_library='optuna',
# fold=5,optimize='Recall',verbose=False)
# mean_row = pyc.pull().loc['Mean']
# desc = 'tuned,optuna,n_iter=500'
# df_eval = compare_new_models(model_name,desc,mean_row,path_df_eval,
# sort='Recall',df_eval=df_eval)
# This is BAD, it gave me Recall = 1, too much overfitting
# everything is classified as not-churn
# USE AUC instead of Recall
# %%capture
# model_tuned_AUC = pyc.tune_model(model,n_iter=500,search_library='optuna',
# fold=5,optimize='AUC',verbose=False)
# mean_row = pyc.pull().loc['Mean']
# desc = 'tuned,optuna,n_iter=500,optimize=AUC'
# df_eval = compare_new_models(model_name,desc,mean_row,path_df_eval,sort='Recall',df_eval=df_eval)
# THIS GAVE ME WORSE RESULT THAN F1
# %%capture
# # This takes 1 hr on local computer, comment after HPO
# model_tuned_F1 = pyc.tune_model(model,n_iter=500,search_library='optuna',
# fold=5,optimize='F1',verbose=False)
# mean_row = pyc.pull().loc['Mean']
# desc = 'tuned,optuna,n_iter=500,optimize=F1'
# df_eval = compare_new_models(model_name,desc,mean_row,path_df_eval,
# sort='Recall',df_eval=df_eval)
# model_best = model_tuned_F1
odir = '.' if ENV_COLAB else '../models/'
path_model_best = odir+f'pycaret_model_best_{model_name}.joblib'
# joblib.dump(model_best, path_model_best) # comment this
model_best = joblib.load(path_model_best)
model_best
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.5404689403971603, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.016761554573600634, max_delta_step=0, max_depth=4, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=62, n_jobs=-1, num_parallel_tree=1, objective='binary:logistic', random_state=100, reg_alpha=6.0679925689388595e-09, reg_lambda=0.0012602601741511542, scale_pos_weight=2.0040690074466885, subsample=0.48109449185878156, tree_method='auto', use_label_encoder=True, validate_parameters=1, verbosity=0)
# model calibration is useless, we can do model evaluation.
Abbreviated String Name
------------------ -------
* 'auc' - Area Under the Curve
* 'threshold' - Discrimination Threshold
* 'pr' - Precision Recall Curve
* 'confusion_matrix' - Confusion Matrix
* 'error' - Class Prediction Error
* 'class_report' - Classification Report
* 'boundary' - Decision Boundary
* 'rfe' - Recursive Feature Selection
* 'learning' - Learning Curve
* 'manifold' - Manifold Learning
* 'calibration' - Calibration Curve
* 'vc' - Validation Curve
* 'dimension' - Dimension Learning
* 'feature' - Feature Importance
* 'feature_all' - Feature Importance (All)
* 'parameter' - Model Hyperparameter
* 'lift' - Lift Curve
* 'gain' - Gain Chart
* 'tree' - Decision Tree
pyc.plot_model(
estimator,
plot = 'auc',
scale = 1,
save = False,
fold = None,
fit_kwargs = None,
groups = None,
use_train_data = False, # if true, train data will be used for not test
verbose = True,
)
pyc.evaluate_model(
estimator,
fold = None,
fit_kwargs = None,
groups = None,
use_train_data = False,
)
# pyc.plot_model gives one by one plot, use
# pyc.evaluate_model to get all the plots.
# evaluate model (click on buttons)
pyc.evaluate_model(model_best)
pyc.interpret_model(
estimator,
plot = 'summary',
feature = None,
observation = None,
use_train_data = False,
**kwargs,
)
# interpret_model: SHAP
pyc.interpret_model(model_best)
# interpret model : Correlation
pyc.interpret_model(model_best,plot='correlation')
# interpret model : Reason
pyc.interpret_model(model_best,plot='reason',observation=12)
pyc.finalize_model(
estimator,
fit_kwargs = None,
group = None,
model_only = True,
)
Docstring:
This function trains a given estimator on the entire dataset including the
holdout set.
model_final = pyc.finalize_model(model_best)
print(model_final)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.5404689403971603, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.016761554573600634, max_delta_step=0, max_depth=4, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=62, n_jobs=-1, num_parallel_tree=1, objective='binary:logistic', random_state=100, reg_alpha=6.0679925689388595e-09, reg_lambda=0.0012602601741511542, scale_pos_weight=2.0040690074466885, subsample=0.48109449185878156, tree_method='auto', use_label_encoder=True, validate_parameters=1, verbosity=0)
df_test.iloc[-5:,-5:]
Contract_totCharges_diff | PayMeth_mean_monthCharges | PayMeth_monthCharges_diff | Tenure_cat | MonthlyCharges_cat | |
---|---|---|---|---|---|
1404 | -859.687095 | 63.914865 | 16.585135 | 0 | 0 |
1405 | -1152.769579 | 63.914865 | -19.764865 | 3 | 0 |
1406 | -218.037095 | 63.914865 | 6.085135 | 1 | 0 |
1407 | -77.487095 | 63.914865 | 30.735135 | 1 | 0 |
1408 | -367.987095 | 67.437658 | -2.787658 | 1 | 0 |
df_preds = pyc.predict_model(model_final,df_test)
df_preds.iloc[-5:,-5:]
PayMeth_monthCharges_diff | Tenure_cat | MonthlyCharges_cat | Label | Score | |
---|---|---|---|---|---|
1404 | 16.585135 | 0 | 0 | 1 | 0.6279 |
1405 | -19.764865 | 3 | 0 | 0 | 0.6988 |
1406 | 6.085135 | 1 | 0 | 1 | 0.6042 |
1407 | 30.735135 | 1 | 0 | 1 | 0.6533 |
1408 | -2.787658 | 1 | 0 | 0 | 0.6158 |
ytest = df_preds[target_name].to_numpy().ravel()
yprobs = df_preds['Score'].to_numpy().ravel()
ypreds = df_preds['Label'].to_numpy().ravel()
yprobs2d = np.c_[1-yprobs,yprobs]
pred_name = 'pycaret_xgboost'
path_pred = f'../predictions/{pred_name}.csv'
df_preds_out = pd.DataFrame({'customerID': ser_test_ids})
df_preds_out[f'ypreds_{pred_name}'] = ypreds
df_preds_out[f'yprobs_{pred_name}'] = yprobs2d[:,1]
df_preds_out.to_csv(path_pred,index=False)
df_preds_out.head()
customerID | ypreds_pycaret_xgboost | yprobs_pycaret_xgboost | |
---|---|---|---|
0 | 1794-HBQTJ | 1 | 0.5433 |
1 | 0356-OBMAC | 0 | 0.7239 |
2 | 4077-CROMM | 1 | 0.5501 |
3 | 5442-PPTJY | 0 | 0.8111 |
4 | 2333-KWEWW | 0 | 0.8104 |
def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
import sklearn.metrics as skmetrics
import scikitplot.metrics as skpmetrics
import os
acc = skmetrics.accuracy_score(ytest,ypreds)
precision = skmetrics.precision_score(ytest,ypreds)
recall = skmetrics.recall_score(ytest,ypreds)
f1 = skmetrics.f1_score(ytest,ypreds)
auc = skmetrics.roc_auc_score(ytest,ypreds)
print(skmetrics.classification_report(ytest,ypreds))
print(skmetrics.confusion_matrix(ytest,ypreds))
df_res = pd.DataFrame({'Accuracy':[acc],
'Precision': [precision],
'Recall': [recall],
'F1-score': [f1],
'AUC': [auc]},index=[model_name])
display(df_res.style.format("{:.4f}"))
if not os.path.isdir('../outputs'):
os.makedirs('../outputs')
o = '.' if ENV_COLAB else '../outputs/'
df_res.to_csv(o+f'model_{model_name}.csv',index=True)
if show_plots:
skpmetrics.plot_precision_recall(ytest,yprobs2d) # more focus on minority
skpmetrics.plot_roc_curve(ytest,yprobs2d) # equal focus on both groups
skpmetrics.plot_confusion_matrix(ytest,ypreds)
model_eval_bin('pycaret_xgboost',ytest,ypreds,yprobs2d,show_plots=True)
precision recall f1-score support 0 0.89 0.79 0.84 1035 1 0.56 0.74 0.64 374 accuracy 0.78 1409 macro avg 0.73 0.76 0.74 1409 weighted avg 0.80 0.78 0.78 1409 [[816 219] [ 97 277]]
Accuracy | Precision | Recall | F1-score | AUC | |
---|---|---|---|---|---|
pycaret_xgboost | 0.7757 | 0.5585 | 0.7406 | 0.6368 | 0.7645 |
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 0 min 33 secs