The dataset contains transactions made by credit cards in September 2013 by European cardholders.
This dataset presents transactions that occurred over two days, with 492 frauds out of 284,807 transactions.
The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions.
It contains only numerical input variables which are the result of a PCA transformation.
Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data.
Features V1, V2, ... V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'.
Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction amount; this feature can be used for example-dependent cost-sensitive learning.
Feature 'Class' is the response variable; it takes value 1 in case of fraud and 0 otherwise.
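Because the cost of a missed fraud scales with the transaction value, one simple way to use 'Amount' for cost-sensitive learning is to weight each training example by it. A minimal sketch, assuming the data has already been loaded into a DataFrame df (as done later in this notebook); the weighting scheme itself is an illustrative assumption, not part of the dataset description:
import numpy as np
# hypothetical example-dependent weights: a fraud costs its transaction amount,
# a legitimate transaction gets unit weight
sample_weight = np.where(df['Class'] == 1, df['Amount'].clip(lower=1.0), 1.0)
# most sklearn-style estimators accept this via fit(X, y, sample_weight=sample_weight)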
The term Boosting refers to a family of algorithms that convert weak learners into strong learners.
There are many boosting algorithms; the common regressor classes are listed below (a usage sketch follows the list):
sklearn.ensemble.GradientBoostingRegressor
xgboost.XGBRegressor # fast and accurate
lightgbm.LGBMRegressor # extremely fast, slightly lower accuracy than xgboost
catboost.CatBoostRegressor # well suited to categorical features
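All four libraries expose a scikit-learn-style fit/predict interface, so they are largely interchangeable behind a common loop. A minimal sketch on toy data (the toy data and the n_estimators/iterations values are illustrative assumptions):
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
X = np.random.rand(100, 5)
y = np.random.rand(100)
for Model in [GradientBoostingRegressor, XGBRegressor, LGBMRegressor]:
    reg = Model(n_estimators=50).fit(X, y)
    print(type(reg).__name__, reg.predict(X[:2]))
# CatBoost follows the same interface but logs training unless silenced
reg = CatBoostRegressor(iterations=50, verbose=False).fit(X, y)
print(type(reg).__name__, reg.predict(X[:2]))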
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    #!pip install hpsklearn
    !pip install shap eli5 lime scikit-plot watermark
    !pip install optuna hyperopt
    !pip install catboost
    !pip install ipywidgets
    !pip install -U scikit-learn
    !jupyter nbextension enable --py widgetsnbextension
    # create project-like folders
    !mkdir -p ../outputs ../images ../reports ../html ../models
    print('Environment: Google Colab')
import time
notebook_start_time = time.time()
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
# random state
SEED = 0
RNG = np.random.RandomState(SEED)
# visualization
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 8,8
plt.rcParams.update({'font.size': 16})
plt.style.use('ggplot')
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)
# serialization: six, pickle, joblib
import six
import pickle
import joblib
# mixed
import copy
import pprint
pp = pprint.PrettyPrinter(indent=4)
# sklearn
import sklearn
# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# scale and split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
# sklearn scalar metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
# roc auc and curves
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
# confusion matrix and classification report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# boosting
import xgboost, lightgbm, catboost
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBClassifier, DMatrix
from lightgbm import LGBMClassifier, Dataset
from catboost import CatBoostClassifier, Pool, CatBoost
# parameters tuning
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK, STATUS_FAIL
from hyperopt.pyll import scope
from hyperopt.pyll.stochastic import sample
# model intepretation modules
import eli5
import shap
import yellowbrick
import lime
import scikitplot
# version
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark
Bhishan Poudel 2021-08-08
CPython 3.7.7
IPython 7.22.0
compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit
joblib 1.0.1
sklearn 0.23.1
seaborn 0.11.0
shap 0.39.0
xgboost 1.2.0
catboost 0.23.2
eli5 0.10.1
autopep8 1.5.2
six 1.15.0
lightgbm 2.3.1
yellowbrick 1.1
json 2.0.9
scikitplot 0.3.7
pandas 1.3.0
numpy 1.19.5
# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan")
from bhishan import bp
def model_evaluation(model_name, desc, ser_ytrain, trprobs1d,
                     df_eval=None, threshold=0.5,
                     show=True, col_sort='Recall'):
    if df_eval is None:
        df_eval = pd.DataFrame({'Model': [],
                                'Description': [],
                                'Accuracy': [],
                                'Precision': [],
                                'Recall': [],
                                'F1': [],
                                'AUC': [],
                                'AUCPR': [],
                                })
    ytr = np.array(ser_ytrain).flatten()
    # hard predictions at the given threshold
    trpreds = (trprobs1d > threshold).astype(np.int8)
    prec, rec, thr = sklearn.metrics.precision_recall_curve(ytr, trprobs1d)
    auc_pr = sklearn.metrics.auc(rec, prec)
    # model evaluation
    average = 'binary'
    row_eval = [model_name, desc,
                sklearn.metrics.accuracy_score(ytr, trpreds),
                sklearn.metrics.precision_score(ytr, trpreds, average=average),
                sklearn.metrics.recall_score(ytr, trpreds, average=average),
                sklearn.metrics.f1_score(ytr, trpreds, average=average),
                sklearn.metrics.roc_auc_score(ytr, trprobs1d),  # AUC needs probabilities
                auc_pr
                ]
    df_eval.loc[len(df_eval)] = row_eval
    df_eval = df_eval.drop_duplicates()
    df_eval = df_eval.sort_values(col_sort, ascending=False)
    if show:
        display(df_eval.style.background_gradient(subset=[col_sort]))
    return df_eval
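A quick sanity check of the helper on toy labels and probabilities (hypothetical values, not the notebook's data):
y_toy = np.array([0, 0, 1, 1])
p_toy = np.array([0.10, 0.40, 0.35, 0.80])
# at the default threshold 0.5 this yields accuracy 0.75, precision 1.0, recall 0.5
model_evaluation('toy', 'sanity check', y_toy, p_toy, show=False)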
df_eval = None
ifile = 'https://github.com/bhishanpdl/Datasets/blob/master/Projects/Fraud_detection/raw/creditcard.csv.zip?raw=true'
df = pd.read_csv(ifile,compression='zip')
print(df.shape)
df.head()
(284807, 31)
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
target = 'Class'
features = df.columns.drop(target)
df[target].value_counts(normalize=True)*100
0    99.827251
1     0.172749
Name: Class, dtype: float64
sns.countplot(x=df[target])
from sklearn.model_selection import train_test_split
df_Xtrain_orig, df_Xtest, ser_ytrain_orig, ser_ytest = train_test_split(
df.drop(target,axis=1),
df[target],
test_size=0.2,
random_state=SEED,
stratify=df[target])
ytrain_orig = ser_ytrain_orig.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()
print(df_Xtrain_orig.shape)
df_Xtrain_orig.head()
(227845, 30)
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
36001 | 38355.0 | 1.043949 | 0.318555 | 1.045810 | 2.805989 | -0.561113 | -0.367956 | 0.032736 | -0.042333 | -0.322674 | ... | -0.084556 | -0.240105 | -0.680315 | 0.085328 | 0.684812 | 0.318620 | -0.204963 | 0.001662 | 0.037894 | 49.67 |
12844 | 22555.0 | -1.665159 | 0.808440 | 1.805627 | 1.903416 | -0.821627 | 0.934790 | -0.824802 | 0.975890 | 1.747469 | ... | -0.373759 | -0.335332 | -0.510994 | 0.035839 | 0.147565 | -0.529358 | -0.566950 | -0.595998 | -0.220086 | 16.94 |
2873 | 2431.0 | -0.324096 | 0.601836 | 0.865329 | -2.138000 | 0.294663 | -1.251553 | 1.072114 | -0.334896 | 1.071268 | ... | -0.039868 | 0.012220 | 0.352856 | -0.341505 | -0.145791 | 0.094194 | -0.804026 | 0.229428 | -0.021623 | 1.00 |
145263 | 86773.0 | -0.258270 | 1.217501 | -0.585348 | -0.875347 | 1.222481 | -0.311027 | 1.073860 | -0.161408 | 0.200665 | ... | 0.382305 | -0.424626 | -0.781158 | 0.019316 | 0.178614 | -0.315616 | 0.096665 | 0.269740 | -0.020635 | 10.78 |
186658 | 127202.0 | 2.142162 | -0.494988 | -1.936511 | -0.818288 | -0.025213 | -1.027245 | -0.151627 | -0.305750 | -0.869482 | ... | 0.106592 | 0.010115 | 0.021722 | 0.079463 | -0.480899 | 0.023846 | -0.279076 | -0.030121 | -0.043888 | 39.96 |
5 rows × 30 columns
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
df_Xtrain_orig,
ser_ytrain_orig,
test_size=0.2,
random_state=SEED,
stratify=ser_ytrain_orig)
ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()
print(df_Xtrain.shape)
(182276, 30)
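Since both splits are stratified, each subset should retain the overall fraud rate of about 0.172%. A quick illustrative check (printed values should be approximately equal):
for name, y in [('train', ytrain), ('valid', yvalid), ('test', ytest)]:
    print(name, round(y.mean() * 100, 4), '% fraud')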
https://catboost.ai/docs/concepts/python-reference_catboostregressor.html
class CatBoostRegressor(
iterations=None, learning_rate=None,
depth=None, l2_leaf_reg=None,
model_size_reg=None, rsm=None,
loss_function='RMSE', border_count=None,
feature_border_type=None, per_float_feature_quantization=None,
input_borders=None, output_borders=None,
fold_permutation_block=None, od_pval=None,
od_wait=None, od_type=None,
nan_mode=None, counter_calc_method=None,
leaf_estimation_iterations=None, leaf_estimation_method=None,
thread_count=None, random_seed=None,
use_best_model=None, best_model_min_trees=None,
verbose=None, silent=None,
logging_level=None, metric_period=None,
ctr_leaf_count_limit=None, store_all_simple_ctr=None,
max_ctr_complexity=None, has_time=None,
allow_const_label=None, one_hot_max_size=None,
random_strength=None,name=None, ignored_features=None,
train_dir=None, custom_metric=None,
eval_metric=None, bagging_temperature=None,
save_snapshot=None, snapshot_file=None,
snapshot_interval=None, fold_len_multiplier=None,
used_ram_limit=None, gpu_ram_part=None,
pinned_memory_size=None, allow_writing_files=None,
final_ctr_computation_mode=None, approx_on_full_history=None,
boosting_type=None, simple_ctr=None,
combinations_ctr=None, per_feature_ctr=None,
ctr_target_border_count=None, task_type=None,
device_config=None, devices=None,
bootstrap_type=None, subsample=None,
sampling_unit=None, dev_score_calc_obj_block_size=None,
max_depth=None, n_estimators=None,
num_boost_round=None, num_trees=None,
colsample_bylevel=None, random_state=None,
reg_lambda=None, objective=None,
eta=None, max_bin=None,
gpu_cat_features_storage=None, data_partition=None,
metadata=None, early_stopping_rounds=None,
cat_features=None, grow_policy=None,
min_data_in_leaf=None, min_child_samples=None,
max_leaves=None, num_leaves=None,
score_function=None, leaf_estimation_backtracking=None,
ctr_history_unit=None, monotone_constraints=None
)
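Almost all of these arguments default to None and are resolved internally; in practice only a few are commonly set. A minimal sketch with illustrative values (X_train, y_train, X_valid, y_valid are placeholders, not variables from this notebook):
from catboost import CatBoostRegressor
reg = CatBoostRegressor(iterations=500, learning_rate=0.05, depth=6,
                        loss_function='RMSE', random_seed=0, verbose=100)
# reg.fit(X_train, y_train, eval_set=(X_valid, y_valid))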
import catboost
bp.show_methods(catboost,2)
0 | 1 | |
---|---|---|
0 | CatBoost | Pool |
1 | CatBoostClassifier | core |
2 | CatBoostError | cv |
3 | CatBoostRegressor | sum_models |
4 | CatboostError | to_classifier |
5 | EFstrType | to_regressor |
6 | FeaturesData | train |
7 | MetricVisualizer | version |
8 | MultiRegressionCustomMetric | widget |
9 | MultiRegressionCustomObjective |
from catboost import CatBoostClassifier, Pool
bp.show_methods(CatBoostClassifier,2)
0 | 1 | |
---|---|---|
0 | best_iteration_ | get_test_evals |
1 | best_score_ | get_text_feature_indices |
2 | calc_feature_statistics | get_tree_leaf_counts |
3 | calc_leaf_indexes | grid_search |
4 | classes_ | is_fitted |
5 | compare | iterate_leaf_indexes |
6 | copy | learning_rate_ |
7 | create_metric_calcer | load_model |
8 | drop_unused_features | plot_partial_dependence |
9 | eval_metrics | plot_predictions |
10 | evals_result_ | plot_tree |
11 | feature_importances_ | predict |
12 | feature_names_ | predict_log_proba |
13 | fit | predict_proba |
14 | get_all_params | random_seed_ |
15 | get_best_iteration | randomized_search |
16 | get_best_score | save_borders |
17 | get_borders | save_model |
18 | get_cat_feature_indices | score |
19 | get_evals_result | set_feature_names |
20 | get_feature_importance | set_leaf_values |
21 | get_leaf_values | set_params |
22 | get_leaf_weights | set_scale_and_bias |
23 | get_metadata | shrink |
24 | get_object_importance | staged_predict |
25 | get_param | staged_predict_log_proba |
26 | get_params | staged_predict_proba |
27 | get_scale_and_bias | tree_count_ |
28 | get_test_eval |
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score
from sklearn.metrics import confusion_matrix
from sklearn import model_selection
# time
time_start = time.time()
# current parameters
desc = 'default,random_state=100, cross_validation_ypreds'
Xtr = df_Xtrain.to_numpy()
ytr = ser_ytrain.to_numpy().ravel()
Xtx = df_Xtest.to_numpy()
ytx = ser_ytest.to_numpy().ravel()
# fit the model
model = CatBoostClassifier(verbose=100,random_state=SEED)
model.fit(Xtr, ytr)
# save/load the model if desired
# joblib.dump(model, 'model_cat.pkl')
# model = joblib.load('model_cat.pkl')
# predictions
skf = model_selection.StratifiedKFold(n_splits=2,shuffle=True,random_state=SEED)
trprobs_cv = model_selection.cross_val_predict(model, df_Xtrain, ser_ytrain,
                                               cv=skf, method='predict_proba')
trprobs1d = trprobs_cv[:,1] # 2nd column: probability of the positive class
# model evaluation
df_eval = model_evaluation('catboost', desc, ytr,trprobs1d,df_eval=df_eval)
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
display(df_eval)
Learning rate set to 0.095119
0:    learn: 0.3780861    total: 124ms    remaining: 2m 4s
100:  learn: 0.0016198    total: 5.19s    remaining: 46.2s
200:  learn: 0.0010673    total: 10.3s    remaining: 40.9s
300:  learn: 0.0008012    total: 15.5s    remaining: 35.9s
400:  learn: 0.0005683    total: 20.5s    remaining: 30.6s
500:  learn: 0.0003989    total: 25.6s    remaining: 25.5s
600:  learn: 0.0002876    total: 30.8s    remaining: 20.4s
700:  learn: 0.0002030    total: 35.8s    remaining: 15.3s
800:  learn: 0.0001605    total: 40.8s    remaining: 10.1s
900:  learn: 0.0001330    total: 46.1s    remaining: 5.06s
999:  learn: 0.0001102    total: 51s      remaining: 0us
Learning rate set to 0.07075
0:    learn: 0.4576720    total: 63.9ms   remaining: 1m 3s
100:  learn: 0.0014048    total: 3.19s    remaining: 28.4s
200:  learn: 0.0007900    total: 6.25s    remaining: 24.9s
300:  learn: 0.0004575    total: 9.32s    remaining: 21.6s
400:  learn: 0.0002501    total: 12.4s    remaining: 18.5s
500:  learn: 0.0001718    total: 15.5s    remaining: 15.4s
600:  learn: 0.0001308    total: 18.5s    remaining: 12.3s
700:  learn: 0.0001045    total: 21.7s    remaining: 9.26s
800:  learn: 0.0000867    total: 24.8s    remaining: 6.15s
900:  learn: 0.0000768    total: 27.8s    remaining: 3.05s
999:  learn: 0.0000661    total: 30.8s    remaining: 0us
Learning rate set to 0.07075
0:    learn: 0.4631036    total: 33.1ms   remaining: 33s
100:  learn: 0.0018651    total: 3.08s    remaining: 27.4s
200:  learn: 0.0011770    total: 6.14s    remaining: 24.4s
300:  learn: 0.0008368    total: 9.23s    remaining: 21.4s
400:  learn: 0.0005437    total: 13s      remaining: 19.4s
500:  learn: 0.0003737    total: 16s      remaining: 16s
600:  learn: 0.0002749    total: 19.1s    remaining: 12.7s
700:  learn: 0.0002123    total: 22.5s    remaining: 9.6s
800:  learn: 0.0001767    total: 25.6s    remaining: 6.35s
900:  learn: 0.0001466    total: 28.6s    remaining: 3.14s
999:  learn: 0.0001254    total: 31.6s    remaining: 0us
Model | Description | Accuracy | Precision | Recall | F1 | AUC | AUCPR | |
---|---|---|---|---|---|---|---|---|
0 | catboost | default,random_state=100, cross_validation_ypreds | 0.999534 | 0.945736 | 0.774603 | 0.851658 | 0.965710 | 0.827831 |
Time taken: 1 min 55 secs
Model | Description | Accuracy | Precision | Recall | F1 | AUC | AUCPR | |
---|---|---|---|---|---|---|---|---|
0 | catboost | default,random_state=100, cross_validation_ypreds | 0.999534 | 0.945736 | 0.774603 | 0.851658 | 0.96571 | 0.827831 |
%%time
model = CatBoostClassifier(verbose=100,random_state=SEED)
model.fit(Xtr, ytr)
joblib.dump(model, '../models/model_cat_default_seed100.joblib')
ypreds = model.predict(Xtx)
cm = sklearn.metrics.confusion_matrix(ytx,ypreds)
print('confusion matrix\n',cm)
Learning rate set to 0.095119
0:    learn: 0.3780861    total: 62.7ms   remaining: 1m 2s
100:  learn: 0.0016198    total: 5.1s     remaining: 45.4s
200:  learn: 0.0010673    total: 10.1s    remaining: 40.2s
300:  learn: 0.0008012    total: 15.3s    remaining: 35.5s
400:  learn: 0.0005683    total: 21.8s    remaining: 32.6s
500:  learn: 0.0003989    total: 30.9s    remaining: 30.8s
600:  learn: 0.0002876    total: 35.9s    remaining: 23.8s
700:  learn: 0.0002030    total: 40.8s    remaining: 17.4s
800:  learn: 0.0001605    total: 45.7s    remaining: 11.4s
900:  learn: 0.0001330    total: 51s      remaining: 5.6s
999:  learn: 0.0001102    total: 58s      remaining: 0us
confusion matrix
[[56859     5]
 [   24    74]]
CPU times: user 1min 56s, sys: 16 s, total: 2min 12s
Wall time: 58.7 s
yprobs = model.predict_proba(Xtx)
print(yprobs[:5])
[[9.99998192e-01 1.80796544e-06]
 [9.99932577e-01 6.74226697e-05]
 [9.99998670e-01 1.32969613e-06]
 [9.99996711e-01 3.28938804e-06]
 [9.99994773e-01 5.22686104e-06]]
from scikitplot import metrics as skpmetrics
skpmetrics.plot_confusion_matrix(ytx, ypreds)
fig, ax = plt.subplots(figsize=(12,8))
skpmetrics.plot_roc(ytx,yprobs,ax=ax)
import eli5
# eli5.explain_weights_catboost(model) # same thing
eli5.show_weights(model)
Weight | Feature |
---|---|
0.0787 | 1 |
0.0637 | 8 |
0.0607 | 14 |
0.0535 | 4 |
0.0519 | 26 |
0.0456 | 29 |
0.0428 | 0 |
0.0391 | 12 |
0.0390 | 22 |
0.0384 | 17 |
0.0382 | 3 |
0.0336 | 13 |
0.0320 | 20 |
0.0284 | 15 |
0.0283 | 25 |
0.0283 | 18 |
0.0276 | 2 |
0.0271 | 10 |
0.0258 | 24 |
0.0245 | 5 |
… 10 more … |
df_Xtrain.head(2)
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
138257 | 82565.0 | 1.118591 | 0.562709 | 0.569628 | 2.987496 | -0.365594 | -0.531789 | -0.044144 | 0.011932 | -0.129131 | ... | -0.204184 | -0.128269 | -0.218875 | -0.048816 | 0.617265 | 0.551384 | 0.06022 | 0.016136 | 0.047100 | 7.6 |
60033 | 49125.0 | 1.170686 | 0.083759 | 0.466278 | 0.913911 | -0.093123 | 0.427588 | -0.372727 | 0.312777 | 0.129610 | ... | -0.226078 | -0.176121 | -0.584726 | 0.066051 | -0.746667 | 0.232641 | -0.54774 | 0.038060 | 0.010995 | 3.9 |
2 rows × 30 columns
# # time
# time_start = time.time()
# # current parameters
# Xtr = df_Xtrain
# ytr = ser_ytrain.to_numpy().ravel()
# Xtx = df_Xtest
# ytx = ser_ytest.to_numpy().ravel()
# Xvd = df_Xvalid
# yvd = ser_yvalid.to_numpy().ravel()
# # fit the model
# model = CatBoostClassifier(random_state=0,verbose=100)
# model.fit(Xtr, ytr,
# eval_set=(Xvd, yvd))
# # ypreds
# ypreds = model.predict(Xtx)
# # r-squared values
# auc = roc_auc_score(ytx, ypreds)
# # time
# time_taken = time.time() - time_start
# print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
# print('ROC AUC Score ', auc)
CatBoost tutorials: model analysis, feature statistics tutorial
df_Xtrain.head(2)
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | V13 | V14 | V15 | V16 | V17 | V18 | V19 | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
138257 | 82565.0 | 1.118591 | 0.562709 | 0.569628 | 2.987496 | -0.365594 | -0.531789 | -0.044144 | 0.011932 | -0.129131 | 0.084908 | -0.540334 | -0.405535 | -1.124493 | -1.118716 | -0.708344 | 0.403485 | 0.890145 | -0.025385 | -0.901995 | -0.204184 | -0.128269 | -0.218875 | -0.048816 | 0.617265 | 0.551384 | 0.06022 | 0.016136 | 0.047100 | 7.6 |
60033 | 49125.0 | 1.170686 | 0.083759 | 0.466278 | 0.913911 | -0.093123 | 0.427588 | -0.372727 | 0.312777 | 0.129610 | 0.188107 | 0.707980 | 0.025212 | -1.341491 | 0.695831 | 0.890826 | 0.588590 | -0.703943 | 0.247763 | -0.221215 | -0.226078 | -0.176121 | -0.584726 | 0.066051 | -0.746667 | 0.232641 | -0.54774 | 0.038060 | 0.010995 | 3.9 |
model = CatBoostClassifier(verbose=100,random_state=SEED)
model.fit(Xtr, ytr)
# float feature
feature_name = 'Amount'
dict_stats = model.calc_feature_statistics(df_Xtrain, ser_ytrain, feature_name, plot=True)
Learning rate set to 0.095119
0:    learn: 0.3780861    total: 55.2ms   remaining: 55.1s
100:  learn: 0.0016198    total: 5.37s    remaining: 47.8s
200:  learn: 0.0010673    total: 10.5s    remaining: 41.7s
300:  learn: 0.0008012    total: 15.7s    remaining: 36.4s
400:  learn: 0.0005683    total: 20.8s    remaining: 31s
500:  learn: 0.0003989    total: 25.8s    remaining: 25.7s
600:  learn: 0.0002876    total: 35.2s    remaining: 23.4s
700:  learn: 0.0002030    total: 40.5s    remaining: 17.3s
800:  learn: 0.0001605    total: 47.2s    remaining: 11.7s
900:  learn: 0.0001330    total: 59.5s    remaining: 6.54s
999:  learn: 0.0001102    total: 1m 6s    remaining: 0us
---------------------------------------------------------------------------
CatBoostError                             Traceback (most recent call last)
<ipython-input-26-58d6b98a903e> in <module>
      4 # float feature
      5 feature_name = 'Amount'
----> 6 dict_stats = model.calc_feature_statistics(df_Xtrain, ser_ytrain, feature_name, plot=True)

~/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/catboost/core.py in calc_feature_statistics(self, data, target, feature, prediction_type, cat_feature_values, plot, max_cat_features_on_plot, thread_count, plot_file)
   3004         if not isinstance(feature, int):
   3005             if self.feature_names_ is None or feature not in self.feature_names_:
-> 3006                 raise CatBoostError('No feature named "{}" in model'.format(feature))
   3007             feature_num = self.feature_names_.index(feature)
   3008         else:

CatBoostError: No feature named "Amount" in model
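The error occurs because the model above was fit on the NumPy array Xtr, so it carries no feature names and 'Amount' cannot be resolved. A minimal sketch of the likely fix (an assumption based on the traceback, not rerun here): refit on the DataFrame so column names are preserved.
model = CatBoostClassifier(verbose=False, random_state=SEED)
model.fit(df_Xtrain, ser_ytrain)  # DataFrame input preserves column names
dict_stats = model.calc_feature_statistics(df_Xtrain, ser_ytrain,
                                           'Amount', plot=True)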
# feature importance
df_imp = pd.DataFrame({'Feature': features,
'Importance': model.feature_importances_
})
df_imp.sort_values('Importance',ascending=False).style.background_gradient()
def plot_feature_imp_catboost(model_catboost, features):
    """Plot the feature importances as a horizontal bar plot."""
    df_imp = pd.DataFrame({'Feature': features,
                           'Importance': model_catboost.feature_importances_
                           })
    df_imp = df_imp.sort_values('Importance').set_index('Feature')
    ax = df_imp.plot.barh(figsize=(12, 8))
    plt.grid(True)
    plt.title('Feature Importance', fontsize=14)
    ax.get_legend().remove()
    # annotate each bar with its importance value
    for p in ax.patches:
        x = p.get_width()
        y = p.get_y()
        text = '{:.2f}'.format(p.get_width())
        ax.text(x, y, text, fontsize=15, color='indigo')
    plt.show()
plot_feature_imp_catboost(model, features)
df_fimp = model.get_feature_importance(prettified=True)
df_fimp.head()
plt.figure(figsize=(12,8))
ax = sns.barplot(x=df_fimp.columns[1], y=df_fimp.columns[0], data=df_fimp)
# annotate bars with importance values
for p in ax.patches:
    x = p.get_width()
    y = p.get_y()
    text = '{:.2f}'.format(p.get_width())
    ax.text(x, y, text, fontsize=15, color='indigo', va='top', ha='left')
from catboost import CatBoost, Pool
# help(CatBoost)
cat_features = [] # empty for now: all features here are numeric
dtrain = Pool(df_Xtrain, ser_ytrain, cat_features=cat_features)
dvalid = Pool(df_Xvalid, ser_yvalid, cat_features=cat_features)
dtest = Pool(df_Xtest, ser_ytest, cat_features=cat_features)
params_cat = {'iterations': 100,
              'random_seed': 0,
              'eval_metric': 'AUC',
              'loss_function': 'Logloss',
              'cat_features': [],
              'ignored_features': [],
              'early_stopping_rounds': 200,
              'verbose': 200,
              }
bst_cat = CatBoost(params=params_cat)
bst_cat.fit(dtrain,
eval_set=(df_Xvalid, ser_yvalid),
use_best_model=True,
plot=True);
print(bst_cat.eval_metrics(dtest, ['AUC'])['AUC'][-1])
cv(pool=None, params=None, dtrain=None, iterations=None,
num_boost_round=None, fold_count=None, nfold=None, inverted=False,
partition_random_seed=0, seed=None, shuffle=True, logging_level=None,
stratified=None, as_pandas=True, metric_period=None, verbose=None,
verbose_eval=None, plot=False, early_stopping_rounds=None,
save_snapshot=None, snapshot_file=None,
snapshot_interval=None, folds=None, type='Classical')
params = {'iterations': 100, 'verbose': False,
'random_seed': 0,
'loss_function':'Logloss',
'eval_metric':'AUC',
}
df_scores = catboost.cv(dtrain,
                        params,
                        fold_count=2,
                        verbose=100,
                        shuffle=True,
                        stratified=True,
                        plot=True) # the interactive plot does not work in google colab
print(df_scores.columns)
df_scores.head()
fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(x='iterations', y='train-Logloss-mean', data=df_scores, ax=ax, color='r')
sns.lineplot(x='iterations', y='test-Logloss-mean', data=df_scores, ax=ax,
             color='b', alpha=0.2, linewidth=5, linestyle='--')
plt.show()
We should generally optimize model complexity first and then tune convergence.
Model complexity is governed by parameters such as depth (max_depth); convergence is governed by the learning rate and the number of iterations. A minimal two-stage sketch is given below.
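The following sketch illustrates the idea; the grids are illustrative assumptions, not settings used elsewhere in this notebook:
# stage 1: fix a moderate learning rate, search complexity (depth)
for depth in [4, 6, 8, 10]:
    m = CatBoostClassifier(depth=depth, learning_rate=0.1, iterations=300,
                           verbose=False, random_state=SEED)
    m.fit(df_Xtrain, ser_ytrain, eval_set=(df_Xvalid, ser_yvalid))
    print(depth, m.get_best_score())
# stage 2: with the best depth fixed, tune convergence
# (learning_rate and iterations, e.g. with early stopping as below)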
model = joblib.load('../models/model_cat_default_seed100.joblib')
ypreds = model.predict(df_Xtest)
cm = confusion_matrix(ytest, ypreds)
print(cm)
%%time
params = dict(verbose=500,
random_state=0,
iterations=3_000,
eval_metric='AUC',
cat_features = [],
early_stopping_rounds=200,
)
model = catboost.CatBoostClassifier(**params)
model.fit(df_Xtrain, ytrain,
eval_set=(df_Xvalid, yvalid),
use_best_model=True,
plot=False
);
# now use the best iteration
best_iter = model.get_best_iteration()
model = CatBoostClassifier(verbose=False,random_state=0,iterations=best_iter)
model.fit(df_Xtrain, ser_ytrain)
joblib.dump(model, '../models/model_cat_earlystopping.joblib')
ypreds = model.predict(df_Xtest)
cm = confusion_matrix(ytest, ypreds)
print(cm)
desc = f'early stopping, iterations={best_iter}'
df_eval = model_evaluation('catboost', desc, ytx,ypreds,df_eval=df_eval)
# using best iterations is worse, use previous 1000.
# for n in [6]: # default depth = 6
# model = CatBoostClassifier(verbose=False,random_state=0,
# iterations=1_000,
# depth=n,
# )
# model.fit(Xtr, ytr)
# ypreds = model.predict(Xtx)
# cm = confusion_matrix(ytest, ypreds)
# error = cm[0,1] + cm[1,0]
# print(f'Confusion matrix error count = {error} for n = {n}')
# for n in [0]:
# model = CatBoostClassifier(verbose=False,random_state=n,
# depth=6,
# iterations=1_000,
# )
# model.fit(Xtr, ytr)
# ypreds = model.predict(Xtx)
# cm = confusion_matrix(ytest, ypreds)
# error = cm[0,1] + cm[1,0]
# print(f'Confusion matrix error count = {error} for n = {n}')
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING) # use INFO to see progress
def objective(trial):
    params_cat_optuna = {
        'objective': trial.suggest_categorical('objective', ['Logloss', 'CrossEntropy']),
        'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 1, 12),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type',
                                                    ['Bayesian', 'Bernoulli', 'MVS']),
        'used_ram_limit': '3gb'
    }
    # conditional parameters depend on the sampled bootstrap type
    if params_cat_optuna['bootstrap_type'] == 'Bayesian':
        params_cat_optuna['bagging_temperature'] = trial.suggest_uniform('bagging_temperature', 0, 10)
    elif params_cat_optuna['bootstrap_type'] == 'Bernoulli':
        params_cat_optuna['subsample'] = trial.suggest_uniform('subsample', 0.1, 1)
    # fit the model
    model = CatBoostClassifier(random_state=SEED, **params_cat_optuna)
    model.fit(df_Xtrain, ser_ytrain,
              eval_set=[(df_Xvalid, ser_yvalid)],
              verbose=0,
              early_stopping_rounds=100)
    ypreds = model.predict(df_Xvalid)
    ypreds = np.rint(ypreds)
    score = roc_auc_score(ser_yvalid.to_numpy().ravel(), ypreds)
    return score
# NOTE: there is inherent non-determinism in optuna hyperparameter selection
# we may not get the same hyperparameters when run twice.
sampler = optuna.samplers.TPESampler(seed=SEED)
N_TRIALS = 1 # make it large
study = optuna.create_study(direction='maximize',
sampler=sampler,
study_name='cat_optuna',
storage='sqlite:///cat_optuna_fraud_detection.db',
load_if_exists=True)
study.optimize(objective, n_trials=N_TRIALS,timeout=600)
# Resume from last time
sampler = optuna.samplers.TPESampler(seed=SEED)
N_TRIALS = 1 # make it large
study = optuna.create_study(direction='maximize',
sampler=sampler,
study_name='cat_optuna',
storage='sqlite:///cat_optuna_fraud_detection.db',
load_if_exists=True)
# study.optimize(objective, n_trials=N_TRIALS)
print(f'Number of finished trials: {len(study.trials)}')
# best trial
best_trial = study.best_trial
# best params
params_best = study.best_trial.params
params_best
%%time
model_name = 'catboost'
desc = 'optuna hyperparameter search'
Xtr = df_Xtrain_orig
ytr = ser_ytrain_orig.to_numpy().ravel()
Xtx = df_Xtest
ytx = ser_ytest.to_numpy().ravel()
Xvd = df_Xvalid
yvd = ser_yvalid.to_numpy().ravel()
# use best model
params_best = study.best_trial.params
clf = CatBoostClassifier(random_state=SEED,verbose=False)
clf.set_params(**params_best)
# fit and save the model
clf.fit(Xtr, ytr)
joblib.dump(clf,'../models/clf_cat_grid_search_optuna.pkl')
# load the saved model
clf = joblib.load('../models/clf_cat_grid_search_optuna.pkl')
# predictions
ypreds = clf.predict(Xtx)
# model evaluation
cm = confusion_matrix(ytx, ypreds)
print(cm)
desc = 'optuna hyperparameter search'
df_eval = model_evaluation('catboost', desc, ytx,ypreds,df_eval=df_eval)
%%time
model = CatBoostClassifier(verbose=False,random_state=100,
depth=6,
iterations=1_000,
)
model.fit(Xtr, ytr)
joblib.dump(model, '../models/model_cat_best.joblib')
ypreds = model.predict(Xtx)
cm = confusion_matrix(ytest, ypreds)
print(cm)
df_eval = model_evaluation('catboost', 'seed=100,depth=6,iter=1k', ytest, ypreds,df_eval=df_eval)
pd.concat([df_Xtrain.head(2), df_Xtest.head(2)])
import eli5
eli5.show_weights(model)
from eli5.sklearn import PermutationImportance
feature_names = df_Xtrain.columns.tolist()
perm = PermutationImportance(model).fit(df_Xtest, ytx)
eli5.show_weights(perm, feature_names=feature_names)
import lime
import lime.lime_tabular
idx = 0
example = df_Xtest.iloc[idx]
answer = ser_ytest.iloc[idx]
feature_names = df_Xtest.columns.tolist()
prediction = model.predict(example.to_numpy().reshape(1, -1))
print(f'answer = {answer}')
print('prediction = ', prediction[0])
print()
print(example)
print(feature_names)
import lime
import lime.lime_tabular
categorical_features = []
categorical_features_idx = [df_Xtrain.columns.get_loc(col) for col in categorical_features]
explainer = lime.lime_tabular.LimeTabularExplainer(df_Xtrain.to_numpy(),
feature_names=feature_names,
class_names=['Not-fraud','Fraud'],
categorical_features=categorical_features_idx,
mode='classification')
exp = explainer.explain_instance(example.to_numpy(), model.predict_proba, num_features=8)
exp.show_in_notebook(show_table=True)
exp.as_pyplot_figure(); # semicolon suppresses the duplicate text output
import shap
shap.initjs()
# model = CatBoostClassifier(verbose=100,random_state=100)
# model.fit(df_Xtrain, ytrain)
model = joblib.load('../models/model_cat_best.joblib')
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(df_Xtest)
df_Xtest.head(1)
df_Xtest.head(1)['V15 V18 V3 V24 V1 V8 V4 V14 V2 V6 V9 V20'.split()].round(4)
# Look only first row of test data
# use matplotlib=True to avoid Javascript
idx = 0
shap.force_plot(explainer.expected_value,
shap_values[idx,:],
df_Xtest.iloc[idx,:],
matplotlib=False,
text_rotation=90)
# for this row, the model's raw output (log-odds) is -9.33
# red features push the prediction higher
# blue features push it lower
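The force plot works in the model's raw log-odds space, not probabilities. To convert a raw value such as -9.33 into a probability, apply the sigmoid; a minimal sketch:
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))
print(sigmoid(-9.33))  # ~8.9e-05: essentially zero fraud probability for this row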
NUM = 100
shap.force_plot(explainer.expected_value, shap_values[:NUM,:],
df_Xtest.iloc[:NUM,:],matplotlib=False)
shap.summary_plot(shap_values, df_Xtest)
shap.summary_plot(shap_values, df_Xtest, plot_type='bar')
shap.dependence_plot("Amount", shap_values, df_Xtest)
shap.dependence_plot(ind='Time', interaction_index='Amount',
shap_values=shap_values,
features=df_Xtest,
display_features=df_Xtest)
notebook_end_time = time.time()
time_taken = notebook_end_time - notebook_start_time
h, m = divmod(time_taken, 60*60)
print('Time taken to run whole notebook: {:.0f} hr {:.0f} min {:.0f} secs'.format(h, *divmod(m, 60)))