The dataset contains transactions made by credit cards in September 2013 by European cardholders.
It presents transactions that occurred over two days, with 492 frauds out of 284,807 transactions.
The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions.
It contains only numerical input variables, which are the result of a PCA transformation.
Unfortunately, due to confidentiality issues, we cannot provide the original features or more background information about the data.
Features V1, V2, ..., V28 are the principal components obtained with PCA; the only features which have not been transformed with PCA are 'Time' and 'Amount'.
Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. Feature 'Amount' is the transaction amount; this feature can be used for example-dependent cost-sensitive learning.
Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.
The term Boosting refers to a family of algorithms which convert weak learners into strong learners.
There are many boosting algorithms (a minimal fitting sketch follows this list):
sklearn.ensemble.GradientBoostingRegressor
xgboost.XGBRegressor # fast and typically very accurate
lightgbm.LGBMRegressor # extremely fast, slightly lower accuracy than xgboost
catboost.CatBoostRegressor # good for categorical features
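As a quick, hedged illustration of the shared scikit-learn-style interface (this sketch uses synthetic data and the classifier variant, as this notebook does later; it is not part of the original run):
# minimal sketch: all of the boosters above expose the same fit/predict API
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

X, y = make_classification(n_samples=1000, weights=[0.95], random_state=0)
Xtr, Xte, ytr, yte = train_test_split(X, y, stratify=y, random_state=0)
clf = XGBClassifier(n_estimators=100, random_state=0)
clf.fit(Xtr, ytr)
print(clf.predict_proba(Xte)[:5, 1])   # probability of the positive class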
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    #!pip install hpsklearn
    !pip install shap eli5 lime scikit-plot watermark
    !pip install optuna hyperopt
    !pip install catboost
    !pip install ipywidgets
    !pip install -U scikit-learn
    !jupyter nbextension enable --py widgetsnbextension
    # create project-like folders
    !mkdir -p ../outputs ../images ../reports ../html ../models
    print('Environment: Google Colab')
import time
notebook_start_time = time.time()
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
# random state
SEED = 0
RNG = np.random.RandomState(SEED)
# visualization
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 8,8
plt.rcParams.update({'font.size': 16})
plt.style.use('ggplot')
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)
# six and pickle
import six
import pickle
import joblib
# mixed
import copy
import pprint
pp = pprint.PrettyPrinter(indent=4)
# sklearn
import sklearn
# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# scale and split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
# sklearn scalar metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
# roc auc and curves
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
# confusion matrix and classification report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# boosting
import xgboost, lightgbm, catboost
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBClassifier, DMatrix
from lightgbm import LGBMClassifier, Dataset
from catboost import CatBoostClassifier, Pool, CatBoost
# parameters tuning
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK, STATUS_FAIL
from hyperopt.pyll import scope
from hyperopt.pyll.stochastic import sample
# model interpretation modules
import eli5
import shap
import yellowbrick
import lime
import scikitplot
# version
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2021-08-09
CPython 3.7.7
IPython 7.22.0
compiler    : Clang 4.0.1 (tags/RELEASE_401/final)
system      : Darwin
release     : 19.6.0
machine     : x86_64
processor   : i386
CPU cores   : 4
interpreter : 64bit
six 1.15.0
sklearn 0.23.1
shap 0.39.0
yellowbrick 1.1
joblib 1.0.1
eli5 0.10.1
lightgbm 2.3.1
catboost 0.23.2
scikitplot 0.3.7
xgboost 1.2.0
numpy 1.19.5
seaborn 0.11.0
autopep8 1.5.2
json 2.0.9
pandas 1.3.0
# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan")
from bhishan import bp
def get_profit(y_true, y_pred):
    tn, fp, fn, tp = sklearn.metrics.confusion_matrix(y_true, y_pred).ravel()
    profit = 400*tp - 200*fn - 100*fp
    return profit
# scoring = sklearn.metrics.make_scorer(get_profit, greater_is_better=True)
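As a hedged sketch of how that commented-out scorer could be used (not part of the original run; the model choice and fold count are illustrative, and it assumes the df_Xtrain/ser_ytrain split created further below):
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

profit_scorer = make_scorer(get_profit, greater_is_better=True)
# any classifier with predict() works; LogisticRegression is just an example
clf = LogisticRegression(max_iter=1000)
cv_profits = cross_val_score(clf, df_Xtrain, ser_ytrain, scoring=profit_scorer, cv=3)
print(cv_profits)   # per-fold profit in dollars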
#=========== for catboost
class ProfitMetric:
    @staticmethod
    def get_profit(y_true, y_pred):
        from scipy.special import expit
        # raw scores -> probabilities -> hard labels at the 0.5 threshold
        y_pred = (expit(y_pred) > 0.5).astype(int)
        y_true = y_true.astype(int)
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        profit = 400*tp - 200*fn - 100*fp
        return profit

    def is_max_optimal(self):
        return True  # greater is better

    def evaluate(self, approxes, target, weight):
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])
        y_true = np.array(target).astype(int)
        approx = approxes[0]
        score = self.get_profit(y_true, approx)
        return score, 1

    def get_final_error(self, error, weight):
        return error

# model = CatBoostClassifier(metric_period=50,
#                            n_estimators=200,
#                            eval_metric=ProfitMetric()
#                            )
# model.fit(X, y, eval_set=(X_test, y_test))
def model_evaluation(model_name, desc, ser_ytest, yprobs1d,
                     df_eval=None, threshold=0.5,
                     show=True, col_sort='Recall'):
    if df_eval is None:
        df_eval = pd.DataFrame({'Model': [],
                                'Description': [],
                                'Accuracy': [],
                                'Precision': [],
                                'Recall': [],
                                'F1': [],
                                'AUC': [],
                                'AUCPR': [],
                                })
    y_true = np.array(ser_ytest).flatten()
    prec, rec, thr = sklearn.metrics.precision_recall_curve(y_true, yprobs1d)
    auc_pr = sklearn.metrics.auc(rec, prec)
    y_pred = (yprobs1d > threshold).astype(np.int8)
    # model evaluation
    average = 'binary'
    row_eval = [model_name, desc,
                sklearn.metrics.accuracy_score(y_true, y_pred),
                sklearn.metrics.precision_score(y_true, y_pred, average=average),
                sklearn.metrics.recall_score(y_true, y_pred, average=average),
                sklearn.metrics.f1_score(y_true, y_pred, average=average),
                sklearn.metrics.roc_auc_score(y_true, yprobs1d),  # AUC needs probabilities
                auc_pr
                ]
    df_eval.loc[len(df_eval)] = row_eval
    df_eval = df_eval.drop_duplicates()
    df_eval = df_eval.sort_values(col_sort, ascending=False)
    if show:
        display(df_eval.style.background_gradient(subset=[col_sort]))
    return df_eval
df_eval = None
ifile = 'https://github.com/bhishanpdl/Datasets/blob/master/Projects/Fraud_detection/raw/creditcard.csv.zip?raw=true'
df = pd.read_csv(ifile,compression='zip')
print(df.shape)
df.head()
(284807, 31)
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | V13 | V14 | V15 | V16 | V17 | V18 | V19 | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | 0.090794 | -0.551600 | -0.617801 | -0.991390 | -0.311169 | 1.468177 | -0.470401 | 0.207971 | 0.025791 | 0.403993 | 0.251412 | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | -0.166974 | 1.612727 | 1.065235 | 0.489095 | -0.143772 | 0.635558 | 0.463917 | -0.114805 | -0.183361 | -0.145783 | -0.069083 | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | 0.207643 | 0.624501 | 0.066084 | 0.717293 | -0.165946 | 2.345865 | -2.890083 | 1.109969 | -0.121359 | -2.261857 | 0.524980 | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | -0.054952 | -0.226487 | 0.178228 | 0.507757 | -0.287924 | -0.631418 | -1.059647 | -0.684093 | 1.965775 | -1.232622 | -0.208038 | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | 0.753074 | -0.822843 | 0.538196 | 1.345852 | -1.119670 | 0.175121 | -0.451449 | -0.237033 | -0.038195 | 0.803487 | 0.408542 | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
target = 'Class'
features = df.columns.drop(target)
df[target].value_counts(normalize=True)*100
0 99.827251 1 0.172749 Name: Class, dtype: float64
sns.countplot(x=df[target])
[Figure: count plot of the target 'Class']
from sklearn.model_selection import train_test_split
df_Xtrain_orig, df_Xtest, ser_ytrain_orig, ser_ytest = train_test_split(
df.drop(target,axis=1),
df[target],
test_size=0.2,
random_state=SEED,
stratify=df[target])
ytrain_orig = ser_ytrain_orig.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()
print(df_Xtrain_orig.shape)
df_Xtrain_orig.head()
(227845, 30)
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | V13 | V14 | V15 | V16 | V17 | V18 | V19 | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
36001 | 38355.0 | 1.043949 | 0.318555 | 1.045810 | 2.805989 | -0.561113 | -0.367956 | 0.032736 | -0.042333 | -0.322674 | 0.499167 | -0.572665 | 0.346009 | -0.047407 | -0.098964 | -0.663284 | 0.181411 | -0.124345 | -0.790453 | -0.720944 | -0.084556 | -0.240105 | -0.680315 | 0.085328 | 0.684812 | 0.318620 | -0.204963 | 0.001662 | 0.037894 | 49.67 |
12844 | 22555.0 | -1.665159 | 0.808440 | 1.805627 | 1.903416 | -0.821627 | 0.934790 | -0.824802 | 0.975890 | 1.747469 | -0.658751 | 1.281502 | -1.430087 | 0.372028 | 1.403024 | -2.739413 | -1.331766 | 1.964590 | -0.205639 | 1.325588 | -0.373759 | -0.335332 | -0.510994 | 0.035839 | 0.147565 | -0.529358 | -0.566950 | -0.595998 | -0.220086 | 16.94 |
2873 | 2431.0 | -0.324096 | 0.601836 | 0.865329 | -2.138000 | 0.294663 | -1.251553 | 1.072114 | -0.334896 | 1.071268 | -1.109522 | -1.016020 | -0.654945 | -1.473470 | 0.317345 | 1.067491 | -0.372642 | -0.674725 | 0.369841 | 0.095583 | -0.039868 | 0.012220 | 0.352856 | -0.341505 | -0.145791 | 0.094194 | -0.804026 | 0.229428 | -0.021623 | 1.00 |
145263 | 86773.0 | -0.258270 | 1.217501 | -0.585348 | -0.875347 | 1.222481 | -0.311027 | 1.073860 | -0.161408 | 0.200665 | 0.154307 | 0.882673 | 0.547890 | 0.269484 | -1.253302 | -0.883963 | 0.495221 | -0.153212 | 0.296710 | 0.136148 | 0.382305 | -0.424626 | -0.781158 | 0.019316 | 0.178614 | -0.315616 | 0.096665 | 0.269740 | -0.020635 | 10.78 |
186658 | 127202.0 | 2.142162 | -0.494988 | -1.936511 | -0.818288 | -0.025213 | -1.027245 | -0.151627 | -0.305750 | -0.869482 | 0.428729 | 1.136666 | 0.273476 | 0.697123 | -1.222134 | -0.938820 | 1.298149 | 0.912921 | -0.793721 | 1.064984 | 0.106592 | 0.010115 | 0.021722 | 0.079463 | -0.480899 | 0.023846 | -0.279076 | -0.030121 | -0.043888 | 39.96 |
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
df_Xtrain_orig,
ser_ytrain_orig,
test_size=0.2,
random_state=SEED,
stratify=ser_ytrain_orig)
ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()
print(df_Xtrain.shape)
(182276, 30)
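A small sanity check (a hedged addition, not in the original output): stratified splitting should preserve the ~0.17% fraud rate in every split.
# verify that stratification kept the class ratio in train/valid/test
for name, ser in [('train', ser_ytrain), ('valid', ser_yvalid), ('test', ser_ytest)]:
    print(f'{name:>5}: {ser.mean()*100:.3f}% fraud')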
https://catboost.ai/docs/concepts/python-reference_catboostregressor.html
class CatBoostRegressor(
iterations=None, learning_rate=None,
depth=None, l2_leaf_reg=None,
model_size_reg=None, rsm=None,
loss_function='RMSE', border_count=None,
feature_border_type=None, per_float_feature_quantization=None,
input_borders=None, output_borders=None,
fold_permutation_block=None, od_pval=None,
od_wait=None, od_type=None,
nan_mode=None, counter_calc_method=None,
leaf_estimation_iterations=None, leaf_estimation_method=None,
thread_count=None, random_seed=None,
use_best_model=None, best_model_min_trees=None,
verbose=None, silent=None,
logging_level=None, metric_period=None,
ctr_leaf_count_limit=None, store_all_simple_ctr=None,
max_ctr_complexity=None, has_time=None,
allow_const_label=None, one_hot_max_size=None,
random_strength=None,name=None, ignored_features=None,
train_dir=None, custom_metric=None,
eval_metric=None, bagging_temperature=None,
save_snapshot=None, snapshot_file=None,
snapshot_interval=None, fold_len_multiplier=None,
used_ram_limit=None, gpu_ram_part=None,
pinned_memory_size=None, allow_writing_files=None,
final_ctr_computation_mode=None, approx_on_full_history=None,
boosting_type=None, simple_ctr=None,
combinations_ctr=None, per_feature_ctr=None,
ctr_target_border_count=None, task_type=None,
device_config=None, devices=None,
bootstrap_type=None, subsample=None,
sampling_unit=None, dev_score_calc_obj_block_size=None,
max_depth=None, n_estimators=None,
num_boost_round=None, num_trees=None,
colsample_bylevel=None, random_state=None,
reg_lambda=None, objective=None,
eta=None, max_bin=None,
gpu_cat_features_storage=None, data_partition=None,
metadata=None, early_stopping_rounds=None,
cat_features=None, grow_policy=None,
min_data_in_leaf=None, min_child_samples=None,
max_leaves=None, num_leaves=None,
score_function=None, leaf_estimation_backtracking=None,
ctr_history_unit=None, monotone_constraints=None
)
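Only a few of these arguments are usually needed in practice. Below is a hedged sketch of a typical CatBoostClassifier setup with early stopping on the validation split (values are illustrative and untuned; the model actually trained next uses the custom ProfitMetric instead):
# illustrative, untuned settings using the train/valid split created above
sketch_model = CatBoostClassifier(
    iterations=500,             # alias: n_estimators
    learning_rate=0.1,
    depth=6,
    eval_metric='AUC',
    early_stopping_rounds=50,   # stop when the eval metric stops improving
    random_seed=SEED,
    verbose=100,
)
# sketch_model.fit(df_Xtrain, ser_ytrain, eval_set=(df_Xvalid, ser_yvalid))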
%%time
model = CatBoostClassifier(verbose=100,random_state=SEED,
eval_metric=ProfitMetric())
model.fit(df_Xtrain,ser_ytrain)
ypreds = model.predict(df_Xtest)
cm = sklearn.metrics.confusion_matrix(np.array(ser_ytest),ypreds)
print('confusion matrix\n',cm)
confusion matrix
 [[56859     5]
  [   24    74]]
CPU times: user 152 ms, sys: 3.76 ms, total: 156 ms
Wall time: 168 ms
profit = get_profit(ser_ytest, ypreds)
print(f"profit = ${profit:,}")
profit = $24,300
yprobs = model.predict_proba(df_Xtest)
print(yprobs[:5])
[[9.99998192e-01 1.80796544e-06]
 [9.99932577e-01 6.74226697e-05]
 [9.99998670e-01 1.32969613e-06]
 [9.99996711e-01 3.28938804e-06]
 [9.99994773e-01 5.22686104e-06]]
yprobs1d = yprobs[:,1]
df_eval = model_evaluation('catboost', 'custom loss', ser_ytest, yprobs1d)
Model | Description | Accuracy | Precision | Recall | F1 | AUC | AUCPR | |
---|---|---|---|---|---|---|---|---|
0 | catboost | custom loss | 0.999491 | 0.936709 | 0.755102 | 0.836158 | 0.987990 | 0.848748 |
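Since a missed fraud costs $200 while a false alarm costs only $100 under the profit function defined above, the default 0.5 threshold is not necessarily profit-optimal. A hedged sketch of a simple threshold sweep (an addition, not part of the original run):
# sweep decision thresholds and report the most profitable one
thresholds = np.linspace(0.01, 0.99, 99)
profits = [get_profit(ytest, (yprobs1d > t).astype(int)) for t in thresholds]
best_t = thresholds[int(np.argmax(profits))]
print(f'best threshold = {best_t:.2f}, profit = ${max(profits):,}')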
from scikitplot import metrics as skpmetrics
skpmetrics.plot_confusion_matrix(ser_ytest, ypreds)
[Figure: confusion matrix plot for the test set]
fig, ax = plt.subplots(figsize=(12,8))
skpmetrics.plot_roc(ser_ytest,yprobs,ax=ax)
[Figure: ROC curves for the test set]
notebook_end_time = time.time()
time_taken = time.time() - notebook_start_time
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr {:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 9 min 21 secs