import time

time_start_notebook = time.time()


%%capture
# NOTE: %%capture has to stay the first line of the cell. It swallows the
# cell's stdout/stderr, so the pip logs (and the print below) are not shown.
import sys
# True when the notebook runs inside Google Colab (its module is preloaded).
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    # Helpers used later in the notebook: %watermark and scikitplot.
    !pip install watermark
    !pip install scikit-plot

    # Libraries imported in the next cell (featuretools, composeml, evalml).
    !pip install featuretools[complete]
    !pip install composeml
    !pip install evalml

    print('Environment: Google Colab')


import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
import joblib
from tqdm import tqdm_notebook as tqdm
import plotly_express as px
from sklearn.preprocessing import OneHotEncoder
import sklearn.metrics as skmetrics

# special
import imblearn

# featuretools
import composeml as cp
import featuretools as ft
import evalml

# warnings
import warnings

# random state
# SEED is passed as random_state to train_test_split so the split is repeatable.
SEED = 0
# Seeded NumPy generator; not referenced in the visible part of this notebook.
RNG = np.random.RandomState(SEED)

# settings
sns.set()
# Use fully-qualified option names. Short patterns such as 'max_columns' are
# resolved by regex and raise OptionError as soon as more than one registered
# option matches (which happens on newer pandas releases).
pd.set_option('display.max_columns',100)
pd.set_option('display.max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly

%matplotlib inline
%load_ext watermark
%watermark -iv

2020-12-20 15:48:26,799 featuretools - WARNING    Featuretools failed to load plugin nlp_primitives from library nlp_primitives. For a full stack trace, set logging to debug.
featuretools  : 0.22.0
numpy         : 1.19.4
composeml     : 0.5.1
joblib        : 1.0.0
pandas        : 1.1.5
matplotlib    : 3.3.3
autopep8      : 1.5.4
sys           : 3.8.5 (default, Sep  4 2020, 02:22:02) 
[Clang 10.0.0 ]
imblearn      : 0.7.0
evalml        : 0.16.1
plotly_express: 0.4.1
seaborn       : 0.11.0
json          : 2.0.9
sklearn       : 0.23.2


def show_methods(obj, ncols=4,contains=None):
    """Tabulate the public attribute names of ``obj``.

    Names come from ``dir(obj)``; those starting with an underscore are
    dropped, and when ``contains`` is given only names containing that
    substring are kept. The remaining names are split into ``ncols``
    chunks, one chunk per column of the returned DataFrame, with empty
    strings padding the shorter columns.
    """
    public_names = []
    for name in dir(obj):
        if name[0] == '_':
            continue
        if contains is not None and contains not in name:
            continue
        public_names.append(name)

    # Each chunk is a row here; the transpose turns chunks into columns.
    chunks = np.array_split(public_names, ncols)
    table = pd.DataFrame(chunks).T
    return table.fillna('')


# Zipped CSV inside the local project tree (path relative to the notebook).
path_data_raw = '../data/raw/creditcard.csv.zip'

if ENV_COLAB:
    # On Colab the same archive is read straight from GitHub instead.
    path_data_raw = ('https://github.com/bhishanpdl/Datasets/blob/'
                       'master/Projects/Fraud_detection/'
                       'raw/creditcard.csv.zip?raw=true')


df = pd.read_csv(path_data_raw,compression='zip')
print(df.shape)

# Preview the first and last two rows. pd.concat is used because
# DataFrame.append is deprecated and no longer exists in pandas 2.x.
pd.concat([df.head(2), df.tail(2)])

(284807, 31)


target_name = 'Class'


from sklearn.model_selection import train_test_split

# Label column: removed from the feature frame and used for stratification.
target = 'Class'

# Hold out 20% of the rows. Stratifying on the label keeps the class ratio
# comparable in both partitions; SEED makes the split repeatable.
df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = train_test_split(
    df.drop(columns=target),
    df[target],
    stratify=df[target],
    random_state=SEED,
    test_size=0.2,
)

# Flat NumPy copies of the two label Series.
ytrain, ytest = (np.array(ser).flatten() for ser in (ser_ytrain, ser_ytest))

print(f"""
           nrows       ncols             
df         {df.shape[0]:,d}     {df.shape[1]:,d}
df_Xtrain  {df_Xtrain.shape[0]:,d}    {df_Xtrain.shape[1]:,d}
ser_ytrain {len(ser_ytrain):,d}

df_Xtest   {df_Xtest.shape[0]:,d}    {df_Xtest.shape[1]:,d}
ser_ytest  {len(ser_ytest):,d}
""")
df_Xtrain.head(2)

           nrows       ncols             
df         284,807     31
df_Xtrain  227,845    30
ser_ytrain 227,845

df_Xtest   56,962    30
ser_ytest  56,962


px.histogram(x=ser_ytrain, width=300,height=200)


import evalml
from evalml.objectives import CostBenefitMatrix
from evalml import AutoMLSearch


show_methods(evalml)


show_methods(evalml.objectives)


from evalml.objectives import FraudCost

# Cost-based objective reused below: as the AutoML search objective, when
# scoring the fitted pipeline, and in the threshold / permutation plots.
# amount_col points at the 'Amount' column of the feature frame.
# NOTE(review): the three rates are business assumptions -- confirm their exact
# meaning against the EvalML FraudCost documentation.
obj_fraud = FraudCost(retry_percentage=.5,
                    interchange_fee=.02,
                    fraud_payout_percentage=.75,
                    amount_col='Amount')


from evalml import AutoMLSearch


%%time

# Binary-classification search ranked by the FraudCost objective (the search
# log reports "Lower score is better"); AUC, F1 and precision are passed only
# as additional objectives.
# max_batches=1 restricts the run to one batch (9 pipelines in the logged run).
automl = AutoMLSearch(problem_type='binary',
                      objective=obj_fraud,
                      additional_objectives=['auc', 'f1', 'precision'],
                      max_batches=1,
                      optimize_thresholds=True)

# A pandas frame and a NumPy label array are passed directly; the log shows
# EvalML converting them to Woodwork structures with inferred types.
automl.search(df_Xtrain, ytrain)

`X` passed was not a DataTable. EvalML will try to convert the input as a Woodwork DataTable and types will be inferred. To control this behavior, please pass in a Woodwork DataTable instead.
`y` passed was not a DataColumn. EvalML will try to convert the input as a Woodwork DataTable and types will be inferred. To control this behavior, please pass in a Woodwork DataTable instead.
{'message': 'The following labels fall below 10% of the target: [1]', 'data_check_name': 'ClassImbalanceDataCheck', 'level': 'warning', 'code': 'CLASS_IMBALANCE_BELOW_THRESHOLD', 'details': {'target_values': [1]}}
Generating pipelines to search over...
*****************************
* Beginning pipeline search *
*****************************

Optimizing for Fraud Cost. 
Lower score is better.

Searching up to 1 batches for a total of 9 pipelines. 
Allowed model families: random_forest, xgboost, linear_model, lightgbm, extra_trees, decision_tree, catboost

Batch 1: (1/9) Mode Baseline Binary Classification P... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Fraud Cost: 0.001
Batch 1: (2/9) Decision Tree Classifier w/ Imputer      Elapsed:00:04
	Starting cross validation
	Finished cross validation - mean Fraud Cost: 0.001
Batch 1: (3/9) LightGBM Classifier w/ Imputer           Elapsed:00:08
	Starting cross validation
	Finished cross validation - mean Fraud Cost: 0.001
Batch 1: (4/9) Extra Trees Classifier w/ Imputer        Elapsed:00:14
	Starting cross validation
	Finished cross validation - mean Fraud Cost: 0.001
Batch 1: (5/9) Elastic Net Classifier w/ Imputer + S... Elapsed:00:21
	Starting cross validation
	Finished cross validation - mean Fraud Cost: 0.001
Batch 1: (6/9) CatBoost Classifier w/ Imputer           Elapsed:00:31
	Starting cross validation
	Finished cross validation - mean Fraud Cost: 0.001
Batch 1: (7/9) XGBoost Classifier w/ Imputer            Elapsed:00:48
	Starting cross validation

/Users/poudel/opt/miniconda3/envs/ft/lib/python3.8/site-packages/xgboost/sklearn.py:888: UserWarning:

The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].

[15:49:24] WARNING: /Users/runner/miniforge3/conda-bld/xgboost_1607604592557/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
	Finished cross validation - mean Fraud Cost: 0.001
Batch 1: (8/9) Random Forest Classifier w/ Imputer      Elapsed:01:03
	Starting cross validation
	Finished cross validation - mean Fraud Cost: 0.001
Batch 1: (9/9) Logistic Regression Classifier w/ Imp... Elapsed:01:11
	Starting cross validation
	Finished cross validation - mean Fraud Cost: 0.001

Search finished after 01:17            
Best pipeline: LightGBM Classifier w/ Imputer
Best pipeline Fraud Cost: 0.000781
CPU times: user 1min 38s, sys: 3.05 s, total: 1min 41s
Wall time: 1min 19s


automl.rankings


pipe_best = automl.best_pipeline
pipe_best

GeneratedPipeline(parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'LightGBM Classifier':{'boosting_type': 'gbdt', 'learning_rate': 0.1, 'n_estimators': 100, 'max_depth': 0, 'num_leaves': 31, 'min_child_samples': 20, 'n_jobs': -1, 'bagging_freq': 0, 'bagging_fraction': 0.9},})


automl.describe_pipeline(automl.rankings.iloc[1]["id"])

************************************************
* Mode Baseline Binary Classification Pipeline *
************************************************

Problem Type: binary
Model Family: Baseline

Pipeline Steps
==============
1. Baseline Classifier
	 * strategy : mode

Training
========
Training for binary problems.
Objective to optimize binary classification pipeline thresholds for: <evalml.objectives.fraud_cost.FraudCost object at 0x7f827a5c0ca0>
Total training time (including CV): 4.9 seconds

Cross Validation
----------------
            Fraud Cost   AUC    F1 Precision # Training  # Testing
0                0.001 0.500 0.000     0.000  45568.000 170884.000
mean             0.001 0.500 0.000     0.000          -          -
std                  -     -     -         -          -          -
coef of var          -     -   inf       inf          -          -


pipe_best.fit(df_Xtrain, ytrain)

GeneratedPipeline(parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'LightGBM Classifier':{'boosting_type': 'gbdt', 'learning_rate': 0.1, 'n_estimators': 100, 'max_depth': 0, 'num_leaves': 31, 'min_child_samples': 20, 'n_jobs': -1, 'bagging_freq': 0, 'bagging_fraction': 0.9},})


objectives = ["auc","f1","precision", "recall", obj_fraud]
pipe_best.score(df_Xtest, ytest, objectives=objectives)

OrderedDict([('AUC', 0.5192970266328253),
             ('F1', 0.21052631578947367),
             ('Precision', 0.1553398058252427),
             ('Recall', 0.32653061224489793),
             ('Fraud Cost', 0.0004607514822574961)])


show_methods(evalml.model_understanding)


from evalml.model_understanding import graphs as evgraphs
show_methods(evgraphs)


# Flat array of test labels (same expression as in the split cell).
ytest = np.array(ser_ytest).flatten()
# Class predictions of the fitted best pipeline, flattened to 1-D.
ypreds = np.array(pipe_best.predict(df_Xtest)).flatten()
# Predicted probabilities as a 2-D array -- presumably one column per class;
# confirm against the EvalML predict_proba documentation.
yprobs2d = pipe_best.predict_proba(df_Xtest).to_numpy()
# Second column, passed below to the ROC and precision-recall plots.
yprobs = yprobs2d[:,1]


evgraphs.graph_confusion_matrix(ytest, ypreds)


evgraphs.graph_roc_curve(ytest,yprobs)


evgraphs.graph_precision_recall_curve(ytest,yprobs)


show_methods(evgraphs,contains='graph')


df_Xtrain.head(2)


feature = 'Time'
evgraphs.graph_partial_dependence(pipe_best,df_Xtest,feature=feature)


# Only the first N test rows feed the two plots below
# (presumably to keep them quick -- TODO confirm).
N = 100
evgraphs.graph_binary_objective_vs_threshold(
    pipe_best,df_Xtest.iloc[:N],ytest[:N],objective=obj_fraud)


evgraphs.graph_permutation_importance(pipe_best,df_Xtest.iloc[:N],ytest[:N],
                                      objective=obj_fraud)


import sklearn.metrics as skmetrics
import scikitplot.metrics as skpmetrics


def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
    """Report, save and optionally plot metrics of a binary classifier.

    Prints the classification report and the confusion matrix, displays a
    one-row table (Accuracy, Precision, Recall, F1-score, AUC) and writes
    that table to ``model_<model_name>.csv``: in the current directory on
    Colab, otherwise in ``../outputs/`` (created when missing).

    Parameters
    ----------
    model_name : str
        Row label of the result table; also part of the CSV file name.
    ytest : array-like
        True binary labels.
    ypreds : array-like
        Predicted class labels.
    yprobs2d : array-like, 2-D
        Predicted class probabilities, one column per class. Column 1 is
        used as the positive-class score for the AUC.
    show_plots : bool, default True
        When true, also draw the precision-recall curve, the ROC curve
        and the confusion matrix with scikit-plot.

    Returns
    -------
    None
    """
    import os

    import numpy as np
    import sklearn.metrics as skmetrics
    import scikitplot.metrics as skpmetrics

    acc       = skmetrics.accuracy_score(ytest,ypreds)
    precision = skmetrics.precision_score(ytest,ypreds)
    recall    = skmetrics.recall_score(ytest,ypreds)
    f1        = skmetrics.f1_score(ytest,ypreds)
    # ROC AUC is a ranking metric: it needs the positive-class scores.
    # Feeding thresholded labels collapses the curve to a single point.
    auc       = skmetrics.roc_auc_score(ytest,np.asarray(yprobs2d)[:,1])

    print(skmetrics.classification_report(ytest,ypreds))
    print(skmetrics.confusion_matrix(ytest,ypreds))

    df_res = pd.DataFrame({'Accuracy':[acc],
                          'Precision': [precision],
                          'Recall': [recall],
                          'F1-score': [f1],
                          'AUC': [auc]},index=[model_name])

    display(df_res.style.format("{:.4f}"))

    # Build the path with os.path.join: plain string concatenation with '.'
    # used to yield a hidden '.model_<name>.csv' file on Colab.
    if ENV_COLAB:
        out_dir = '.'
    else:
        out_dir = '../outputs'
        os.makedirs(out_dir, exist_ok=True)
    df_res.to_csv(os.path.join(out_dir, f'model_{model_name}.csv'),index=True)

    if show_plots:
        skpmetrics.plot_precision_recall(ytest,yprobs2d) # more focus on minority
        skpmetrics.plot_roc_curve(ytest,yprobs2d) # equal focus on both groups
        skpmetrics.plot_confusion_matrix(ytest,ypreds)

model_eval_bin('evalml',ytest,ypreds,yprobs2d,show_plots=True)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.16      0.33      0.21        98

    accuracy                           1.00     56962
   macro avg       0.58      0.66      0.60     56962
weighted avg       1.00      1.00      1.00     56962

[[56690   174]
 [   66    32]]


# Wall-clock seconds elapsed since `time_start_notebook` was recorded.
time_taken = time.time() - time_start_notebook
# Round to whole seconds *before* splitting into fields; formatting the
# fractional remainder with {:.0f} could print "60 secs" (e.g. for 59.6 s).
h, m = divmod(int(round(time_taken)), 60*60)
m, s = divmod(m, 60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, m, s))

Time taken to run whole notebook: 0 hr 2 min 18 secs

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	V11	V12	V13	V14	V15	V16	V17	V18	V19	V20	V21	V22	V23	V24	V25	V26	V27	V28	Amount
0	0.0	-1.359807	-0.072781	2.536347	1.378155	-0.338321	0.462388	0.239599	0.098698	0.363787	0.090794	-0.551600	-0.617801	-0.991390	-0.311169	1.468177	-0.470401	0.207971	0.025791	0.403993	0.251412	-0.018307	0.277838	-0.110474	0.066928	0.128539	-0.189115	0.133558	-0.021053	149.62
1	0.0	1.191857	0.266151	0.166480	0.448154	0.060018	-0.082361	-0.078803	0.085102	-0.255425	-0.166974	1.612727	1.065235	0.489095	-0.143772	0.635558	0.463917	-0.114805	-0.183361	-0.145783	-0.069083	-0.225775	-0.638672	0.101288	-0.339846	0.167170	0.125895	-0.008983	0.014724	2.69
284805	172788.0	-0.240440	0.530483	0.702510	0.689799	-0.377961	0.623708	-0.686180	0.679145	0.392087	-0.399126	-1.933849	-0.962886	-1.042082	0.449624	1.962563	-0.608577	0.509928	1.113981	2.897849	0.127434	0.265245	0.800049	-0.163298	0.123205	-0.569159	0.546668	0.108821	0.104533	10.00
284806	172792.0	-0.533413	-0.189733	0.703337	-0.506271	-0.012546	-0.649617	1.577006	-0.414650	0.486180	-0.915427	-1.040458	-0.031513	-0.188093	-0.084316	0.041333	-0.302620	-0.660377	0.167430	-0.256117	0.382948	0.261057	0.643078	0.376777	0.008797	-0.473649	-0.818267	-0.002415	0.013649	217.00

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	V11	V12	V13	V14	V15	V16	V17	V18	V19	V20	V21	V22	V23	V24	V25	V26	V27	V28	Amount
211885	138616.0	-1.137612	2.345154	-1.767247	0.833982	0.973168	-0.073571	0.802433	0.733137	-1.154087	-0.520340	0.494117	0.799935	0.494576	-0.479666	-0.917177	-0.184117	1.189459	0.937244	0.960749	0.062820	0.114953	0.430613	-0.240819	0.124011	0.187187	-0.402251	0.196277	0.190732	39.46
12542	21953.0	-1.028649	1.141569	2.492561	-0.242233	0.452842	-0.384273	1.256026	-0.816401	1.964560	-0.014216	0.432153	-2.140921	2.274477	0.114128	-1.652894	-0.617302	0.243791	-0.426168	-0.493177	0.350032	-0.380356	-0.037432	-0.503934	0.407129	0.604252	0.233015	-0.433132	-0.491892	7.19

	0	1	2	3
0	AUC	F1Weighted	Precision	cost_benefit_matrix
1	AUCMacro	FraudCost	PrecisionMacro	fraud_cost
2	AUCMicro	LeadScoring	PrecisionMicro	get_all_objective_names
3	AUCWeighted	LogLossBinary	PrecisionWeighted	get_core_objective_names
4	AccuracyBinary	LogLossMulticlass	R2	get_core_objectives
5	AccuracyMulticlass	MAE	Recall	get_non_core_objectives
6	BalancedAccuracyBinary	MCCBinary	RecallMacro	get_objective
7	BalancedAccuracyMulticlass	MCCMulticlass	RecallMicro	lead_scoring
8	BinaryClassificationObjective	MSE	RecallWeighted	multiclass_classification_objective
9	CostBenefitMatrix	MaxError	RegressionObjective	objective_base
10	ExpVariance	MeanSquaredLogError	RootMeanSquaredError	regression_objective
11	F1	MedianAE	RootMeanSquaredLogError	standard_metrics
12	F1Macro	MulticlassClassificationObjective	binary_classification_objective	utils
13	F1Micro	ObjectiveBase

	id	pipeline_name	score	validation_score	percent_better_than_baseline	high_variance_cv	parameters
0	2	LightGBM Classifier w/ Imputer	0.000781	0.000781	21.548971	False	{'Imputer': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'LightGBM Classifier': {'boosting_type'...
1	0	Mode Baseline Binary Classification Pipeline	0.000996	0.000996	0.000000	False	{'Baseline Classifier': {'strategy': 'mode'}}
2	1	Decision Tree Classifier w/ Imputer	0.000996	0.000996	0.000000	False	{'Imputer': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'Decision Tree Classifier': {'criterion...
3	3	Extra Trees Classifier w/ Imputer	0.000996	0.000996	0.000000	False	{'Imputer': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'Extra Trees Classifier': {'n_estimator...
4	4	Elastic Net Classifier w/ Imputer + Standard Scaler	0.000996	0.000996	0.000000	False	{'Imputer': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'Elastic Net Classifier': {'alpha': 0.5...
5	5	CatBoost Classifier w/ Imputer	0.000996	0.000996	0.000000	False	{'Imputer': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'CatBoost Classifier': {'n_estimators':...
6	6	XGBoost Classifier w/ Imputer	0.000996	0.000996	0.000000	False	{'Imputer': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'XGBoost Classifier': {'eta': 0.1, 'max...
7	7	Random Forest Classifier w/ Imputer	0.000996	0.000996	0.000000	False	{'Imputer': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'Random Forest Classifier': {'n_estimat...
8	8	Logistic Regression Classifier w/ Imputer + Standard Scaler	0.000996	0.000996	0.000000	False	{'Imputer': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'Logistic Regression Classifier': {'pen...

	0	1	2	3
0	LabelBinarizer	get_objective	import_or_raise	sk_partial_dependence
1	ModelFamily	graph_binary_objective_vs_threshold	jupyter_check	sk_permutation_importance
2	NullsInColumnWarning	graph_confusion_matrix	normalize_confusion_matrix	sklearn_auc
3	ProblemTypes	graph_partial_dependence	np	sklearn_confusion_matrix
4	binary_objective_vs_threshold	graph_permutation_importance	partial_dependence	sklearn_precision_recall_curve
5	calculate_permutation_importance	graph_precision_recall_curve	pd	sklearn_roc_curve
6	confusion_matrix	graph_prediction_vs_actual	precision_recall_curve	unique_labels
7	copy	graph_roc_curve	roc_curve	warnings
8	evalml

Modelling Fraud Detection using evalML

Load the libraries

Colab

Useful Scripts

Load the Data

Data Processing

Train Test Split

Modelling: evalML

Configure Cost of Fraud

AutoML Search

Select the Objective: FraudCost

View rankings and select pipeline

Fit the Model

Model Evaluation

Time Taken

	0	1	2	3
0	AutoMLSearch	exceptions	preprocessing	skopt
1	automl	model_family	print_info	tuners
2	data_checks	model_understanding	problem_types	utils
3	demos	objectives	sklearn	warnings
4	evalml	pipelines

	0	1	2	3
0	binary_objective_vs_threshold	explain_predictions_best_worst	graph_precision_recall_curve	partial_dependence
1	calculate_permutation_importance	graph_binary_objective_vs_threshold	graph_prediction_vs_actual	precision_recall_curve
2	confusion_matrix	graph_confusion_matrix	graph_roc_curve	prediction_explanations
3	explain_prediction	graph_partial_dependence	graphs	roc_curve
4	explain_predictions	graph_permutation_importance	normalize_confusion_matrix