References
import time
# Wall-clock timestamp taken at the top of the notebook; the last cell
# subtracts it from time.time() to report the total run time.
time_start_notebook = time.time()
%%capture
import sys
# Colab is detected by the presence of the google.colab module in sys.modules.
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
# general helper packages (these lines install, they do not import)
!pip install watermark
!pip install scikit-plot
# packages specific to this notebook: feature engineering and AutoML
!pip install featuretools[complete]
!pip install composeml
!pip install evalml
print('Environment: Google Colab')
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
import joblib
from tqdm import tqdm_notebook as tqdm
import plotly_express as px
from sklearn.preprocessing import OneHotEncoder
import sklearn.metrics as skmetrics
# special
import imblearn
# featuretools
import composeml as cp
import featuretools as ft
import evalml
# warnings
import warnings
# random state
# SEED is reused below as random_state for the train/test split.
SEED = 0
# Seeded NumPy generator built from the same SEED.
RNG = np.random.RandomState(SEED)
# settings
# apply seaborn's default plot theme
sns.set()
# Use fully-qualified option names: on newer pandas the bare 'max_columns'
# pattern also matches a styler option and raises OptionError (ambiguous key).
pd.set_option('display.max_columns',100)
pd.set_option('display.max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%matplotlib inline
%load_ext watermark
%watermark -iv
2020-12-20 15:48:26,799 featuretools - WARNING Featuretools failed to load plugin nlp_primitives from library nlp_primitives. For a full stack trace, set logging to debug. featuretools : 0.22.0 numpy : 1.19.4 composeml : 0.5.1 joblib : 1.0.0 pandas : 1.1.5 matplotlib : 3.3.3 autopep8 : 1.5.4 sys : 3.8.5 (default, Sep 4 2020, 02:22:02) [Clang 10.0.0 ] imblearn : 0.7.0 evalml : 0.16.1 plotly_express: 0.4.1 seaborn : 0.11.0 json : 2.0.9 sklearn : 0.23.2
def show_methods(obj, ncols=4,contains=None):
    """Tabulate the public attribute names of ``obj``.

    Parameters
    ----------
    obj : object
        Anything that can be passed to ``dir``.
    ncols : int, default 4
        Number of columns the names are spread over.
    contains : str, optional
        When given, only names that include this substring are kept.

    Returns
    -------
    pandas.DataFrame
        The names in ``dir`` order, filled column by column; cells left
        over in shorter columns hold the empty string.
    """
    # dir() never yields empty names, so startswith is a safe first-char test
    public_names = [name for name in dir(obj) if not name.startswith('_')]
    if contains is not None:
        public_names = [name for name in public_names if contains in name]

    # each chunk becomes one column after the transpose
    chunks = np.array_split(public_names, ncols)
    table = pd.DataFrame(chunks).T
    return table.fillna('')
# Zipped credit-card CSV, addressed relative to the notebook.
path_data_raw = '../data/raw/creditcard.csv.zip'
if ENV_COLAB:
    # When running on Colab, read the same archive from GitHub instead.
    path_data_raw = ('https://github.com/bhishanpdl/Datasets/blob/'
                     'master/Projects/Fraud_detection/'
                     'raw/creditcard.csv.zip?raw=true')

# The URL form ends in '?raw=true', not '.zip', so the compression is
# stated explicitly rather than left to be inferred from the name.
df = pd.read_csv(path_data_raw,compression='zip')
print(df.shape)

# Preview the first and last two rows. DataFrame.append was deprecated and
# then removed in pandas 2.0; pd.concat builds the same frame on all versions.
pd.concat([df.head(2), df.tail(2)])
(284807, 31)
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | V13 | V14 | V15 | V16 | V17 | V18 | V19 | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | 0.090794 | -0.551600 | -0.617801 | -0.991390 | -0.311169 | 1.468177 | -0.470401 | 0.207971 | 0.025791 | 0.403993 | 0.251412 | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | -0.166974 | 1.612727 | 1.065235 | 0.489095 | -0.143772 | 0.635558 | 0.463917 | -0.114805 | -0.183361 | -0.145783 | -0.069083 | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
284805 | 172788.0 | -0.240440 | 0.530483 | 0.702510 | 0.689799 | -0.377961 | 0.623708 | -0.686180 | 0.679145 | 0.392087 | -0.399126 | -1.933849 | -0.962886 | -1.042082 | 0.449624 | 1.962563 | -0.608577 | 0.509928 | 1.113981 | 2.897849 | 0.127434 | 0.265245 | 0.800049 | -0.163298 | 0.123205 | -0.569159 | 0.546668 | 0.108821 | 0.104533 | 10.00 | 0 |
284806 | 172792.0 | -0.533413 | -0.189733 | 0.703337 | -0.506271 | -0.012546 | -0.649617 | 1.577006 | -0.414650 | 0.486180 | -0.915427 | -1.040458 | -0.031513 | -0.188093 | -0.084316 | 0.041333 | -0.302620 | -0.660377 | 0.167430 | -0.256117 | 0.382948 | 0.261057 | 0.643078 | 0.376777 | 0.008797 | -0.473649 | -0.818267 | -0.002415 | 0.013649 | 217.00 | 0 |
from sklearn.model_selection import train_test_split

# Label column; both names are defined here, as in the original cell.
target_name = 'Class'
target = 'Class'

# Hold out 20% of the rows; stratify on the label so the split is drawn
# per class, and fix the seed for reproducibility.
df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = train_test_split(
    df.drop(columns=target),
    df[target],
    test_size=0.2,
    random_state=SEED,
    stratify=df[target],
)

# Plain 1-D NumPy copies of the label Series for APIs that expect arrays.
ytrain = ser_ytrain.to_numpy().flatten()
ytest = ser_ytest.to_numpy().flatten()

# Report row/column counts of every piece of the split.
print(f"""
nrows ncols
df {df.shape[0]:,d} {df.shape[1]:,d}
df_Xtrain {df_Xtrain.shape[0]:,d} {df_Xtrain.shape[1]:,d}
ser_ytrain {len(ser_ytrain):,d}
df_Xtest {df_Xtest.shape[0]:,d} {df_Xtest.shape[1]:,d}
ser_ytest {len(ser_ytest):,d}
""")
df_Xtrain.head(2)
nrows ncols df 284,807 31 df_Xtrain 227,845 30 ser_ytrain 227,845 df_Xtest 56,962 30 ser_ytest 56,962
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | V13 | V14 | V15 | V16 | V17 | V18 | V19 | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
211885 | 138616.0 | -1.137612 | 2.345154 | -1.767247 | 0.833982 | 0.973168 | -0.073571 | 0.802433 | 0.733137 | -1.154087 | -0.520340 | 0.494117 | 0.799935 | 0.494576 | -0.479666 | -0.917177 | -0.184117 | 1.189459 | 0.937244 | 0.960749 | 0.062820 | 0.114953 | 0.430613 | -0.240819 | 0.124011 | 0.187187 | -0.402251 | 0.196277 | 0.190732 | 39.46 |
12542 | 21953.0 | -1.028649 | 1.141569 | 2.492561 | -0.242233 | 0.452842 | -0.384273 | 1.256026 | -0.816401 | 1.964560 | -0.014216 | 0.432153 | -2.140921 | 2.274477 | 0.114128 | -1.652894 | -0.617302 | 0.243791 | -0.426168 | -0.493177 | 0.350032 | -0.380356 | -0.037432 | -0.503934 | 0.407129 | 0.604252 | 0.233015 | -0.433132 | -0.491892 | 7.19 |
# Histogram of the training label values in a small 300x200 figure.
px.histogram(x=ser_ytrain, width=300,height=200)
import evalml
from evalml.objectives import CostBenefitMatrix
from evalml import AutoMLSearch
# List the public names exposed by the top-level evalml package.
show_methods(evalml)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | AutoMLSearch | exceptions | preprocessing | skopt |
1 | automl | model_family | print_info | tuners |
2 | data_checks | model_understanding | problem_types | utils |
3 | demos | objectives | sklearn | warnings |
4 | evalml | pipelines |
To optimize the pipelines toward the specific business needs of this model, you can set your own assumptions for the cost of fraud. The parameters are:
retry_percentage
- what percentage of customers will retry a transaction if it is declined?
interchange_fee
- how much of each successful transaction do you collect?
fraud_payout_percentage
- what percentage of fraud will you be unable to collect?
amount_col
- the column in the data that represents the transaction amount
# List the objective classes and helpers available in evalml.objectives.
show_methods(evalml.objectives)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | AUC | F1Weighted | Precision | cost_benefit_matrix |
1 | AUCMacro | FraudCost | PrecisionMacro | fraud_cost |
2 | AUCMicro | LeadScoring | PrecisionMicro | get_all_objective_names |
3 | AUCWeighted | LogLossBinary | PrecisionWeighted | get_core_objective_names |
4 | AccuracyBinary | LogLossMulticlass | R2 | get_core_objectives |
5 | AccuracyMulticlass | MAE | Recall | get_non_core_objectives |
6 | BalancedAccuracyBinary | MCCBinary | RecallMacro | get_objective |
7 | BalancedAccuracyMulticlass | MCCMulticlass | RecallMicro | lead_scoring |
8 | BinaryClassificationObjective | MSE | RecallWeighted | multiclass_classification_objective |
9 | CostBenefitMatrix | MaxError | RegressionObjective | objective_base |
10 | ExpVariance | MeanSquaredLogError | RootMeanSquaredError | regression_objective |
11 | F1 | MedianAE | RootMeanSquaredLogError | standard_metrics |
12 | F1Macro | MulticlassClassificationObjective | binary_classification_objective | utils |
13 | F1Micro | ObjectiveBase |
from evalml.objectives import FraudCost
# Custom business objective used to drive the AutoML search below.
obj_fraud = FraudCost(retry_percentage=.5, # share of customers who retry a declined transaction
interchange_fee=.02, # share of each successful transaction that is collected
fraud_payout_percentage=.75, # share of fraud that cannot be collected
amount_col='Amount') # column holding the transaction amount
# Reference listing of the AutoMLSearch constructor arguments, followed by
# the matching search() call.
# NOTE(review): `automl`, `X` and `y` are not defined at this point, so this
# looks like an API reminder rather than a cell meant to be executed — confirm.
AutoMLSearch(
problem_type = None,
objective = 'auto', # r2, log loss binary/multiclass
max_iterations = None,
max_time = None, # integer is seconds, strings: minutes hours
patience = None, # default is no early stopping
tolerance = None,
data_split = None, # StratifiedKFold
allowed_pipelines = None,
allowed_model_families = None,
start_iteration_callback = None,
add_result_callback = None,
error_callback = None,
additional_objectives = None,
random_state = 0,
n_jobs = -1,
tuner_class = None,
verbose = True,
optimize_thresholds = False,
ensembling = False,
max_batches = None,
problem_configuration = None,
_pipelines_per_batch = 5,
)
automl.search(X, y,
data_checks='auto',
show_iteration_plot=True)
from evalml import AutoMLSearch
%%time
# Binary-classification search ranked by the custom fraud-cost objective;
# AUC, F1 and precision are recorded alongside it for each pipeline.
automl = AutoMLSearch(problem_type='binary',
objective=obj_fraud,
additional_objectives=['auc', 'f1', 'precision'],
max_batches=1, # a single batch of pipelines
optimize_thresholds=True)
# Plain pandas/NumPy inputs are passed here, not Woodwork structures.
automl.search(df_Xtrain, ytrain)
`X` passed was not a DataTable. EvalML will try to convert the input as a Woodwork DataTable and types will be inferred. To control this behavior, please pass in a Woodwork DataTable instead. `y` passed was not a DataColumn. EvalML will try to convert the input as a Woodwork DataTable and types will be inferred. To control this behavior, please pass in a Woodwork DataTable instead. {'message': 'The following labels fall below 10% of the target: [1]', 'data_check_name': 'ClassImbalanceDataCheck', 'level': 'warning', 'code': 'CLASS_IMBALANCE_BELOW_THRESHOLD', 'details': {'target_values': [1]}} Generating pipelines to search over... ***************************** * Beginning pipeline search * ***************************** Optimizing for Fraud Cost. Lower score is better. Searching up to 1 batches for a total of 9 pipelines. Allowed model families: random_forest, xgboost, linear_model, lightgbm, extra_trees, decision_tree, catboost
Batch 1: (1/9) Mode Baseline Binary Classification P... Elapsed:00:00 Starting cross validation Finished cross validation - mean Fraud Cost: 0.001 Batch 1: (2/9) Decision Tree Classifier w/ Imputer Elapsed:00:04 Starting cross validation Finished cross validation - mean Fraud Cost: 0.001 Batch 1: (3/9) LightGBM Classifier w/ Imputer Elapsed:00:08 Starting cross validation Finished cross validation - mean Fraud Cost: 0.001 Batch 1: (4/9) Extra Trees Classifier w/ Imputer Elapsed:00:14 Starting cross validation Finished cross validation - mean Fraud Cost: 0.001 Batch 1: (5/9) Elastic Net Classifier w/ Imputer + S... Elapsed:00:21 Starting cross validation Finished cross validation - mean Fraud Cost: 0.001 Batch 1: (6/9) CatBoost Classifier w/ Imputer Elapsed:00:31 Starting cross validation Finished cross validation - mean Fraud Cost: 0.001 Batch 1: (7/9) XGBoost Classifier w/ Imputer Elapsed:00:48 Starting cross validation
/Users/poudel/opt/miniconda3/envs/ft/lib/python3.8/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
[15:49:24] WARNING: /Users/runner/miniforge3/conda-bld/xgboost_1607604592557/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. Finished cross validation - mean Fraud Cost: 0.001 Batch 1: (8/9) Random Forest Classifier w/ Imputer Elapsed:01:03 Starting cross validation Finished cross validation - mean Fraud Cost: 0.001 Batch 1: (9/9) Logistic Regression Classifier w/ Imp... Elapsed:01:11 Starting cross validation Finished cross validation - mean Fraud Cost: 0.001 Search finished after 01:17 Best pipeline: LightGBM Classifier w/ Imputer Best pipeline Fraud Cost: 0.000781 CPU times: user 1min 38s, sys: 3.05 s, total: 1min 41s Wall time: 1min 19s
# Leaderboard of the searched pipelines with their objective scores.
automl.rankings
id | pipeline_name | score | validation_score | percent_better_than_baseline | high_variance_cv | parameters | |
---|---|---|---|---|---|---|---|
0 | 2 | LightGBM Classifier w/ Imputer | 0.000781 | 0.000781 | 21.548971 | False | {'Imputer': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'LightGBM Classifier': {'boosting_type'... |
1 | 0 | Mode Baseline Binary Classification Pipeline | 0.000996 | 0.000996 | 0.000000 | False | {'Baseline Classifier': {'strategy': 'mode'}} |
2 | 1 | Decision Tree Classifier w/ Imputer | 0.000996 | 0.000996 | 0.000000 | False | {'Imputer': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'Decision Tree Classifier': {'criterion... |
3 | 3 | Extra Trees Classifier w/ Imputer | 0.000996 | 0.000996 | 0.000000 | False | {'Imputer': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'Extra Trees Classifier': {'n_estimator... |
4 | 4 | Elastic Net Classifier w/ Imputer + Standard Scaler | 0.000996 | 0.000996 | 0.000000 | False | {'Imputer': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'Elastic Net Classifier': {'alpha': 0.5... |
5 | 5 | CatBoost Classifier w/ Imputer | 0.000996 | 0.000996 | 0.000000 | False | {'Imputer': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'CatBoost Classifier': {'n_estimators':... |
6 | 6 | XGBoost Classifier w/ Imputer | 0.000996 | 0.000996 | 0.000000 | False | {'Imputer': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'XGBoost Classifier': {'eta': 0.1, 'max... |
7 | 7 | Random Forest Classifier w/ Imputer | 0.000996 | 0.000996 | 0.000000 | False | {'Imputer': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'Random Forest Classifier': {'n_estimat... |
8 | 8 | Logistic Regression Classifier w/ Imputer + Standard Scaler | 0.000996 | 0.000996 | 0.000000 | False | {'Imputer': {'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'Logistic Regression Classifier': {'pen... |
# Keep a handle on the best pipeline found by the search.
pipe_best = automl.best_pipeline
# Bare name so the notebook echoes the pipeline and its parameters.
pipe_best
GeneratedPipeline(parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'LightGBM Classifier':{'boosting_type': 'gbdt', 'learning_rate': 0.1, 'n_estimators': 100, 'max_depth': 0, 'num_leaves': 31, 'min_child_samples': 20, 'n_jobs': -1, 'bagging_freq': 0, 'bagging_fraction': 0.9},})
# Describe the pipeline in the SECOND row of the rankings (iloc[1]).
# NOTE(review): the top-ranked pipeline is at iloc[0]; confirm that
# describing the runner-up rather than the best pipeline is intended.
automl.describe_pipeline(automl.rankings.iloc[1]["id"])
************************************************ * Mode Baseline Binary Classification Pipeline * ************************************************ Problem Type: binary Model Family: Baseline Pipeline Steps ============== 1. Baseline Classifier * strategy : mode Training ======== Training for binary problems. Objective to optimize binary classification pipeline thresholds for: <evalml.objectives.fraud_cost.FraudCost object at 0x7f827a5c0ca0> Total training time (including CV): 4.9 seconds Cross Validation ---------------- Fraud Cost AUC F1 Precision # Training # Testing 0 0.001 0.500 0.000 0.000 45568.000 170884.000 mean 0.001 0.500 0.000 0.000 - - std - - - - - - coef of var - - inf inf - -
# Fit the selected pipeline on the training split.
pipe_best.fit(df_Xtrain, ytrain)
GeneratedPipeline(parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'LightGBM Classifier':{'boosting_type': 'gbdt', 'learning_rate': 0.1, 'n_estimators': 100, 'max_depth': 0, 'num_leaves': 31, 'min_child_samples': 20, 'n_jobs': -1, 'bagging_freq': 0, 'bagging_fraction': 0.9},})
# Metrics to report on the test split: four named objectives plus the
# custom fraud-cost objective instance.
objectives = ["auc","f1","precision", "recall", obj_fraud]
pipe_best.score(df_Xtest, ytest, objectives=objectives)
OrderedDict([('AUC', 0.5192970266328253), ('F1', 0.21052631578947367), ('Precision', 0.1553398058252427), ('Recall', 0.32653061224489793), ('Fraud Cost', 0.0004607514822574961)])
# List the public helpers in evalml.model_understanding.
show_methods(evalml.model_understanding)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | binary_objective_vs_threshold | explain_predictions_best_worst | graph_precision_recall_curve | partial_dependence |
1 | calculate_permutation_importance | graph_binary_objective_vs_threshold | graph_prediction_vs_actual | precision_recall_curve |
2 | confusion_matrix | graph_confusion_matrix | graph_roc_curve | prediction_explanations |
3 | explain_prediction | graph_partial_dependence | graphs | roc_curve |
4 | explain_predictions | graph_permutation_importance | normalize_confusion_matrix |
# Short alias for evalml's graphing module, used by the plotting cells below.
from evalml.model_understanding import graphs as evgraphs
# List its public names (this also shows names the module itself imports).
show_methods(evgraphs)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | LabelBinarizer | get_objective | import_or_raise | sk_partial_dependence |
1 | ModelFamily | graph_binary_objective_vs_threshold | jupyter_check | sk_permutation_importance |
2 | NullsInColumnWarning | graph_confusion_matrix | normalize_confusion_matrix | sklearn_auc |
3 | ProblemTypes | graph_partial_dependence | np | sklearn_confusion_matrix |
4 | binary_objective_vs_threshold | graph_permutation_importance | partial_dependence | sklearn_precision_recall_curve |
5 | calculate_permutation_importance | graph_precision_recall_curve | pd | sklearn_roc_curve |
6 | confusion_matrix | graph_prediction_vs_actual | precision_recall_curve | unique_labels |
7 | copy | graph_roc_curve | roc_curve | warnings |
8 | evalml |
# Test labels, hard predictions and class probabilities as NumPy arrays.
ytest = np.array(ser_ytest).flatten()
ypreds = np.array(pipe_best.predict(df_Xtest)).flatten()
yprobs2d = pipe_best.predict_proba(df_Xtest).to_numpy()
# Column 1 is used as the score for the ROC and precision-recall curves.
# NOTE(review): presumably the probability of class 1 — confirm column order.
yprobs = yprobs2d[:,1]
# The confusion matrix takes hard labels; the two curves take the scores.
evgraphs.graph_confusion_matrix(ytest, ypreds)
evgraphs.graph_roc_curve(ytest,yprobs)
evgraphs.graph_precision_recall_curve(ytest,yprobs)
# List only the evgraphs names that contain 'graph'.
show_methods(evgraphs,contains='graph')
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | graph_binary_objective_vs_threshold | graph_partial_dependence | graph_precision_recall_curve | graph_roc_curve |
1 | graph_confusion_matrix | graph_permutation_importance | graph_prediction_vs_actual |
# Show the first two training rows again as a reminder of the feature columns.
df_Xtrain.head(2)
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | V13 | V14 | V15 | V16 | V17 | V18 | V19 | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
211885 | 138616.0 | -1.137612 | 2.345154 | -1.767247 | 0.833982 | 0.973168 | -0.073571 | 0.802433 | 0.733137 | -1.154087 | -0.520340 | 0.494117 | 0.799935 | 0.494576 | -0.479666 | -0.917177 | -0.184117 | 1.189459 | 0.937244 | 0.960749 | 0.062820 | 0.114953 | 0.430613 | -0.240819 | 0.124011 | 0.187187 | -0.402251 | 0.196277 | 0.190732 | 39.46 |
12542 | 21953.0 | -1.028649 | 1.141569 | 2.492561 | -0.242233 | 0.452842 | -0.384273 | 1.256026 | -0.816401 | 1.964560 | -0.014216 | 0.432153 | -2.140921 | 2.274477 | 0.114128 | -1.652894 | -0.617302 | 0.243791 | -0.426168 | -0.493177 | 0.350032 | -0.380356 | -0.037432 | -0.503934 | 0.407129 | 0.604252 | 0.233015 | -0.433132 | -0.491892 | 7.19 |
# Partial-dependence graph for the 'Time' column, evaluated on the test split.
feature = 'Time'
evgraphs.graph_partial_dependence(pipe_best,df_Xtest,feature=feature)
# Only the first N test rows are passed to the two graphs below.
# NOTE(review): the recorded confusion matrix shows 98 positives in 56,962
# test rows, so a 100-row slice may contain no fraud cases at all — confirm
# that this subsample is large enough for meaningful plots.
N = 100
# Fraud-cost objective as a function of the decision threshold.
evgraphs.graph_binary_objective_vs_threshold(
pipe_best,df_Xtest.iloc[:N],ytest[:N],objective=obj_fraud)
# Permutation importance measured with the same fraud-cost objective.
evgraphs.graph_permutation_importance(pipe_best,df_Xtest.iloc[:N],ytest[:N],
objective=obj_fraud)
import sklearn.metrics as skmetrics
import scikitplot.metrics as skpmetrics
def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
    """Report, save and optionally plot metrics for a binary classifier.

    Parameters
    ----------
    model_name : str
        Label used as the index of the result row and in the CSV file name.
    ytest : array-like
        True binary labels.
    ypreds : array-like
        Predicted hard labels.
    yprobs2d : 2-D array-like
        Class probabilities with one column per class; column 1 is used as
        the positive-class score for the AUC.
    show_plots : bool, default True
        When true, draw the precision-recall curve, the ROC curve and the
        confusion matrix with scikit-plot.

    Returns
    -------
    None

    Side effects
    ------------
    Prints the classification report and confusion matrix, displays a
    one-row metrics table, and writes it to ``model_<model_name>.csv`` in
    the current directory on Colab or in ``../outputs`` otherwise.
    """
    import os
    import numpy as np
    import sklearn.metrics as skmetrics
    import scikitplot.metrics as skpmetrics

    acc = skmetrics.accuracy_score(ytest,ypreds)
    precision = skmetrics.precision_score(ytest,ypreds)
    recall = skmetrics.recall_score(ytest,ypreds)
    f1 = skmetrics.f1_score(ytest,ypreds)
    # ROC AUC is a ranking metric: score it with the positive-class
    # probabilities. Feeding hard labels collapses the curve to one point.
    auc = skmetrics.roc_auc_score(ytest,np.asarray(yprobs2d)[:,1])

    print(skmetrics.classification_report(ytest,ypreds))
    print(skmetrics.confusion_matrix(ytest,ypreds))

    df_res = pd.DataFrame({'Accuracy':[acc],
                           'Precision': [precision],
                           'Recall': [recall],
                           'F1-score': [f1],
                           'AUC': [auc]},index=[model_name])
    display(df_res.style.format("{:.4f}"))

    # Create only the directory that is actually written to, and join the
    # path properly ('.' + 'model_x.csv' produced a hidden '.model_x.csv').
    out_dir = '.' if ENV_COLAB else '../outputs'
    os.makedirs(out_dir, exist_ok=True)
    df_res.to_csv(os.path.join(out_dir, f'model_{model_name}.csv'),index=True)

    if show_plots:
        skpmetrics.plot_precision_recall(ytest,yprobs2d) # more focus on minority
        # plot_roc replaces the deprecated plot_roc_curve
        skpmetrics.plot_roc(ytest,yprobs2d) # equal focus on both groups
        skpmetrics.plot_confusion_matrix(ytest,ypreds)
# Evaluate the evalml pipeline's test predictions; results are stored under
# the name 'evalml'.
model_eval_bin('evalml',ytest,ypreds,yprobs2d,show_plots=True)
precision recall f1-score support 0 1.00 1.00 1.00 56864 1 0.16 0.33 0.21 98 accuracy 1.00 56962 macro avg 0.58 0.66 0.60 56962 weighted avg 1.00 1.00 1.00 56962 [[56690 174] [ 66 32]]
Accuracy | Precision | Recall | F1-score | AUC | |
---|---|---|---|---|---|
evalml | 0.9958 | 0.1553 | 0.3265 | 0.2105 | 0.6617 |
# Total wall-clock time since the first cell.
time_taken = time.time() - time_start_notebook
# Round to whole seconds once, BEFORE splitting: formatting a fractional
# remainder with {:.0f} could print e.g. "1 min 60 secs" for 119.7 s.
total_secs = int(round(time_taken))
h, rem = divmod(total_secs, 60*60)
m, s = divmod(rem, 60)
print('Time taken to run whole notebook: {:.0f} hr '
      '{:.0f} min {:.0f} secs'.format(h, m, s))
Time taken to run whole notebook: 0 hr 2 min 18 secs