import time
time_start_notebook = time.time()

import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import os
import time

# random state
# random state
SEED = 0
RNG = np.random.RandomState(SEED)

# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 50)

print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])

[('numpy', '1.16.4'), ('pandas', '0.25.3'), ('seaborn', '0.9.0'), ('matplotlib', '3.1.1')]


# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan/")
from bhishan import bp


df = pd.read_csv('../data/raw/creditcard.csv.zip',compression='zip')
print(df.shape)
df.head()

(284807, 31)


target = 'Class'
df[target].value_counts()

0    284315
1       492
Name: Class, dtype: int64


df[target].value_counts(normalize=True)*1000

0    998.272514
1      1.727486
Name: Class, dtype: float64


# RobustScaler is less prone to outliers.
from sklearn.preprocessing import StandardScaler, RobustScaler

scaler = RobustScaler()

df['scaled_amount'] = scaler.fit_transform(df['Amount'].values.reshape(-1,1))
df['scaled_time'] = scaler.fit_transform(df['Time'].values.reshape(-1,1))


# without removing outliers
n = df[target].value_counts().values[-1]

df_under = (df.groupby(target)
                .apply(lambda x: x.sample(n,random_state=SEED))
                .reset_index(drop=True)
               )

df_under[target].value_counts()

1    492
0    492
Name: Class, dtype: int64


df.shape, df_under.shape
# out of 284k samples, we now have 984 samples for undersampling
# we have lost 283k samples and have only 1k samples
# this is a lot of information losss, but still I will test the 
# classifiers with this undersampling method.
#
# Later, I will use oversampling methods to do the modelling.

((284807, 33), (984, 33))


from sklearn.model_selection import train_test_split


df.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class', 'scaled_amount', 'scaled_time'],
      dtype='object')


(Xtrain_under, Xtest_under,
 ytrain_under, ytest_under) = \
    train_test_split(df_under.drop([target],1),
                     df_under[target],
                     random_state=SEED,
                     test_size=0.2,
                     #stratify=df_under[target] # do no use stratify here.
                     )

print(df.shape, Xtrain_under.shape, Xtest_under.shape)
columns = df.columns.difference([target]).values.tolist() + [target]

df_train_under = pd.DataFrame(data=np.c_[Xtrain_under,
                                         ytrain_under],
                              columns=columns)

df_test_under = pd.DataFrame(data=np.c_[Xtest_under
                                        ,ytest_under],
                             columns=columns)

print(df.shape, df_train_under.shape, df_test_under.shape)
df_train_under.head(2)

(284807, 33) (787, 32) (197, 32)
(284807, 33) (787, 33) (197, 33)


df_train_under[target].value_counts()

1.0    401
0.0    386
Name: Class, dtype: int64


df_test_under[target].value_counts()

0.0    106
1.0     91
Name: Class, dtype: int64


for x in [df, df_under,
          df_train_under, df_test_under,
         ]:
    print(x.isnull().sum().sum())

0
0
0
0


df_under.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class', 'scaled_amount', 'scaled_time'],
      dtype='object')


df_train_under.columns

Index(['Amount', 'Time', 'V1', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16',
       'V17', 'V18', 'V19', 'V2', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25',
       'V26', 'V27', 'V28', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
       'scaled_amount', 'scaled_time', 'Class'],
      dtype='object')


features_with_log = df_under.columns.difference(
    ['Amount','Time','Class']).values.tolist()

features = features_with_log
print(features)

['V1', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V2', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'scaled_amount', 'scaled_time']


# numpy arrays
Xtrain = df_train_under[features].values
Xtest = df_test_under[features].values

ytrain = df_train_under[target].values.ravel()
ytest = df_test_under[target].values.ravel()

Xtrain.shape, ytrain.shape, Xtest.shape,  ytest.shape

((787, 30), (787,), (197, 30), (197,))


from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')


model = DecisionTreeClassifier(max_depth=3,random_state=SEED,
                               min_samples_leaf=20,min_samples_split=20)


model.fit(Xtrain,ytrain)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=20, min_samples_split=20,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=100, splitter='best')


df_imp = pd.Series(model.feature_importances_, index=features)
df_imp.sort_values(ascending=False,inplace=True)
df_imp

V20              0.946110
V11              0.035036
V2               0.009760
V16              0.009094
scaled_time      0.000000
scaled_amount    0.000000
V10              0.000000
V12              0.000000
V13              0.000000
V14              0.000000
V15              0.000000
V17              0.000000
V18              0.000000
V19              0.000000
V21              0.000000
V22              0.000000
V23              0.000000
V24              0.000000
V25              0.000000
V26              0.000000
V27              0.000000
V28              0.000000
V3               0.000000
V4               0.000000
V5               0.000000
V6               0.000000
V7               0.000000
V8               0.000000
V9               0.000000
V1               0.000000
dtype: float64


fig,ax = plt.subplots(figsize=(14,8))
df_imp.plot.bar(ax=ax)
add_text_barplot(ax)


from yellowbrick.model_selection import FeatureImportances

viz = FeatureImportances(model,labels=features,)
viz.fit(Xtrain, ytrain)
viz.show()

<matplotlib.axes._subplots.AxesSubplot at 0x11bb055f8>


from eli5 import show_weights

show_weights(model,feature_names=features)



Tree


0

V20 <= -1.577
gini = 0.5
samples = 100.0%
value = [0.49, 0.51]


1

V20 <= -3.192
gini = 0.072
samples = 47.3%
value = [0.038, 0.962]


0->1


True


6

V11 <= 1.894
gini = 0.186
samples = 52.7%
value = [0.896, 0.104]


0->6


False


2

gini = 0.0
samples = 42.1%
value = [0.0, 1.0]


1->2


3

V16 <= -0.375
gini = 0.45
samples = 5.2%
value = [0.341, 0.659]


1->3


4

gini = 0.255
samples = 2.5%
value = [0.15, 0.85]


3->4


5

gini = 0.499
samples = 2.7%
value = [0.524, 0.476]


3->5


7

V2 <= -1.52
gini = 0.124
samples = 47.8%
value = [0.934, 0.066]


6->7


10

gini = 0.497
samples = 5.0%
value = [0.538, 0.462]


6->10


8

gini = 0.417
samples = 3.4%
value = [0.704, 0.296]


7->8


9

gini = 0.093
samples = 44.3%
value = [0.951, 0.049]


7->9


from sklearn import tree

fig, ax = plt.subplots(figsize=(24,12))

tree.plot_tree(model,feature_names=features,class_names=['NotFraud','Fraud'],
                proportion=True,filled=True, ax=ax,fontsize=14);

plt.savefig('../images/decision_tree.png',dpi=300)


import pydotplus
from IPython.display import Image

dot_data = tree.export_graphviz(model, out_file=None, 
                                feature_names=features,  
                                class_names=['Not Fraud','Fraud'])

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)  

# Show graph
Image(graph.create_png())


from IPython.display import SVG
from graphviz import Source
from IPython.display import display


graph = Source(tree.export_graphviz(model,
                                    out_file=None,
                                    feature_names=features,
                                    class_names=['Not Fraud','Fraud'],
                                    filled = True))

display(SVG(graph.pipe(format='svg')))


from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from IPython.display import SVG
from graphviz import Source
from IPython.display import display                               
from ipywidgets import interactive


def plot_tree(crit, split, depth, min_split, min_leaf=0.2):
    # define the model
    model = DecisionTreeClassifier(random_state=SEED,
                                   criterion=crit, 
                                   splitter=split,
                                   max_depth=depth,
                                   min_samples_split=min_split,
                                   min_samples_leaf=min_leaf)
    
    # fit the model
    model.fit(Xtrain, ytrain)

    # dot file
    graph = Source(tree.export_graphviz(model,
                                        out_file=None,
                                        feature_names=features,
                                        class_names=['Not Fraud', 'Fraud'],
                                        filled=True))

    # display the image
    display(SVG(graph.pipe(format='svg')))
    return model


inter=interactive(plot_tree, crit = ["gini", "entropy"],
                  split = ["best", "random"],
                  depth=[1,2,3,4],
                  min_split=(0.1,1),
                  min_leaf=(0.1,0.5))

display(inter)


time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))

Time taken to run whole notebook: 0 hr 0 min 8 secs


import subprocess
subprocess.call(['python', '-m', 'nbconvert', '*.ipynb'])

0


!mv *.html ../html/


!rm -rf catboost_info

rm: catboost_info: No such file or directory

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	V11	V12	V13	V14	V15	V16	V17	V18	V19	V20	V21	V22	V23	V24	V25	V26	V27	V28	Amount
0	0.0	-1.359807	-0.072781	2.536347	1.378155	-0.338321	0.462388	0.239599	0.098698	0.363787	0.090794	-0.551600	-0.617801	-0.991390	-0.311169	1.468177	-0.470401	0.207971	0.025791	0.403993	0.251412	-0.018307	0.277838	-0.110474	0.066928	0.128539	-0.189115	0.133558	-0.021053	149.62
1	0.0	1.191857	0.266151	0.166480	0.448154	0.060018	-0.082361	-0.078803	0.085102	-0.255425	-0.166974	1.612727	1.065235	0.489095	-0.143772	0.635558	0.463917	-0.114805	-0.183361	-0.145783	-0.069083	-0.225775	-0.638672	0.101288	-0.339846	0.167170	0.125895	-0.008983	0.014724	2.69
2	1.0	-1.358354	-1.340163	1.773209	0.379780	-0.503198	1.800499	0.791461	0.247676	-1.514654	0.207643	0.624501	0.066084	0.717293	-0.165946	2.345865	-2.890083	1.109969	-0.121359	-2.261857	0.524980	0.247998	0.771679	0.909412	-0.689281	-0.327642	-0.139097	-0.055353	-0.059752	378.66
3	1.0	-0.966272	-0.185226	1.792993	-0.863291	-0.010309	1.247203	0.237609	0.377436	-1.387024	-0.054952	-0.226487	0.178228	0.507757	-0.287924	-0.631418	-1.059647	-0.684093	1.965775	-1.232622	-0.208038	-0.108300	0.005274	-0.190321	-1.175575	0.647376	-0.221929	0.062723	0.061458	123.50
4	2.0	-1.158233	0.877737	1.548718	0.403034	-0.407193	0.095921	0.592941	-0.270533	0.817739	0.753074	-0.822843	0.538196	1.345852	-1.119670	0.175121	-0.451449	-0.237033	-0.038195	0.803487	0.408542	-0.009431	0.798278	-0.137458	0.141267	-0.206010	0.502292	0.219422	0.215153	69.99

	Amount	Time	V1	V10	V11	V12	V13	V14	V15	V16	V17	V18	V19	V2	V20	V21	V22	V23	V24	V25	V26	V27	V28	V3	V4	V5	V6	V7	V8	V9	scaled_amount	scaled_time	Class
0	147856.0	1.915851	0.665687	-0.884928	3.489039	0.842344	0.315856	0.220146	-0.016990	-1.539009	1.678634	0.800689	0.763875	0.839985	0.430142	-1.024695	1.021889	-1.179907	0.101768	-1.527063	-0.210279	0.289098	0.828991	0.088947	0.716084	0.127340	0.079675	-0.045138	-0.054328	0.00	-0.307413	0.742067	0.0
1	155662.0	-1.928613	4.601506	-7.124053	5.716088	1.026579	-3.189073	-2.261897	1.185096	-4.441942	-6.646154	3.827868	-6.518649	0.251137	-12.456706	-0.649166	-1.283145	-2.718560	-0.085466	-2.097385	0.328796	0.602291	-0.541287	-0.354639	-0.701492	-0.030973	0.034070	0.573393	0.294686	0.77	-0.296653	0.833774	1.0

Table of Contents

Imports¶

Load the data¶

Preprocessing¶

Class Balance¶

Scaling¶

Random Under Sampling¶

Train Test split with stratify for imbalanced data¶

Check for nans before modelling¶

Modelling¶

Data Processing before modelling¶

Decision Tree Classification¶

Feature Importance¶

plot tree using sklearn tree¶

plot using pydotplus and export_graphviz¶

Plot using graphviz Source and IPython display SVG¶

Interactive plot using ipywidgets¶

Run Time¶

Weight	Feature
0.9461	V20
0.0350	V11
0.0098	V2
0.0091	V16
0	V12
0	V13
0	V14
0	V15
0	V17
0	V18
0	V22
0	V21
0	V10
0	V23
0	V1
0	V25
0	V7
0	V9
0	V19
0	scaled_time
… 10 more …