import time
time_start_notebook = time.time()
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import os
import time
# random state
# random state
SEED = 0
RNG = np.random.RandomState(SEED)
# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 50)
print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
[('numpy', '1.16.4'), ('pandas', '0.25.3'), ('seaborn', '0.9.0'), ('matplotlib', '3.1.1')]
# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan/")
from bhishan import bp
df = pd.read_csv('../data/raw/creditcard.csv.zip',compression='zip')
print(df.shape)
df.head()
(284807, 31)
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | V13 | V14 | V15 | V16 | V17 | V18 | V19 | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | 0.090794 | -0.551600 | -0.617801 | -0.991390 | -0.311169 | 1.468177 | -0.470401 | 0.207971 | 0.025791 | 0.403993 | 0.251412 | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | -0.166974 | 1.612727 | 1.065235 | 0.489095 | -0.143772 | 0.635558 | 0.463917 | -0.114805 | -0.183361 | -0.145783 | -0.069083 | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | 0.207643 | 0.624501 | 0.066084 | 0.717293 | -0.165946 | 2.345865 | -2.890083 | 1.109969 | -0.121359 | -2.261857 | 0.524980 | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | -0.054952 | -0.226487 | 0.178228 | 0.507757 | -0.287924 | -0.631418 | -1.059647 | -0.684093 | 1.965775 | -1.232622 | -0.208038 | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | 0.753074 | -0.822843 | 0.538196 | 1.345852 | -1.119670 | 0.175121 | -0.451449 | -0.237033 | -0.038195 | 0.803487 | 0.408542 | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
target = 'Class'
df[target].value_counts()
0 284315 1 492 Name: Class, dtype: int64
df[target].value_counts(normalize=True)*1000
0 998.272514 1 1.727486 Name: Class, dtype: float64
# RobustScaler is less prone to outliers.
from sklearn.preprocessing import StandardScaler, RobustScaler
scaler = RobustScaler()
df['scaled_amount'] = scaler.fit_transform(df['Amount'].values.reshape(-1,1))
df['scaled_time'] = scaler.fit_transform(df['Time'].values.reshape(-1,1))
Cons:
# without removing outliers
n = df[target].value_counts().values[-1]
df_under = (df.groupby(target)
.apply(lambda x: x.sample(n,random_state=SEED))
.reset_index(drop=True)
)
df_under[target].value_counts()
1 492 0 492 Name: Class, dtype: int64
df.shape, df_under.shape
# out of 284k samples, we now have 984 samples for undersampling
# we have lost 283k samples and have only 1k samples
# this is a lot of information losss, but still I will test the
# classifiers with this undersampling method.
#
# Later, I will use oversampling methods to do the modelling.
((284807, 33), (984, 33))
from sklearn.model_selection import train_test_split
df.columns
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class', 'scaled_amount', 'scaled_time'], dtype='object')
(Xtrain_under, Xtest_under,
ytrain_under, ytest_under) = \
train_test_split(df_under.drop([target],1),
df_under[target],
random_state=SEED,
test_size=0.2,
#stratify=df_under[target] # do no use stratify here.
)
print(df.shape, Xtrain_under.shape, Xtest_under.shape)
columns = df.columns.difference([target]).values.tolist() + [target]
df_train_under = pd.DataFrame(data=np.c_[Xtrain_under,
ytrain_under],
columns=columns)
df_test_under = pd.DataFrame(data=np.c_[Xtest_under
,ytest_under],
columns=columns)
print(df.shape, df_train_under.shape, df_test_under.shape)
df_train_under.head(2)
(284807, 33) (787, 32) (197, 32) (284807, 33) (787, 33) (197, 33)
Amount | Time | V1 | V10 | V11 | V12 | V13 | V14 | V15 | V16 | V17 | V18 | V19 | V2 | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | scaled_amount | scaled_time | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 147856.0 | 1.915851 | 0.665687 | -0.884928 | 3.489039 | 0.842344 | 0.315856 | 0.220146 | -0.016990 | -1.539009 | 1.678634 | 0.800689 | 0.763875 | 0.839985 | 0.430142 | -1.024695 | 1.021889 | -1.179907 | 0.101768 | -1.527063 | -0.210279 | 0.289098 | 0.828991 | 0.088947 | 0.716084 | 0.127340 | 0.079675 | -0.045138 | -0.054328 | 0.00 | -0.307413 | 0.742067 | 0.0 |
1 | 155662.0 | -1.928613 | 4.601506 | -7.124053 | 5.716088 | 1.026579 | -3.189073 | -2.261897 | 1.185096 | -4.441942 | -6.646154 | 3.827868 | -6.518649 | 0.251137 | -12.456706 | -0.649166 | -1.283145 | -2.718560 | -0.085466 | -2.097385 | 0.328796 | 0.602291 | -0.541287 | -0.354639 | -0.701492 | -0.030973 | 0.034070 | 0.573393 | 0.294686 | 0.77 | -0.296653 | 0.833774 | 1.0 |
df_train_under[target].value_counts()
1.0 401 0.0 386 Name: Class, dtype: int64
df_test_under[target].value_counts()
0.0 106 1.0 91 Name: Class, dtype: int64
for x in [df, df_under,
df_train_under, df_test_under,
]:
print(x.isnull().sum().sum())
0 0 0 0
df_under.columns
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class', 'scaled_amount', 'scaled_time'], dtype='object')
df_train_under.columns
Index(['Amount', 'Time', 'V1', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V2', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'scaled_amount', 'scaled_time', 'Class'], dtype='object')
</div>
features_with_log = df_under.columns.difference(
['Amount','Time','Class']).values.tolist()
features = features_with_log
print(features)
['V1', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V2', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'scaled_amount', 'scaled_time']
# numpy arrays
Xtrain = df_train_under[features].values
Xtest = df_test_under[features].values
ytrain = df_train_under[target].values.ravel()
ytest = df_test_under[target].values.ravel()
Xtrain.shape, ytrain.shape, Xtest.shape, ytest.shape
((787, 30), (787,), (197, 30), (197,))
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=None, splitter='best')
model = DecisionTreeClassifier(max_depth=3,random_state=SEED,
min_samples_leaf=20,min_samples_split=20)
model.fit(Xtrain,ytrain)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=20, min_samples_split=20, min_weight_fraction_leaf=0.0, presort=False, random_state=100, splitter='best')
df_imp = pd.Series(model.feature_importances_, index=features)
df_imp.sort_values(ascending=False,inplace=True)
df_imp
V20 0.946110 V11 0.035036 V2 0.009760 V16 0.009094 scaled_time 0.000000 scaled_amount 0.000000 V10 0.000000 V12 0.000000 V13 0.000000 V14 0.000000 V15 0.000000 V17 0.000000 V18 0.000000 V19 0.000000 V21 0.000000 V22 0.000000 V23 0.000000 V24 0.000000 V25 0.000000 V26 0.000000 V27 0.000000 V28 0.000000 V3 0.000000 V4 0.000000 V5 0.000000 V6 0.000000 V7 0.000000 V8 0.000000 V9 0.000000 V1 0.000000 dtype: float64
fig,ax = plt.subplots(figsize=(14,8))
df_imp.plot.bar(ax=ax)
add_text_barplot(ax)
from yellowbrick.model_selection import FeatureImportances
viz = FeatureImportances(model,labels=features,)
viz.fit(Xtrain, ytrain)
viz.show()
<matplotlib.axes._subplots.AxesSubplot at 0x11bb055f8>
from eli5 import show_weights
show_weights(model,feature_names=features)
Weight | Feature |
---|---|
0.9461 | V20 |
0.0350 | V11 |
0.0098 | V2 |
0.0091 | V16 |
0 | V12 |
0 | V13 |
0 | V14 |
0 | V15 |
0 | V17 |
0 | V18 |
0 | V22 |
0 | V21 |
0 | V10 |
0 | V23 |
0 | V1 |
0 | V25 |
0 | V7 |
0 | V9 |
0 | V19 |
0 | scaled_time |
… 10 more … |
from sklearn import tree
fig, ax = plt.subplots(figsize=(24,12))
tree.plot_tree(model,feature_names=features,class_names=['NotFraud','Fraud'],
proportion=True,filled=True, ax=ax,fontsize=14);
plt.savefig('../images/decision_tree.png',dpi=300)
import pydotplus
from IPython.display import Image
dot_data = tree.export_graphviz(model, out_file=None,
feature_names=features,
class_names=['Not Fraud','Fraud'])
# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)
# Show graph
Image(graph.create_png())
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
graph = Source(tree.export_graphviz(model,
out_file=None,
feature_names=features,
class_names=['Not Fraud','Fraud'],
filled = True))
display(SVG(graph.pipe(format='svg')))
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
from ipywidgets import interactive
def plot_tree(crit, split, depth, min_split, min_leaf=0.2):
# define the model
model = DecisionTreeClassifier(random_state=SEED,
criterion=crit,
splitter=split,
max_depth=depth,
min_samples_split=min_split,
min_samples_leaf=min_leaf)
# fit the model
model.fit(Xtrain, ytrain)
# dot file
graph = Source(tree.export_graphviz(model,
out_file=None,
feature_names=features,
class_names=['Not Fraud', 'Fraud'],
filled=True))
# display the image
display(SVG(graph.pipe(format='svg')))
return model
inter=interactive(plot_tree, crit = ["gini", "entropy"],
split = ["best", "random"],
depth=[1,2,3,4],
min_split=(0.1,1),
min_leaf=(0.1,0.5))
display(inter)
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 0 min 8 secs
import subprocess
subprocess.call(['python', '-m', 'nbconvert', '*.ipynb'])
0
!mv *.html ../html/
!rm -rf catboost_info
rm: catboost_info: No such file or directory