The dataset contains transactions made by credit cards in September 2013 by European cardholders.
This dataset presents transactions that occurred over two days, with 492 frauds out of 284,807 transactions.
The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions.
It contains only numerical input variables which are the result of a PCA transformation.
Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data.
Features V1, V2, ..., V28 are the principal components obtained with PCA; the only features that have not been transformed with PCA are 'Time' and 'Amount'.
Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. Feature 'Amount' is the transaction amount; this feature can be used for example-dependent cost-sensitive learning.
Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.
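Since 'Amount' can serve as a per-example cost, here is a minimal sketch of what example-dependent cost-sensitive learning could look like using scikit-learn's sample_weight; the weighting scheme below is an illustrative assumption, not part of the dataset documentation.
import pandas as pd
from sklearn.linear_model import LogisticRegression

df = pd.read_csv('../data/raw/creditcard.csv.zip', compression='zip')
X, y = df.drop(columns='Class'), df['Class']
# assumed weighting: an error on a large transaction costs more
sample_weight = 1.0 + df['Amount'] / df['Amount'].mean()
clf = LogisticRegression(max_iter=1000)
clf.fit(X, y, sample_weight=sample_weight)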
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import os
import time
# random state
SEED = 0
RNG = np.random.RandomState(SEED)
# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 50)
print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
[('numpy', '1.19.5'), ('pandas', '1.1.4'), ('seaborn', '0.11.0'), ('matplotlib', '3.2.1')]
import scipy
from scipy import stats
import IPython
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
def show_method_attributes(method, ncols=7):
    """Show all the attributes of a given method.

    Example:
    ========
    show_method_attributes(list)
    """
    x = [i for i in dir(method) if i[0].islower()]
    x = [i for i in x if i not in 'os np pd sys time psycopg2'.split()]
    return pd.DataFrame(np.array_split(x, ncols)).T.fillna('')
def json_dump_tofile(myjson, ofile, sort_keys=False):
    """Write a json dictionary to a data file.

    Usage:
    myjson = {'num': 5, 'my_list': [1, 2, 'apple']}
    json_dump_tofile(myjson, ofile)
    """
    import io
    import json

    with io.open(ofile, 'w', encoding='utf8') as fo:
        json_str = json.dumps(myjson,
                              indent=4,
                              sort_keys=sort_keys,
                              separators=(',', ': '),
                              ensure_ascii=False)
        fo.write(str(json_str))
def donut_plot(col, ax, df, text='', colors=['navy', 'crimson'], labels=['non-fraud', 'fraud']):
    """Plot a customized donut chart of the target column (binary task).

    Input:
        col: target column of a binary classification task
        ax: matplotlib axis
        df: DataFrame object with the data
        text: the text to be plotted at the center of the donut chart
        colors: list of two colors used to identify the target classes
        labels: list of labels to describe the target classes
    Output:
        a customized donut chart
    Reference: https://www.kaggle.com/thiagopanini/improving-fraud-detection-with-resampling
    """
    sizes = df[col].value_counts().values
    center_circle = plt.Circle((0, 0), 0.80, color='white')
    ax.pie((sizes[0], sizes[1]), labels=labels, colors=colors, autopct='%1.2f%%')
    ax.add_artist(center_circle)
    kwargs = dict(size=20, fontweight='bold', va='center')
    ax.text(0, 0, text, ha='center', **kwargs)
df = pd.read_csv('../data/raw/creditcard.csv.zip',compression='zip')
print(df.shape)
df.head()
(284807, 31)
 | Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | V13 | V14 | V15 | V16 | V17 | V18 | V19 | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | 0.090794 | -0.551600 | -0.617801 | -0.991390 | -0.311169 | 1.468177 | -0.470401 | 0.207971 | 0.025791 | 0.403993 | 0.251412 | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | -0.166974 | 1.612727 | 1.065235 | 0.489095 | -0.143772 | 0.635558 | 0.463917 | -0.114805 | -0.183361 | -0.145783 | -0.069083 | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | 0.207643 | 0.624501 | 0.066084 | 0.717293 | -0.165946 | 2.345865 | -2.890083 | 1.109969 | -0.121359 | -2.261857 | 0.524980 | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | -0.054952 | -0.226487 | 0.178228 | 0.507757 | -0.287924 | -0.631418 | -1.059647 | -0.684093 | 1.965775 | -1.232622 | -0.208038 | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | 0.753074 | -0.822843 | 0.538196 | 1.345852 | -1.119670 | 0.175121 | -0.451449 | -0.237033 | -0.038195 | 0.803487 | 0.408542 | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
sns.catplot(x='Class' , kind='count' , data=df , palette=['r','g'])
fig, ax = plt.subplots(figsize=(7, 7))
text = f'Total: \n\n\n{str(len(df))} samples'
donut_plot(col='Class', ax=ax, df=df, text=text)
ax.set_title('Target Class Balance', size=14)
plt.savefig('../reports/figures/class_balance_donut_plot.png',dpi=300)
plt.show()
df['Class'].value_counts()
0    284315
1       492
Name: Class, dtype: int64
df['Class'].value_counts(normalize=True) * 100
0    99.827251
1     0.172749
Name: Class, dtype: float64
# Only 0.17% of transactions are fraud:
# out of 10,000 transactions, about 17 are fraudulent.
# shuffle data
df = df.sample(frac=1, random_state=SEED)
df_low = df.loc[df['Class'] == 1]
df_high = df.loc[df['Class'] == 0][:df_low.shape[0]]
df_balanced = pd.concat([df_low, df_high])
df_balanced = df_balanced.sample(frac=1, random_state=SEED)
df_balanced['Class'].value_counts()
1    492
0    492
Name: Class, dtype: int64
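The slicing above works; an equivalent, arguably more explicit sketch uses sklearn.utils.resample and produces the same balanced frame:
from sklearn.utils import resample

df_high_ds = resample(df.loc[df['Class'] == 0], replace=False,
                      n_samples=int((df['Class'] == 1).sum()),
                      random_state=SEED)
df_balanced_alt = pd.concat([df.loc[df['Class'] == 1], df_high_ds])
df_balanced_alt = df_balanced_alt.sample(frac=1, random_state=SEED)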
sns.catplot(x='Class' , kind='count' , data=df_balanced , palette=['r','g'],)
df.drop(columns='Class').corrwith(df['Class']).sort_values()\
.plot.bar(figsize = (12, 8), title = "Correlation with class",
fontsize = 12,rot = 90, grid = True,
color=sns.color_palette('Reds_r',30),ylim=(-0.4,0.4)
)
plt.savefig('../reports/figures/correlation_with_target.png',dpi=300)
# Obs: V17, V14, V12, V10, V16, V3, V7 are negatively correlated with Class.
# V2, V4, V11 have a positive correlation with Class.
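The same observation can be made programmatically; this small sketch ranks features by their correlation with Class using the frame already in memory:
corr = df.drop(columns='Class').corrwith(df['Class'])
print(corr.nsmallest(7))  # strongest negative: V17, V14, V12, V10, V16, V3, V7
print(corr.nlargest(3))   # strongest positive: V11, V4, V2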
f, axes = plt.subplots(ncols=4, figsize=(16,8))
sns.boxplot(x="Class", y="V17", data=df_balanced, palette='Set1', ax=axes[0])
axes[0].set_title('V17 vs Class')
sns.boxplot(x="Class", y="V14", data=df_balanced, palette='Set1', ax=axes[1])
axes[1].set_title('V14 vs Class')
sns.boxplot(x="Class", y="V12", data=df_balanced, palette='Set1', ax=axes[2])
axes[2].set_title('V12 vs Class')
sns.boxplot(x="Class", y="V10", data=df_balanced, palette='Set1', ax=axes[3])
axes[3].set_title('V10 vs Class')
plt.savefig('../reports/figures/negative_correlations_with_target.png',dpi=300)
plt.show()
f, axes = plt.subplots(ncols=4, figsize=(16,8))
sns.boxplot(x="Class", y="V11", data=df_balanced, palette='Set1', ax=axes[0])
axes[0].set_title('V11 vs Class')
sns.boxplot(x="Class", y="V4", data=df_balanced, palette='Set1', ax=axes[1])
axes[1].set_title('V4 vs Class')
sns.boxplot(x="Class", y="V2", data=df_balanced, palette='Set1', ax=axes[2])
axes[2].set_title('V2 vs Class')
sns.boxplot(x="Class", y="V19", data=df_balanced, palette='Set1', ax=axes[3])
axes[3].set_title('V19 vs Class')
plt.savefig('../reports/figures/positive_correlations_with_target.png',dpi=300)
plt.show()
df_corr = df.drop(columns=['Time','Amount','Class']).corr()
df_corr_style = df_corr.style.set_precision(2).background_gradient()
with open('../reports/figures/variables_correlation.html','w') as fo:
fo.write(df_corr_style.render())
df_corr = df.drop(columns=['Time','Amount','Class']).corr()
plt.figure(figsize=(20,14))
plt.subplots_adjust(bottom=0.01)
mask = np.zeros_like(df_corr)
mask[np.triu_indices_from(mask)] = True
cbar_ticks = np.linspace(df_corr.min().min(), df_corr[df_corr!=1].max().max(), 20)
g = sns.heatmap(df_corr, cbar=True, annot=True, fmt='.2g',mask=mask,cmap='tab20_r',
vmin=df_corr.min().min(), vmax=df_corr[df_corr!=1].max().max(),
cbar_kws=dict(ticks=cbar_ticks,shrink=1.0),
annot_kws={"size": 8}
)
g.set_xticklabels(g.get_xmajorticklabels(), fontsize = 16)
g.set_yticklabels(g.get_ymajorticklabels(), fontsize = 16)
cbar = g.collections[0].colorbar
cbar.ax.tick_params(labelsize=20)
plt.tight_layout()
plt.savefig('../reports/figures/correlation_matrix.png',dpi=300);
# 1. To see the large image, open the saved image or the saved html.
# 2. We cannot read the annotations at this size, but we can see the colors.
# 3. I have used the tab20 color palette; it splits the value range into 20 bins.
# Note that these variables are obtained from a PCA transformation, so
# there is virtually no correlation between them; it is almost zero.
#
# Still, we can see the most frequent correlation value is about 0.005e-14.
# We need to note that this dataset is heavily unbalanced; to see a
# more realistic correlation we need to balance the dataset.
df_corr = df_balanced.drop(columns=['Time','Amount','Class']).corr()
plt.figure(figsize=(20,14))
plt.subplots_adjust(bottom=0.01)
# make mask
mask = np.zeros_like(df_corr)
mask[np.triu_indices_from(mask)] = True
# heatmap
g = sns.heatmap(df_corr, cbar=True, annot=True, fmt='.2g',mask=mask)
# x and y ticks fontsize
g.set_xticklabels(g.get_xmajorticklabels(), fontsize = 16)
g.set_yticklabels(g.get_ymajorticklabels(), fontsize = 16)
# cbar ticks labelsize
cbar = g.collections[0].colorbar
cbar.ax.tick_params(labelsize=20)
plt.tight_layout()
plt.savefig('../reports/figures/correlation_matrix_balanced.png',dpi=300);
df.hist(figsize=(20,20));
plt.savefig('../reports/figures/all_features_histogram.png',dpi=300)
%%writefile ../models/create_scatter_matrix.py
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import time

t0 = time.time()
df = pd.read_csv('../data/raw/creditcard.csv.zip', compression='zip')
df = df.sample(n=1000, random_state=0)
myfeatures = ['V{}'.format(i) for i in range(1, 29)] + ['Amount']
# this takes a long time to run; comment it out if not needed.
scatter_matrix(df[myfeatures], diagonal='kde')
plt.savefig('../reports/figures/scatter_matrix_of_all_features.png',dpi=400)
t1 = time.time() - t0
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(t1,60)))
Writing ../models/create_scatter_matrix.py
# now go to terminal and run the script.
df.info()
# There are no missing values; all columns are float64 except Class.
<class 'pandas.core.frame.DataFrame'>
Int64Index: 284807 entries, 49089 to 56088
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     284807 non-null  float64
 22  V22     284807 non-null  float64
 23  V23     284807 non-null  float64
 24  V24     284807 non-null  float64
 25  V25     284807 non-null  float64
 26  V26     284807 non-null  float64
 27  V27     284807 non-null  float64
 28  V28     284807 non-null  float64
 29  Amount  284807 non-null  float64
 30  Class   284807 non-null  int64  
dtypes: float64(30), int64(1)
memory usage: 69.5 MB
df[["Time","Amount","Class"]].describe()
 | Time | Amount | Class |
---|---|---|---|
count | 284807.000000 | 284807.000000 | 284807.000000 |
mean | 94813.859575 | 88.349619 | 0.001727 |
std | 47488.145955 | 250.120109 | 0.041527 |
min | 0.000000 | 0.000000 | 0.000000 |
25% | 54201.500000 | 5.600000 | 0.000000 |
50% | 84692.000000 | 22.000000 | 0.000000 |
75% | 139320.500000 | 77.165000 | 0.000000 |
max | 172792.000000 | 25691.160000 | 1.000000 |
df['Time'].describe()
count    284807.000000
mean      94813.859575
std       47488.145955
min           0.000000
25%       54201.500000
50%       84692.000000
75%      139320.500000
max      172792.000000
Name: Time, dtype: float64
df['Time'].max()/60/60/24 # we have two days of data
1.9999074074074075
df['date'] = pd.to_datetime('2013-09-01') + pd.to_timedelta(df['Time'],unit='s')
df[['Time','date']].head()
 | Time | date |
---|---|---|
49089 | 43906.0 | 2013-09-01 12:11:46 |
154704 | 102638.0 | 2013-09-02 04:30:38 |
67247 | 52429.0 | 2013-09-01 14:33:49 |
251657 | 155444.0 | 2013-09-02 19:10:44 |
201903 | 134084.0 | 2013-09-02 13:14:44 |
# create hour column
df['hour'] = df['date'].dt.hour
df[['Time','date','hour']].head()
 | Time | date | hour |
---|---|---|---|
49089 | 43906.0 | 2013-09-01 12:11:46 | 12 |
154704 | 102638.0 | 2013-09-02 04:30:38 | 4 |
67247 | 52429.0 | 2013-09-01 14:33:49 | 14 |
251657 | 155444.0 | 2013-09-02 19:10:44 | 19 |
201903 | 134084.0 | 2013-09-02 13:14:44 | 13 |
# count of transactions by hour
fig, ax =plt.subplots(1,2,figsize=(12,8))
sns.countplot(x='hour',data=df.query('Class == 1'),ax=ax[0],palette='tab20')
ax[0].set_title('Fraud Transactions')
sns.countplot(x='hour',data=df.query('Class == 0'),ax=ax[1],palette='tab20')
ax[1].set_title('Normal Transactions')
plt.savefig('../reports/figures/normal_and_fraud_transactions.png',dpi=400)
# Obs: if the first transaction happened at 00:00 midnight, most fraudulent
# activity happens at hour 21, i.e., 9pm to 10pm.
# (21 = 9pm, 22 = 10pm, 23 = 11pm, 24 = 0am.)
#
# If the bank opens at 7AM and the first transaction occurs at 7AM, then
# peak fraudulent activity happens after 2 hours, at 9AM, and another peak
# of fraud activity happens after 11 hours, at 6PM.
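Raw counts can mislead on such an unbalanced dataset, so as a quick hedged check it is worth looking at the fraud rate per hour, which controls for overall traffic volume:
fraud_rate = df.groupby('hour')['Class'].mean() * 100
fraud_rate.plot.bar(figsize=(12, 4), title='Fraud rate (%) by hour');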
df.plot.scatter(x='Time',y='Amount',c='tomato')
sns.distplot(df['Time'],fit=scipy.stats.norm);
plt.savefig('../reports/figures/distplot_time.png',dpi=300)
# The Time variable is on a large raw scale; standardize it to zero mean
# and unit variance (StandardScaler is imported above).
df['norm_Time'] = StandardScaler().fit_transform(df[['Time']])
sns.distplot(df['norm_Time'],fit=scipy.stats.norm)
plt.savefig('../reports/figures/distplot_norm_time.png',dpi=300)
bins = [0,100,200,300,400,500,1000,5000,30_000]
labels = [1,2,3,4,5,6,7,8]
df['cat_Amount'] = pd.cut(df['Amount'], bins=bins, labels=labels)
df['cat_Amount'].head()
49089     1
154704    1
67247     4
251657    1
201903    1
Name: cat_Amount, dtype: category
Categories (8, int64): [1 < 2 < 3 < 4 < 5 < 6 < 7 < 8]
plt.figure(figsize=(12,4))
ax = sns.countplot(x=df['cat_Amount'])
ax.set_xticklabels(['0-100', '100-200', '200-300', '300-400', '400-500',
                    '500-1k', '1k-5k', '5k+']);
plt.title('Categorical Amount Distribution')
plt.savefig('../reports/figures/cat_amount_countplot.png',dpi=300)
plt.figure(figsize=(12,4))
ax = sns.countplot(x=df.loc[df['Class']==0, 'cat_Amount'])
ax.set_xticklabels(['0-100', '100-200', '200-300', '300-400', '400-500',
                    '500-1k', '1k-5k', '5k+']);
plt.title('Categorical Amount Distribution for NON-FRAUD Transactions')
plt.savefig('../reports/figures/cat_amount_countplot_non_fraud.png',dpi=300)
plt.figure(figsize=(12,4))
ax = sns.countplot(x=df.loc[df['Class']==1, 'cat_Amount'])
ax.set_xticklabels(['0-100', '100-200', '200-300', '300-400', '400-500',
                    '500-1k', '1k-5k', '5k+']);
plt.title('Categorical Amount Distribution for FRAUD Transactions')
plt.savefig('../reports/figures/cat_amount_countplot_fraud.png',dpi=300)
df.groupby('Class').agg({'Amount': ['median', 'mean', 'max']})
Class | Amount median | Amount mean | Amount max |
---|---|---|---|
0 | 22.00 | 88.291022 | 25691.16 |
1 | 9.25 | 122.211321 | 2125.87 |
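The medians suggest fraud amounts are typically smaller. As a hedged sanity check that the two Amount distributions really differ, a Mann-Whitney U test (scipy's stats is imported above; Amount is far too skewed for a t-test):
u, p = stats.mannwhitneyu(df.loc[df['Class'] == 1, 'Amount'],
                          df.loc[df['Class'] == 0, 'Amount'],
                          alternative='two-sided')
print(f'Mann-Whitney U = {u:.0f}, p = {p:.3g}')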
df['Amount'].describe()
# there are no negative amounts.
count    284807.000000
mean         88.349619
std         250.120109
min           0.000000
25%           5.600000
50%          22.000000
75%          77.165000
max       25691.160000
Name: Amount, dtype: float64
df.loc[df.Amount==0,:].head(2)
 | Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | V13 | V14 | V15 | V16 | V17 | V18 | V19 | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | date | hour | norm_Time | cat_Amount |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
245188 | 152639.0 | 2.186937 | 0.071099 | -2.939581 | 0.074828 | 1.375154 | -0.539542 | 0.750900 | -0.293621 | -0.135821 | 0.481592 | -0.283483 | -0.351148 | -1.573377 | 1.259548 | -0.690830 | -0.218176 | -0.666161 | 0.356780 | 0.700103 | -0.285126 | 0.226725 | 0.717761 | -0.265131 | -0.236882 | 0.759420 | 0.738085 | -0.142842 | -0.110182 | 0.0 | 0 | 2013-09-02 18:23:59 | 18 | 1.217677 | NaN |
9559 | 14327.0 | -2.242515 | -1.428575 | 2.616850 | 2.545410 | 1.527837 | -1.046697 | -1.363238 | 0.411118 | 1.451829 | -0.112644 | 0.148017 | -2.943411 | 0.078383 | 1.265566 | -1.318025 | 0.350758 | 0.572950 | -0.240293 | -2.166904 | -0.124477 | 0.054946 | 0.093344 | 0.551008 | 0.479935 | -0.903352 | -0.374247 | -0.133911 | 0.609203 | 0.0 | 0 | 2013-09-01 03:58:47 | 3 | -1.694886 | NaN |
df.loc[df.Amount==0,:].shape # 1,825 transactions are for ZERO dollars.
(1825, 35)
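As a quick hedged follow-up, it is worth checking how often these zero-dollar transactions are fraudulent:
zero_fraud_pct = df.loc[df['Amount'] == 0, 'Class'].mean() * 100
print(f'{zero_fraud_pct:.2f}% of zero-amount transactions are fraud')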
sns.kdeplot(df['Amount'],shade=True);
plt.figure(figsize=(12,4))
sns.kdeplot(df['Amount'],shade=True);
plt.xlim(-1,5000)
plt.xticks(range(0,5000,500))
plt.savefig('../reports/figures/amount_kdeplot.png')
plt.show()
plt.figure(figsize=(12,4))
sns.kdeplot(df.loc[df['Class'] ==0, 'Amount'],shade=True);
plt.xlim(-1,5000)
plt.xticks(range(-500,5000,500))
plt.title('Distribution of Amount for Non-fraud transactions.')
plt.savefig('../reports/figures/amount_non_fraud_kdeplot.png')
plt.show()
plt.figure(figsize=(12,4))
sns.kdeplot(df.loc[df['Class']==1, 'Amount'],shade=True);
plt.xlim(-1,500)
plt.xticks(range(-100,500,50))
plt.title('Distribution of Amount for Fraud transactions.')
plt.savefig('../reports/figures/amount_fraud_kdeplot.png')
plt.show()
df['norm_Amount'] = StandardScaler().fit_transform(df[['Amount']])
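Since Amount has extreme outliers (max ≈ 25,691 vs. a median of 22), RobustScaler (already imported above) is a reasonable alternative sketch; it scales by the IQR instead of the standard deviation, so the outliers distort it less:
df['robust_Amount'] = RobustScaler().fit_transform(df[['Amount']])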
kwargs = dict(alpha=0.9, bins=30,color='r',sharex=False,layout=(2,1))
df.hist(column='Amount', by='Class',figsize=(12,8),**kwargs);
# Most of the fraud amounts are < $100.
# The distribution is extremely skewed with a long tail,
# so it is better to create a log-transformed column.
df['log1p_Amount'] = np.log1p(df['Amount'].values)
df.hist(column='log1p_Amount', by='Class', bins=10, density=True);
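Note that np.log1p is exactly invertible with np.expm1, so anything modelled on the log scale can be mapped back to dollars; a one-line sketch to verify:
assert np.allclose(np.expm1(df['log1p_Amount']), df['Amount'])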
v_features = df.filter(regex='V').columns
v_features
Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28'], dtype='object')
def distplot_for_all_features():
    plt.figure(figsize=(12, 28*4))
    gs = matplotlib.gridspec.GridSpec(28, 1)
    for i, cn in enumerate(df[v_features]):
        ax = plt.subplot(gs[i])
        sns.distplot(df[cn][df.Class == 1], bins=50)
        sns.distplot(df[cn][df.Class == 0], bins=50)
        ax.set_xlabel('')
        ax.set_title('histogram of feature: ' + str(cn))
    plt.show()

# distplot_for_all_features()
From the distribution plots, we can see that some of the features have very similar distributions for fraud and non-fraud cases. We may drop these features and see how the model results change; a small sketch follows the list below.
# from above distribution plot, I found some similar distributions.
similar_distribution_features = ['V28','V27','V26','V25','V24','V23','V22',
'V20','V15','V13','V8']
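As a sketch of the proposed experiment, the reduced frame below drops the look-alike features (the column list comes from the visual inspection above):
df_reduced = df.drop(columns=similar_distribution_features)
print(df_reduced.shape)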
def plot_distribution(df, col, show=False, xlim=None):
    p = '../reports/figures/distplots'
    if not os.path.isdir(p):
        os.makedirs(p)
    sns.set_style("ticks")
    s = sns.FacetGrid(df, hue='Class', aspect=2.5,
                      palette={0: 'seagreen', 1: 'tomato'})
    s.map(sns.kdeplot, col, shade=True, alpha=0.6)
    s.set(xlim=(df[col].min(), df[col].max()))
    s.fig.suptitle(f'Distribution plot of {col}')
    if xlim:
        s.set(xlim=xlim)
        s.fig.suptitle(f'Distribution plot of {col} for given x limit')
    s.add_legend()
    s.set_axis_labels(col, 'proportion')
    plt.xlabel('Feature Value')
    if xlim:
        plt.savefig(f'{p}/distplot_{col}_selected_xlim.png')
    else:
        plt.savefig(f'{p}/distplot_{col}.png')
    if show:
        plt.show()
    plt.close()
%%time
for v in v_features:
    plot_distribution(df, v)
%%time
for v in v_features:
    plot_distribution(df, v, xlim=(-10, 10))
CPU times: user 1min 21s, sys: 7.94 s, total: 1min 29s Wall time: 1min 12s
sns.scatterplot(x = 'Amount', y = 'V1',hue='Class', data = df);
# mycols = ['V1', 'V2', 'V3', 'V15', 'V18','Amount']
# sns.pairplot(df, hue = 'Class', vars = mycols );
%%capture
def distplot_of_all_features():
    f, ax = plt.subplots(15, 2, figsize=(12, 60))
    col = ['V%s' % i for i in range(1, 29)] + ['Amount', 'Time']
    for i, feature in enumerate(col):
        axi = ax[i//2, i%2]
        sns.distplot(df[df['Class'] == 1].dropna()[feature], ax=axi,
                     kde_kws={"color": "black"}, hist=False)
        sns.distplot(df[df['Class'] == 0].dropna()[feature], ax=axi,
                     kde_kws={"color": "black"}, hist=False)
        # Get the two kde lines from the axes to generate shading
        l1, l2 = axi.lines[0], axi.lines[1]
        # Get the xy data from the lines so that we can shade between them
        x1, y1 = l1.get_xydata()[:, 0], l1.get_xydata()[:, 1]
        x2, y2 = l2.get_xydata()[:, 0], l2.get_xydata()[:, 1]
        axi.fill_between(x2, y2, color="deeppink", alpha=0.6)
        axi.fill_between(x1, y1, color="darkturquoise", alpha=0.6)
        # grid
        axi.grid(b=True, which='major', color='grey', linewidth=0.3)
        axi.set_title('{} by target'.format(feature), fontsize=18)
        # sns.despine(ax=axi, left=True)
        axi.set_ylabel("frequency", fontsize=12)
        axi.set_xlabel(str(feature), fontsize=12)
    plt.tight_layout()
    plt.savefig('../reports/figures/distplots_of_all_features.png', dpi=300)
    plt.close()
    return None
distplot_of_all_features()
%%writefile ../models/tsne_visualization.py
# Ref: https://www.kaggle.com/janiobachmann/credit-fraud-dealing-with-imbalanced-datasets
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import os
import time
# random state
random_state = 100
np.random.seed(random_state)
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
df = pd.read_csv('../data/raw/creditcard.csv.zip',compression='zip')
X = df.drop(['Time','Class'],axis=1)
y = df['Class']
# T-SNE Implementation
t0 = time.time()
X_reduced_tsne = TSNE(n_components=2,
random_state=random_state).fit_transform(X.values)
t1 = time.time()
print("T-SNE took {:.2} s".format(t1 - t0))
# PCA Implementation
t0 = time.time()
X_reduced_pca = PCA(n_components=2,
random_state=random_state).fit_transform(X.values)
t1 = time.time()
print("PCA took {:.2} s".format(t1 - t0))
# TruncatedSVD
t0 = time.time()
X_reduced_svd = TruncatedSVD(n_components=2, algorithm='randomized',
random_state=random_state
).fit_transform(X.values)
t1 = time.time()
print("Truncated SVD took {:.2} s".format(t1 - t0))
#========================== Plotting=============================
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(24,6))
# labels = ['No Fraud', 'Fraud']
f.suptitle('Clusters using Dimensionality Reduction', fontsize=14)
blue_patch = mpatches.Patch(color='#0A0AFF', label='No Fraud')
red_patch = mpatches.Patch(color='#AF0000', label='Fraud')
# t-SNE scatter plot
ax1.scatter(X_reduced_tsne[:,0], X_reduced_tsne[:,1], c=(y == 0),
cmap='coolwarm', label='No Fraud', linewidths=2)
ax1.scatter(X_reduced_tsne[:,0], X_reduced_tsne[:,1], c=(y == 1),
cmap='coolwarm', label='Fraud', linewidths=2)
ax1.set_title('t-SNE', fontsize=14)
ax1.grid(True)
ax1.legend(handles=[blue_patch, red_patch])
# PCA scatter plot
ax2.scatter(X_reduced_pca[:,0], X_reduced_pca[:,1], c=(y == 0),
cmap='coolwarm', label='No Fraud', linewidths=2)
ax2.scatter(X_reduced_pca[:,0], X_reduced_pca[:,1], c=(y == 1),
cmap='coolwarm', label='Fraud', linewidths=2)
ax2.set_title('PCA', fontsize=14)
ax2.grid(True)
ax2.legend(handles=[blue_patch, red_patch])
# TruncatedSVD scatter plot
ax3.scatter(X_reduced_svd[:,0], X_reduced_svd[:,1], c=(y == 0),
cmap='coolwarm', label='No Fraud', linewidths=2)
ax3.scatter(X_reduced_svd[:,0], X_reduced_svd[:,1], c=(y == 1),
cmap='coolwarm', label='Fraud', linewidths=2)
ax3.set_title('Truncated SVD', fontsize=14)
ax3.grid(True)
ax3.legend(handles=[blue_patch, red_patch])
plt.savefig('../reports/figures/tsne_visualization.png',dpi=300)
plt.close()
Writing ../models/tsne_visualization.py
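One caveat worth noting: t-SNE on all 284,807 rows is very slow. A common workaround (an assumption on my part, not part of the saved script) is to run the reductions on a balanced subsample first:
# assumed speed-up: balanced subsample before t-SNE / PCA / SVD
df_small = pd.concat([df[df['Class'] == 1],
                      df[df['Class'] == 0].sample(n=492, random_state=100)])
X_small = df_small.drop(columns=['Time', 'Class'])
y_small = df_small['Class']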