Data Description

In this project, we will predict the probability that an auto insurance policy holder files a claim. This is a binary classification problem.

We have 595,212 records and 57 features (plus the id and target columns); some of the features are pre-calculated.

binary features: suffix _bin
categorical features: suffix _cat
continuous or ordinal features: groups ind, reg, car, calc
missing values: coded as -1

Full forms
ind = individual
reg = registration
car = car
calc = calculated

The target column signifies whether or not a claim was filed for that policy holder.
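
As a quick illustration of the naming convention, here is a minimal sketch that splits a few of the actual column names by suffix (the notebook derives the real groups from df.columns later):

In [ ]:
# Sketch: group column names by the suffix convention described above.
cols = ['ps_ind_02_cat', 'ps_ind_06_bin', 'ps_reg_03', 'ps_car_13', 'ps_calc_01']
groups = {
    'binary':      [c for c in cols if c.endswith('_bin')],
    'categorical': [c for c in cols if c.endswith('_cat')],
    'num/ordinal': [c for c in cols if not c.endswith(('_bin', '_cat'))],
}
print(groups)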

Imports

In [ ]:
import os
import time
import numpy as np
import pandas as pd
import scipy
from scipy import stats
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline
time_start_notebook = time.time()
SEED=100
print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
[('numpy', '1.18.5'), ('pandas', '1.0.5'), ('seaborn', '0.10.1'), ('matplotlib', '3.2.2')]
/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
In [ ]:
# Google Colab
In [ ]:
%%capture
# capture will not print in notebook

import os
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    ### mount google drive
    from google.colab import drive
    drive.mount('/content/drive')

    ### load the data dir
    dat_dir = 'drive/My Drive/Colab Notebooks/data/'
    sys.path.append(dat_dir)

    ### Image dir
    img_dir = 'drive/My Drive/Colab Notebooks/images/'
    if not os.path.isdir(img_dir): os.makedirs(img_dir)
    sys.path.append(img_dir)

    ### Output dir
    out_dir = 'drive/My Drive/Colab Notebooks/outputs/'
    if not os.path.isdir(out_dir): os.makedirs(out_dir)
    sys.path.append(out_dir)

    ### Also install my custom module
    module_dir = 'drive/My Drive/Colab Notebooks/Bhishan_Modules/'
    sys.path.append(module_dir)
    # NOTE: !cd runs in a subshell and does not persist across shell
    # commands, so install with an explicit (quoted) path instead.
    !pip install -e "{module_dir}bhishan"

    ### update pandas profiling
    ###profile = df_misc.profile_report(html={'style': {'full_width':True}})
    ###profile.to_file(out_dir + 'df_profile.html')
    ###profile.to_widgets() # not supported in Gcolab just use profile
    !pip install -U pandas-profiling # we need restart
    import pandas_profiling


    #### print
    print('Environment: Google Colaboratory.')

# NOTE: If we update modules in gcolab, we need to restart runtime.
In [ ]:
import sklearn
import pandas_profiling

print([(x.__name__, x.__version__) for x in [sklearn, pandas_profiling]])
[('sklearn', '0.22.2.post1'), ('pandas_profiling', '2.8.0')]
In [ ]:
import bhishan
from bhishan import bp

print(bhishan.__version__)
0.4.0
In [ ]:
%load_ext autoreload
In [ ]:
%autoreload 2

Load the data

In [ ]:
# note: zip files compressed via the macOS Finder contain a __MACOSX directory,
# which makes pandas fail to read the archive. To avoid that, unzip the data,
# then re-zip it from the terminal.
# good: zip train.csv.zip train.csv
# bad: right-click the file and Compress (this creates the __MACOSX directory).
df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/'
    'Porto_seguro_safe_driver_prediction/train.csv.zip?raw=true',compression='zip')
print(df.shape)
df.head()
(595212, 59)
Out[ ]:
id target ps_ind_01 ps_ind_02_cat ps_ind_03 ps_ind_04_cat ps_ind_05_cat ps_ind_06_bin ps_ind_07_bin ps_ind_08_bin ps_ind_09_bin ps_ind_10_bin ps_ind_11_bin ps_ind_12_bin ps_ind_13_bin ps_ind_14 ps_ind_15 ps_ind_16_bin ps_ind_17_bin ps_ind_18_bin ps_reg_01 ps_reg_02 ps_reg_03 ps_car_01_cat ps_car_02_cat ps_car_03_cat ps_car_04_cat ps_car_05_cat ps_car_06_cat ps_car_07_cat ps_car_08_cat ps_car_09_cat ps_car_10_cat ps_car_11_cat ps_car_11 ps_car_12 ps_car_13 ps_car_14 ps_car_15 ps_calc_01 ps_calc_02 ps_calc_03 ps_calc_04 ps_calc_05 ps_calc_06 ps_calc_07 ps_calc_08 ps_calc_09 ps_calc_10 ps_calc_11 ps_calc_12 ps_calc_13 ps_calc_14 ps_calc_15_bin ps_calc_16_bin ps_calc_17_bin ps_calc_18_bin ps_calc_19_bin ps_calc_20_bin
0 7 0 2 2 5 1 0 0 1 0 0 0 0 0 0 0 11 0 1 0 0.7 0.2 0.718070 10 1 -1 0 1 4 1 0 0 1 12 2 0.400000 0.883679 0.370810 3.605551 0.6 0.5 0.2 3 1 10 1 10 1 5 9 1 5 8 0 1 1 0 0 1
1 9 0 1 1 7 0 0 0 0 1 0 0 0 0 0 0 3 0 0 1 0.8 0.4 0.766078 11 1 -1 0 -1 11 1 1 2 1 19 3 0.316228 0.618817 0.388716 2.449490 0.3 0.1 0.3 2 1 9 5 8 1 7 3 1 1 9 0 1 1 0 1 0
2 13 0 5 4 9 1 0 0 0 1 0 0 0 0 0 0 12 1 0 0 0.0 0.0 -1.000000 7 1 -1 0 -1 14 1 1 2 1 60 1 0.316228 0.641586 0.347275 3.316625 0.5 0.7 0.1 2 2 9 1 8 2 7 4 2 7 7 0 1 1 0 1 0
3 16 0 0 1 2 0 0 1 0 0 0 0 0 0 0 0 8 1 0 0 0.9 0.2 0.580948 7 1 0 0 1 11 1 1 3 1 104 1 0.374166 0.542949 0.294958 2.000000 0.6 0.9 0.1 2 4 7 1 8 4 2 2 2 4 9 0 0 0 0 0 0
4 17 0 0 2 0 1 0 1 0 0 0 0 0 0 0 0 9 1 0 0 0.7 0.6 0.840759 11 1 -1 0 -1 14 1 1 2 1 82 3 0.316070 0.565832 0.365103 2.000000 0.4 0.6 0.0 2 2 6 3 10 2 12 3 1 1 3 0 0 0 1 1 0
In [ ]:
df = df.drop('id',axis=1)
In [ ]:
"""
Comment about file size:
The data is large: it has 595k records and 57 features (plus the target).

ps = porto seguro
_bin = binary feature
_cat = categorical feature


continuous or ordinal: ind, reg, car, calc

""";
In [ ]:
target = 'target'
In [ ]:
df_small = df.sample(frac=0.05,random_state=SEED)

Feature Types

In [ ]:
cols_bin = df.filter(regex='_bin$').columns.to_list()
cols_cat = df.filter(regex='_cat$').columns.to_list()
cols_num = [i for i in df.columns if i not in cols_bin+cols_cat+['target'] ]
cols_float = [i for i in cols_num if df[i].dtype== float ]
cols_int = [i for i in cols_num if df[i].dtype== int ]

cols_reg = df.filter(regex='_reg').columns.to_list()
cols_car = df.filter(regex='_car').columns.to_list()
cols_calc = df.filter(regex='_calc').columns.to_list()

print('bin', len(cols_bin), cols_bin)
print('cat', len(cols_cat), cols_cat)
print('num', len(cols_num), cols_num)
print('float', len(cols_float), cols_float)
print('int', len(cols_int), cols_int)
print('reg', len(cols_reg), cols_reg)
print('car', len(cols_car), cols_car)
print('calc', len(cols_calc), cols_calc)
bin 17 ['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin']
cat 14 ['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat']
num 26 ['ps_ind_01', 'ps_ind_03', 'ps_ind_14', 'ps_ind_15', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03', 'ps_car_11', 'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14']
float 10 ['ps_reg_01', 'ps_reg_02', 'ps_reg_03', 'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01', 'ps_calc_02', 'ps_calc_03']
int 16 ['ps_ind_01', 'ps_ind_03', 'ps_ind_14', 'ps_ind_15', 'ps_car_11', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14']
reg 3 ['ps_reg_01', 'ps_reg_02', 'ps_reg_03']
car 16 ['ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11', 'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15']
calc 20 ['ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin']
In [ ]:
# note: cols_calc here (not cols_car twice); the ind columns are not in reg+car+calc
df.shape[1], len(cols_reg + cols_car + cols_calc), len(cols_bin + cols_cat + cols_num)
Out[ ]:
(58, 39, 57)
In [ ]:
def create_df_meta(df):
    """Create meta data dataframe.

    Reference: https://www.kaggle.com/gpreda/porto-seguro-exploratory-analysis-and-prediction
    """
    data = []
    for col in df.columns:
        # Defining the role
        if col == 'target':
            use = 'target'
        elif col == 'id':
            use = 'id'
        else:
            use = 'input'
            
        # Defining the type
        if 'bin' in col or col == 'target':
            type = 'binary'
        elif 'cat' in col or col == 'id':
            type = 'categorical'
        elif df[col].dtype == float:
            type = 'real'
        elif df[col].dtype == int:
            type = 'integer'
            
        # Initialize preserve to True for all variables except for id
        preserve = True
        if col == 'id':
            preserve = False
        
        # Defining the data type 
        dtype = df[col].dtype

        category = 'none'
        # Defining the category
        if 'ind' in col:
            category = 'individual'
        elif 'reg' in col:
            category = 'registration'
        elif 'car' in col:
            category = 'car'
        elif 'calc' in col:
            category = 'calculated'
        
        # Creating a Dict that contains all the df_meta for the variable
        col_dict = {
            'varname': col,
            'use': use,
            'type': type,
            'preserve': preserve,
            'dtype': dtype,
            'category' : category
        }
        data.append(col_dict)
        
    df_meta = pd.DataFrame(data, columns=['varname', 'use', 'type',
                                          'preserve', 'dtype', 'category'])
    df_meta.set_index('varname', inplace=True)

    return df_meta
In [ ]:
df_meta = create_df_meta(df)
df_meta.head()
Out[ ]:
use type preserve dtype category
varname
target target binary True int64 none
ps_ind_01 input integer True int64 individual
ps_ind_02_cat input categorical True int64 individual
ps_ind_03 input integer True int64 individual
ps_ind_04_cat input categorical True int64 individual
In [ ]:
df_meta.groupby('category').count()[['use']]
Out[ ]:
use
category
calculated 20
car 16
individual 18
none 1
registration 3
In [ ]:
# We have 20 calculated features, 16 car, 18 individual and 3 registration.
In [ ]:
df_meta.groupby(['use','type']).count().reset_index().iloc[:,:3]
Out[ ]:
use type preserve
0 input binary 17
1 input categorical 14
2 input integer 16
3 input real 10
4 target binary 1

Missing Values

In [ ]:
# missing values are coded as -1; put back np.nan
# so that we can impute them properly.

df = df.replace(-1,np.nan)
df.isna().sum().sum()
Out[ ]:
846458
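The same per-column breakdown with plain pandas (a quick check before the bp.missing helper below):
In [ ]:
# Missing-value count per affected column, largest first.
df.isna().sum().loc[lambda s: s > 0].sort_values(ascending=False)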
In [ ]:
df_missing = df.bp.missing()
Missing values high threshold = 80%

Number of missing values features: 13
cols_missing_high = []
cols_missing_low = ['ps_car_03_cat', 'ps_car_05_cat', 'ps_reg_03', 'ps_car_14', 'ps_car_07_cat', 'ps_ind_05_cat', 'ps_car_09_cat', 'ps_ind_02_cat', 'ps_car_01_cat', 'ps_ind_04_cat', 'ps_car_02_cat', 'ps_car_11', 'ps_car_12']
In [ ]:
cols_drop = ['ps_car_03_cat']
cols_car = [i for i in cols_car if i not in cols_drop]
cols_cat = [i for i in cols_cat if i not in cols_drop]

df_meta.loc[cols_drop, 'preserve'] = False
df = df.drop(cols_drop,axis=1)

cols_missing_low = ['ps_car_05_cat', 
                    'ps_reg_03', 'ps_car_14', 
                    'ps_car_07_cat', 'ps_ind_05_cat', 'ps_car_09_cat',
                    'ps_ind_02_cat', 'ps_car_01_cat', 'ps_ind_04_cat',
                    'ps_car_02_cat',
                    'ps_car_11', 'ps_car_12']

cols_missing_num = ['ps_reg_03', 'ps_car_14','ps_car_11', 'ps_car_12' ]
for col in cols_missing_num:
    df[col] = df[col].fillna(df[col].mean())

cols_missing_cat = ['ps_car_05_cat', 
                    'ps_car_07_cat', 'ps_ind_05_cat', 'ps_car_09_cat',
                    'ps_ind_02_cat', 'ps_car_01_cat', 'ps_ind_04_cat',
                    'ps_car_02_cat']

for col in cols_missing_cat:
    df[col] = df[col].fillna(df[col].mode()[0])

df.isna().sum().sum()
Out[ ]:
0
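For reference, the same imputation could be written with scikit-learn's SimpleImputer (a sketch; the notebook keeps the plain-pandas version above):
In [ ]:
# Equivalent imputation via scikit-learn (shown for reference, not run here
# since df is already imputed above).
from sklearn.impute import SimpleImputer

# df[cols_missing_num] = SimpleImputer(strategy='mean').fit_transform(df[cols_missing_num])
# df[cols_missing_cat] = SimpleImputer(strategy='most_frequent').fit_transform(df[cols_missing_cat])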

Class Balance

In [ ]:
df.bp.plot_cat(target,figsize=(8,4))
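Numerically, the classes are highly imbalanced; only about 3.6% of policy holders filed a claim:
In [ ]:
# Class balance as proportions (the positive class is rare).
df[target].value_counts(normalize=True)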

Features: Registration

ps_reg_01   ps_reg_02   ps_reg_03
In [ ]:
df[cols_reg].head()
Out[ ]:
ps_reg_01 ps_reg_02 ps_reg_03
0 0.7 0.2 0.718070
1 0.8 0.4 0.766078
2 0.0 0.0 0.894047
3 0.9 0.2 0.580948
4 0.7 0.6 0.840759
In [ ]:
df[cols_reg].bp.describe()
Out[ ]:
Feature Type N Count Unique Missing MissingPct Zeros ZerosPct mean std min max 25% 50% 75%
1 ps_reg_02 float64 595212 595212 19 0 0.00 89297 15.00 0.44 0.40 0.00 1.80 0.20 0.30 0.60
0 ps_reg_01 float64 595212 595212 10 0 0.00 21682 3.64 0.61 0.29 0.00 0.90 0.40 0.70 0.90
2 ps_reg_03 float64 595212 595212 5013 0 0.00 0 0.00 0.89 0.31 0.06 4.04 0.67 0.89 1.00
In [ ]:
# ps_reg_01 and ps_reg_02 take values in tenths: 0.0, 0.1, 0.2, etc.
# ps_reg_01 has only the values 0.0 to 0.9 (no 1.0; just 10 unique values).
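A quick check of that claim (a sketch):
In [ ]:
# Verify: ps_reg_01 takes only the tenths 0.0 to 0.9.
sorted(df['ps_reg_01'].unique())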
In [ ]:
df[cols_reg].merge(df[target],left_index=True,right_index=True).bp.plot_corr()

Features: Car

car 15 (ps_car_03_cat was dropped earlier for its high missing-value rate)

['ps_car_01_cat', 'ps_car_02_cat', 'ps_car_04_cat',
'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat',
'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat',
'ps_car_11_cat', 'ps_car_11', 'ps_car_12',
'ps_car_13', 'ps_car_14', 'ps_car_15']
In [ ]:
df[cols_car].head()
Out[ ]:
ps_car_01_cat ps_car_02_cat ps_car_04_cat ps_car_05_cat ps_car_06_cat ps_car_07_cat ps_car_08_cat ps_car_09_cat ps_car_10_cat ps_car_11_cat ps_car_11 ps_car_12 ps_car_13 ps_car_14 ps_car_15
0 10.0 1.0 0 1.0 4 1.0 0 0.0 1 12 2.0 0.400000 0.883679 0.370810 3.605551
1 11.0 1.0 0 1.0 11 1.0 1 2.0 1 19 3.0 0.316228 0.618817 0.388716 2.449490
2 7.0 1.0 0 1.0 14 1.0 1 2.0 1 60 1.0 0.316228 0.641586 0.347275 3.316625
3 7.0 1.0 0 1.0 11 1.0 1 3.0 1 104 1.0 0.374166 0.542949 0.294958 2.000000
4 11.0 1.0 0 1.0 14 1.0 1 2.0 1 82 3.0 0.316070 0.565832 0.365103 2.000000
In [ ]:
df[cols_car].bp.describe(sort_col='index')
Out[ ]:
Feature Type N Count Unique Missing MissingPct Zeros ZerosPct mean std min max 25% 50% 75%
0 ps_car_01_cat float64 595212 595212 12 0 0.00 5904 0.99 8.30 2.51 0.00 11.00 7.00 7.00 11.00
1 ps_car_02_cat float64 595212 595212 2 0 0.00 101217 17.01 0.83 0.38 0.00 1.00 1.00 1.00 1.00
2 ps_car_04_cat int64 595212 595212 10 0 0.00 496581 83.43 0.73 2.15 0.00 9.00 0.00 0.00 0.00
3 ps_car_05_cat float64 595212 595212 2 0 0.00 155994 26.21 0.74 0.44 0.00 1.00 0.00 1.00 1.00
4 ps_car_06_cat int64 595212 595212 18 0 0.00 110420 18.55 6.56 5.50 0.00 17.00 1.00 7.00 11.00
5 ps_car_07_cat float64 595212 595212 2 0 0.00 30575 5.14 0.95 0.22 0.00 1.00 1.00 1.00 1.00
6 ps_car_08_cat int64 595212 595212 2 0 0.00 99948 16.79 0.83 0.37 0.00 1.00 1.00 1.00 1.00
7 ps_car_09_cat float64 595212 595212 5 0 0.00 194518 32.68 1.33 0.98 0.00 4.00 0.00 2.00 2.00
8 ps_car_10_cat int64 595212 595212 3 0 0.00 4857 0.82 0.99 0.09 0.00 2.00 1.00 1.00 1.00
9 ps_car_11_cat int64 595212 595212 104 0 0.00 0 0.00 62.22 33.01 1.00 104.00 32.00 65.00 93.00
10 ps_car_11 float64 595212 595212 5 0 0.00 25983 4.37 2.35 0.83 0.00 3.00 2.00 3.00 3.00
11 ps_car_12 float64 595212 595212 184 0 0.00 0 0.00 0.38 0.06 0.10 1.26 0.32 0.37 0.40
12 ps_car_13 float64 595212 595212 70482 0 0.00 0 0.00 0.81 0.22 0.25 3.72 0.67 0.77 0.91
13 ps_car_14 float64 595212 595212 850 0 0.00 0 0.00 0.37 0.04 0.11 0.64 0.35 0.37 0.40
14 ps_car_15 float64 595212 595212 15 0 0.00 14186 2.38 3.07 0.73 0.00 3.74 2.83 3.32 3.61
In [ ]:
# Note:
# ps_car_12 values are approximately square roots of natural numbers, divided by 10
# (e.g. 0.4 = sqrt(16)/10, 0.374166 ~ sqrt(14)/10).
# ps_car_15 values are square roots of natural numbers.
In [ ]:
sample = df.sample(frac=0.05,random_state=SEED)
var = ['ps_car_12', 'ps_car_15']
sample = sample[var].copy()
sample['ps_car_12_sq'] = sample['ps_car_12'] ** 2 * 10
sample['ps_car_15_sq'] = sample['ps_car_15'] ** 2

sample.head()
Out[ ]:
ps_car_12 ps_car_15 ps_car_12_sq ps_car_15_sq
251823 0.400000 3.464102 1.600 12.0
201702 0.374166 3.464102 1.400 12.0
151280 0.374166 3.464102 1.400 12.0
293035 0.374166 3.316625 1.400 11.0
583646 0.399500 0.000000 1.596 0.0
In [ ]:
sample[['ps_car_12_sq','ps_car_15_sq']].nunique()
Out[ ]:
ps_car_12_sq    107
ps_car_15_sq     15
dtype: int64
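If the square-root pattern holds, ps_car_15_sq should be essentially integer, and ps_car_12**2 * 100 (rather than * 10) should be nearly integer too; a quick check of the worst deviations (a sketch):
In [ ]:
# Worst distance from the nearest integer under the square-root hypothesis.
chk = pd.DataFrame({
    'ps_car_12_sq100': sample['ps_car_12'] ** 2 * 100,  # ~ natural numbers
    'ps_car_15_sq':    sample['ps_car_15_sq'],          # ~ natural numbers
})
chk.apply(lambda s: (s - s.round()).abs().max())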
In [ ]:
df[cols_car].merge(df[target],left_index=True,
                   right_index=True).bp.plot_corr(xrot=90,figsize=(18,8))

Features: Calculated

calc 20 

['ps_calc_01', 'ps_calc_02', 'ps_calc_03', 
'ps_calc_04', 'ps_calc_05', 'ps_calc_06', 
'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 
'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 
'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin', 
'ps_calc_16_bin', 'ps_calc_17_bin', 
'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin']
In [ ]:
df[cols_calc].head()
Out[ ]:
ps_calc_01 ps_calc_02 ps_calc_03 ps_calc_04 ps_calc_05 ps_calc_06 ps_calc_07 ps_calc_08 ps_calc_09 ps_calc_10 ps_calc_11 ps_calc_12 ps_calc_13 ps_calc_14 ps_calc_15_bin ps_calc_16_bin ps_calc_17_bin ps_calc_18_bin ps_calc_19_bin ps_calc_20_bin
0 0.6 0.5 0.2 3 1 10 1 10 1 5 9 1 5 8 0 1 1 0 0 1
1 0.3 0.1 0.3 2 1 9 5 8 1 7 3 1 1 9 0 1 1 0 1 0
2 0.5 0.7 0.1 2 2 9 1 8 2 7 4 2 7 7 0 1 1 0 1 0
3 0.6 0.9 0.1 2 4 7 1 8 4 2 2 2 4 9 0 0 0 0 0 0
4 0.4 0.6 0.0 2 2 6 3 10 2 12 3 1 1 3 0 0 0 1 1 0
In [ ]:
df[cols_calc].bp.describe()
Out[ ]:
Feature Type N Count Unique Missing MissingPct Zeros ZerosPct mean std min max 25% 50% 75%
14 ps_calc_15_bin int64 595212 595212 2 0 0.00 522342 87.76 0.12 0.33 0.00 1.00 0.00 0.00 0.00
19 ps_calc_20_bin int64 595212 595212 2 0 0.00 503955 84.67 0.15 0.36 0.00 1.00 0.00 0.00 0.00
17 ps_calc_18_bin int64 595212 595212 2 0 0.00 424278 71.28 0.29 0.45 0.00 1.00 0.00 0.00 1.00
18 ps_calc_19_bin int64 595212 595212 2 0 0.00 387469 65.10 0.35 0.48 0.00 1.00 0.00 0.00 1.00
16 ps_calc_17_bin int64 595212 595212 2 0 0.00 265356 44.58 0.55 0.50 0.00 1.00 0.00 1.00 1.00
15 ps_calc_16_bin int64 595212 595212 2 0 0.00 221514 37.22 0.63 0.48 0.00 1.00 0.00 1.00 1.00
11 ps_calc_12 int64 595212 595212 11 0 0.00 141001 23.69 1.44 1.20 0.00 10.00 1.00 1.00 2.00
4 ps_calc_05 int64 595212 595212 7 0 0.00 61226 10.29 1.89 1.13 0.00 6.00 1.00 2.00 3.00
0 ps_calc_01 float64 595212 595212 10 0 0.00 59780 10.04 0.45 0.29 0.00 0.90 0.20 0.50 0.70
1 ps_calc_02 float64 595212 595212 10 0 0.00 59618 10.02 0.45 0.29 0.00 0.90 0.20 0.40 0.70
2 ps_calc_03 float64 595212 595212 10 0 0.00 59318 9.97 0.45 0.29 0.00 0.90 0.20 0.50 0.70
8 ps_calc_09 int64 595212 595212 8 0 0.00 34479 5.79 2.34 1.25 0.00 7.00 1.00 2.00 3.00
12 ps_calc_13 int64 595212 595212 14 0 0.00 33658 5.65 2.87 1.69 0.00 13.00 2.00 3.00 4.00
3 ps_calc_04 int64 595212 595212 6 0 0.00 23912 4.02 2.37 1.12 0.00 5.00 2.00 2.00 3.00
6 ps_calc_07 int64 595212 595212 10 0 0.00 15290 2.57 3.01 1.41 0.00 9.00 2.00 3.00 4.00
10 ps_calc_11 int64 595212 595212 20 0 0.00 2461 0.41 5.44 2.33 0.00 19.00 4.00 5.00 7.00
13 ps_calc_14 int64 595212 595212 24 0 0.00 310 0.05 7.54 2.75 0.00 23.00 6.00 7.00 9.00
9 ps_calc_10 int64 595212 595212 26 0 0.00 139 0.02 8.43 2.90 0.00 25.00 6.00 8.00 10.00
5 ps_calc_06 int64 595212 595212 11 0 0.00 1 0.00 7.69 1.33 0.00 10.00 7.00 8.00 9.00
7 ps_calc_08 int64 595212 595212 11 0 0.00 0 0.00 9.23 1.46 2.00 12.00 8.00 9.00 10.00

Features: Real numbers

In [ ]:
df_meta.head()
Out[ ]:
use type preserve dtype category
varname
target target binary True int64 none
ps_ind_01 input integer True int64 individual
ps_ind_02_cat input categorical True int64 individual
ps_ind_03 input integer True int64 individual
ps_ind_04_cat input categorical True int64 individual
In [ ]:
# cols_real_all = df.select_dtypes('float').columns.to_list()
# cols_real = [ i for i in cols_real_all if not i.endswith('_cat')]
# len(cols_real), cols_real

# note: ps_car_11 was an integer column, but it became float after the NaN imputation
In [ ]:
cols_real = df_meta.query(""" type == 'real' and preserve == True """).index.to_list()
print(len(cols_real), cols_real)
10 ['ps_reg_01', 'ps_reg_02', 'ps_reg_03', 'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01', 'ps_calc_02', 'ps_calc_03']
In [ ]:
df.bp.compare_kde(cols_real,target,3,4)
In [ ]:
# Real-valued features whose KDEs differ between the target classes
# (and hence look useful for prediction): reg01, reg02, reg03 and
# car13, car14, car15.
# The other real-valued features look much less useful.
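For a single feature, the per-class comparison that bp.compare_kde draws as a grid can be reproduced with plain seaborn (a sketch):
In [ ]:
# Per-class KDE for one feature (bp.compare_kde draws a grid of such panels).
for label in [0, 1]:
    sns.kdeplot(df.loc[df[target] == label, 'ps_reg_02'],
                label='target={}'.format(label))
plt.legend()
plt.show()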
In [ ]:
df.sample(frac=0.1,random_state=SEED).bp.plot_num('ps_reg_02')
In [ ]:
df.sample(frac=0.1,random_state=SEED).bp.plot_num('ps_car_15')
In [ ]:
df.sample(frac=0.1,random_state=SEED).bp.plot_num_cat('ps_reg_02','target')
In [ ]:
df.sample(frac=0.1,random_state=SEED).bp.plot_num_cat('ps_car_15','target')
In [ ]:
df[cols_real].merge(df[target],left_index=True,
                   right_index=True).bp.plot_corr(xrot=90,figsize=(18,8))
In [ ]:
# Look at highly correlated features
In [ ]:
df_high_corr = df[cols_real].bp.corr_high(thr=0.4)
cols_high_corr = ['ps_reg_02', 'ps_reg_03', 'ps_car_14', 'ps_car_12', 'ps_reg_01', 'ps_car_13', 'ps_car_15']
cols_high_corr1 = ['ps_reg_03', 'ps_car_12', 'ps_car_14', 'ps_car_15', 'ps_reg_02', 'ps_car_14']
cols_high_corr2 = ['ps_reg_02', 'ps_car_13', 'ps_car_12', 'ps_car_13', 'ps_reg_01', 'ps_car_13']
cols_high_corr_drop = ['ps_car_14', 'ps_car_15', 'ps_reg_03']
feature1 feature2 corr
0 ps_reg_03 ps_reg_02 0.703121
1 ps_car_12 ps_car_13 0.672014
2 ps_car_14 ps_car_12 0.577472
3 ps_car_15 ps_car_13 0.529519
4 ps_reg_02 ps_reg_01 0.471027
5 ps_car_14 ps_car_13 0.434238
In [ ]:
cols_high_corr = ['ps_reg_02', 'ps_reg_03', 'ps_car_14', 'ps_car_12',
                'ps_reg_01', 'ps_car_13', 'ps_car_15'] + [target]

sample = df_small[cols_high_corr]
sample.head(2)
Out[ ]:
ps_reg_02 ps_reg_03 ps_car_14 ps_car_12 ps_reg_01 ps_car_13 ps_car_15 target
251823 0.5 1.056823 0.392046 0.400000 0.9 0.943683 3.464102 0
201702 0.3 0.685565 0.388716 0.374166 0.9 0.743887 3.464102 0
In [ ]:
sns.pairplot(sample,  hue='target', palette = 'Set1', diag_kind='kde')
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x7f56d05a1128>
In [ ]:
cols_high_corr1 = ['ps_reg_03', 'ps_car_12', 'ps_car_14', 'ps_car_15', 'ps_reg_02', 'ps_car_14']
cols_high_corr2 = ['ps_reg_02', 'ps_car_13', 'ps_car_12', 'ps_car_13', 'ps_reg_01', 'ps_car_13']

print(cols_high_corr1)
print(cols_high_corr2)
['ps_reg_03', 'ps_car_12', 'ps_car_14', 'ps_car_15', 'ps_reg_02', 'ps_car_14']
['ps_reg_02', 'ps_car_13', 'ps_car_12', 'ps_car_13', 'ps_reg_01', 'ps_car_13']
In [ ]:
# lmplot is figure-level: a separate plt.figure() only creates an empty
# canvas, so size the plot via height/aspect instead.
sns.lmplot(x='ps_reg_02', y='ps_reg_03', data=sample,
           hue=target, palette='Set1',
           scatter_kws={'alpha': 0.3},
           height=6, aspect=1.5)
plt.xlim(-0.1, 2)
plt.tight_layout()
plt.show()
In [ ]:
sample.bp.regplot_binn(cols_high_corr1,cols_high_corr2,target,2,3)
In [ ]:
# reg02 and reg03 have almost the same relation with the target; one of them may be dropped.
# car12 and car14 have almost the same relation with the target; one of them may be dropped.
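A possible follow-up (a sketch, not applied here) would drop one feature of each highly correlated pair, e.g. using the cols_high_corr_drop list suggested by bp.corr_high above:

In [ ]:
# Not applied here: dropping one of each highly correlated pair would
# shrink the feature set before modeling.
# df = df.drop(columns=cols_high_corr_drop)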

Features: Binary

In [ ]:
cols_bin = df.filter(regex='_bin$').columns.to_list()
print(len(cols_bin), cols_bin)
17 ['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin']
In [ ]:
df[cols_bin].head(2)
Out[ ]:
ps_ind_06_bin ps_ind_07_bin ps_ind_08_bin ps_ind_09_bin ps_ind_10_bin ps_ind_11_bin ps_ind_12_bin ps_ind_13_bin ps_ind_16_bin ps_ind_17_bin ps_ind_18_bin ps_calc_15_bin ps_calc_16_bin ps_calc_17_bin ps_calc_18_bin ps_calc_19_bin ps_calc_20_bin
0 0 1 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1
1 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 0
In [ ]:
df.bp.plot_cat_cat(cols_bin[0],target)
==================================================
Feature: **ps_ind_06_bin**
Overall Count: 
    0: 60.63%
    1: 39.37%

Total  **target_1** distribution:
    0: 69.17%
    1: 30.83%

Per ps_ind_06_bin  **target_1** distribution:
    0: 4.16%
    1: 2.85%
In [ ]:
(df[cols_bin]
 .apply(lambda x: x.value_counts(normalize=True))
 .T.plot(kind='bar', stacked=True,color=['tomato','teal'])
)
plt.xlabel('Binary Features')
plt.title('Proportion of Binary Features')
plt.tight_layout()
plt.show()
In [ ]:
# df.bp.plot_cat_stacked(cols_bin)
In [ ]:
len(cols_bin)
Out[ ]:
17
In [ ]:
df.bp.compare_kde(cols_bin,target,6,3,figsize=(24,18))

Features: Categorical

In [ ]:
cols_cat = df.filter(regex='_cat$').columns.to_list()
print(len(cols_cat), cols_cat)
13 ['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat']
In [ ]:
df[cols_cat].head(2)
Out[ ]:
ps_ind_02_cat ps_ind_04_cat ps_ind_05_cat ps_car_01_cat ps_car_02_cat ps_car_04_cat ps_car_05_cat ps_car_06_cat ps_car_07_cat ps_car_08_cat ps_car_09_cat ps_car_10_cat ps_car_11_cat
0 2.0 1.0 0.0 10.0 1.0 0 1.0 4 1.0 0 0.0 1 12
1 1.0 0.0 0.0 11.0 1.0 0 1.0 11 1.0 1 2.0 1 19
In [ ]:
df[cols_cat].nunique().sort_values()
Out[ ]:
ps_ind_04_cat      2
ps_car_02_cat      2
ps_car_05_cat      2
ps_car_07_cat      2
ps_car_08_cat      2
ps_car_10_cat      3
ps_ind_02_cat      4
ps_car_09_cat      5
ps_ind_05_cat      7
ps_car_04_cat     10
ps_car_01_cat     12
ps_car_06_cat     18
ps_car_11_cat    104
dtype: int64
In [ ]:
cols_cat_small = df[cols_cat].nunique().sort_values().loc[lambda x: x<=10].index.to_list()
print(len(cols_cat_small), cols_cat_small)
10 ['ps_ind_04_cat', 'ps_car_02_cat', 'ps_car_05_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_10_cat', 'ps_ind_02_cat', 'ps_car_09_cat', 'ps_ind_05_cat', 'ps_car_04_cat']
In [ ]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
In [ ]:
df.bp.countplot(cols_cat_small,3,4)
In [ ]:
df[cols_cat_small].head(2)
Out[ ]:
ps_ind_04_cat ps_car_02_cat ps_car_05_cat ps_car_07_cat ps_car_08_cat ps_car_10_cat ps_ind_02_cat ps_car_09_cat ps_ind_05_cat ps_car_04_cat
0 1.0 1.0 1.0 1.0 0 1 2.0 0.0 0.0 0
1 0.0 1.0 1.0 1.0 1 1 1.0 2.0 0.0 0

Dataframe Statistics Profiles

We can get an overview of the data (histograms, missing values, correlations, skewness) using the pandas_profiling module. Bear in mind that it can take a long time to produce the reports, so it is best to save them to output files and generate them only once.

In [ ]:
import pandas_profiling
In [ ]:
out_dir = '../images/Porto/'
In [ ]:
if ENV_COLAB:
    out_dir = 'drive/My Drive/Colab Notebooks/outputs'
    out_dir = out_dir + '/Porto/'
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

out_dir
Out[ ]:
'drive/My Drive/Colab Notebooks/outputs/Porto/'
In [ ]:
df.head(2)
Out[ ]:
target ps_ind_01 ps_ind_02_cat ps_ind_03 ps_ind_04_cat ps_ind_05_cat ps_ind_06_bin ps_ind_07_bin ps_ind_08_bin ps_ind_09_bin ps_ind_10_bin ps_ind_11_bin ps_ind_12_bin ps_ind_13_bin ps_ind_14 ps_ind_15 ps_ind_16_bin ps_ind_17_bin ps_ind_18_bin ps_reg_01 ps_reg_02 ps_reg_03 ps_car_01_cat ps_car_02_cat ps_car_04_cat ps_car_05_cat ps_car_06_cat ps_car_07_cat ps_car_08_cat ps_car_09_cat ps_car_10_cat ps_car_11_cat ps_car_11 ps_car_12 ps_car_13 ps_car_14 ps_car_15 ps_calc_01 ps_calc_02 ps_calc_03 ps_calc_04 ps_calc_05 ps_calc_06 ps_calc_07 ps_calc_08 ps_calc_09 ps_calc_10 ps_calc_11 ps_calc_12 ps_calc_13 ps_calc_14 ps_calc_15_bin ps_calc_16_bin ps_calc_17_bin ps_calc_18_bin ps_calc_19_bin ps_calc_20_bin
0 0 2 2.0 5 1.0 0.0 0 1 0 0 0 0 0 0 0 11 0 1 0 0.7 0.2 0.718070 10.0 1.0 0 1.0 4 1.0 0 0.0 1 12 2.0 0.400000 0.883679 0.370810 3.605551 0.6 0.5 0.2 3 1 10 1 10 1 5 9 1 5 8 0 1 1 0 0 1
1 0 1 1.0 7 0.0 0.0 0 0 1 0 0 0 0 0 0 3 0 0 1 0.8 0.4 0.766078 11.0 1.0 0 1.0 11 1.0 1 2.0 1 19 3.0 0.316228 0.618817 0.388716 2.449490 0.3 0.1 0.3 2 1 9 5 8 1 7 3 1 1 9 0 1 1 0 1 0
In [ ]:
# profile = df_misc.profile_report(html={'style': {'full_width':True}})
# profile.to_file(out_dir + 'df_profile.html')
In [ ]:
dfs = [df[cols_bin + [target]],
       df[cols_cat + [target]],
       df[cols_num + [target]],
       ]
ofiles = ['df_bin', 'df_cat', 'df_num']
ofiles = [out_dir + i +'_profile.html' for i in ofiles]

ofiles[0]
Out[ ]:
'drive/My Drive/Colab Notebooks/outputs/Porto/df_bin_profile.html'
In [ ]:
profile = dfs[0].profile_report(html={'style': {'full_width':True}})
# profile
In [ ]:
profile = dfs[0].profile_report(explorative=True)
# profile
In [ ]:
%%time
for dfx, ofile in zip(dfs, ofiles):
    # each dfx already contains the target column (added when dfs was built),
    # so no extra merge is needed here.
    if not os.path.isfile(ofile):
        profile = dfx.profile_report(html={'style': {'full_width': True}})
        profile.to_file(ofile)
CPU times: user 44.5 ms, sys: 2.98 ms, total: 47.5 ms
Wall time: 48.7 ms

Time Taken

In [ ]:
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-84-3f1957611a72> in <module>()
----> 1 time_taken = time.time() - time_start_notebook
      2 h,m = divmod(time_taken,60*60)
      3 print('Time taken to run whole notebook: {:.0f} hr '      '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))

NameError: name 'time_start_notebook' is not defined
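The NameError is a side effect of restarting the Colab runtime after the pip upgrade (see the note in the setup cell above); a guarded version of the timer cell (a sketch) degrades gracefully:
In [ ]:
# Guarded timer: survives a runtime restart that wipes the namespace.
try:
    time_taken = time.time() - time_start_notebook
    h, m = divmod(time_taken, 60*60)
    print('Time taken to run whole notebook: {:.0f} hr '
          '{:.0f} min {:.0f} secs'.format(h, *divmod(m, 60)))
except NameError:
    print('Runtime was restarted; start time is unavailable.')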