In this project, we predict the probability that an auto insurance policyholder files a claim. This is a binary classification problem.
We have more than half a million records and 59 features (including pre-calculated features).
binary features: _bin
categorical features: _cat
continuous or ordinal features: ind, reg, car, calc
missing values: -1
Full forms:
ind = individual
reg = registration
car = car
calc = calculated
The target column signifies whether or not a claim was filed for that policyholder.
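As a quick illustration of these conventions, the feature groups can be selected by suffix and the -1 placeholders counted (a minimal sketch, assuming df holds the training data that is loaded later in this notebook):
cols_bin = [c for c in df.columns if c.endswith('_bin')]  # binary features
cols_cat = [c for c in df.columns if c.endswith('_cat')]  # categorical features
print(len(cols_bin), 'binary and', len(cols_cat), 'categorical features')
print('cells encoded as missing (-1):', (df == -1).sum().sum())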
import os
import time
import numpy as np
import pandas as pd
import scipy
from scipy import stats
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
time_start_notebook = time.time()
SEED=100
print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
# Google colab
%%capture
# %%capture suppresses the cell output in the notebook
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ### mount google drive
    from google.colab import drive
    drive.mount('/content/drive')

    ### load the data dir
    dat_dir = 'drive/My Drive/Colab Notebooks/data/'
    sys.path.append(dat_dir)

    ### Image dir
    img_dir = 'drive/My Drive/Colab Notebooks/images/'
    if not os.path.isdir(img_dir): os.makedirs(img_dir)
    sys.path.append(img_dir)

    ### Output dir
    out_dir = 'drive/My Drive/Colab Notebooks/outputs/'
    if not os.path.isdir(out_dir): os.makedirs(out_dir)
    sys.path.append(out_dir)

    ### Also install my custom module
    module_dir = 'drive/My Drive/Colab Notebooks/Bhishan_Modules/'
    sys.path.append(module_dir)
    # each ! command runs in its own shell, so a separate !cd has no effect;
    # quote the path (it contains spaces) and install in a single command
    !pip install -e "drive/My Drive/Colab Notebooks/Bhishan_Modules/bhishan"

    ### update pandas-profiling (the runtime must be restarted afterwards)
    ### profile = df_misc.profile_report(html={'style': {'full_width':True}})
    ### profile.to_file(out_dir + 'df_profile.html')
    ### profile.to_widgets() # not supported in Colab; just display `profile`
    !pip install -U pandas-profiling
    import pandas_profiling

    ### print
    print('Environment: Google Colaboratory.')
# NOTE: If we update modules in Google Colab, we need to restart the runtime.
import sklearn
import pandas_profiling
print([(x.__name__, x.__version__) for x in [sklearn, pandas_profiling]])
import bhishan
from bhishan import bp
print(bhishan.__version__)
%load_ext autoreload
%autoreload 2
# note: the kaggle zip file also contains a __MACOSX directory, and pandas fails
# to read such a zip file. To avoid that, unzip the data, then re-zip it from the terminal.
# good: zip train.csv.zip train.csv
# bad: right-click the file and compress (this creates the __MACOSX directory).
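If a zip with the __MACOSX directory inside has already been downloaded, the member file can also be opened explicitly with the standard library (a workaround sketch; 'train.csv.zip' here is a hypothetical local copy):
import zipfile
with zipfile.ZipFile('train.csv.zip') as zf:
    with zf.open('train.csv') as f:   # read only the member we want
        df = pd.read_csv(f)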
df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/'
'Porto_seguro_safe_driver_prediction/train.csv.zip?raw=true',compression='zip')
print(df.shape)
df.head()
df = df.drop('id',axis=1)
"""
Comment about file size:
The data is large: it has 595k records and 59 features.
ps = porto seguro
_bin = binary feature
_cat = categorical feature
continuous or ordinal: ind, reg, car, calc
""";
target = 'target'
df_small = df.sample(frac=0.05,random_state=SEED)
cols_bin = df.filter(regex='_bin$').columns.to_list()
cols_cat = df.filter(regex='_cat$').columns.to_list()
cols_num = [i for i in df.columns if i not in cols_bin+cols_cat+['target'] ]
cols_float = [i for i in cols_num if df[i].dtype== float ]
cols_int = [i for i in cols_num if df[i].dtype== int ]
cols_reg = df.filter(regex='_reg').columns.to_list()
cols_car = df.filter(regex='_car').columns.to_list()
cols_calc = df.filter(regex='_calc').columns.to_list()
print('bin', len(cols_bin), cols_bin)
print('cat', len(cols_cat), cols_cat)
print('num', len(cols_num), cols_num)
print('float', len(cols_float), cols_float)
print('int', len(cols_int), cols_int)
print('reg', len(cols_reg), cols_reg)
print('car', len(cols_car), cols_car)
print('calc', len(cols_calc), cols_calc)
# sanity check: reg+car+calc excludes the 18 ind features and the target
df.shape[1], len(cols_reg + cols_car + cols_calc), len(cols_bin + cols_cat + cols_num)
def create_df_meta(df):
    """Create a metadata dataframe describing each column.

    Reference: https://www.kaggle.com/gpreda/porto-seguro-exploratory-analysis-and-prediction
    """
    data = []
    for col in df.columns:
        # Defining the role
        if col == 'target':
            use = 'target'
        elif col == 'id':
            use = 'id'
        else:
            use = 'input'

        # Defining the type (vartype avoids shadowing the builtin `type`)
        if 'bin' in col or col == 'target':
            vartype = 'binary'
        elif 'cat' in col or col == 'id':
            vartype = 'categorical'
        elif df[col].dtype == float:
            vartype = 'real'
        elif df[col].dtype == int:
            vartype = 'integer'

        # Initialize preserve to True for all variables except id
        preserve = col != 'id'

        # Defining the data type
        dtype = df[col].dtype

        # Defining the category
        category = 'none'
        if 'ind' in col:
            category = 'individual'
        elif 'reg' in col:
            category = 'registration'
        elif 'car' in col:
            category = 'car'
        elif 'calc' in col:
            category = 'calculated'

        # Creating a dict that contains all the metadata for the variable
        col_dict = {
            'varname': col,
            'use': use,
            'type': vartype,
            'preserve': preserve,
            'dtype': dtype,
            'category': category,
        }
        data.append(col_dict)

    df_meta = pd.DataFrame(data, columns=['varname', 'use', 'type',
                                          'preserve', 'dtype', 'category'])
    df_meta.set_index('varname', inplace=True)
    return df_meta
df_meta = create_df_meta(df)
df_meta.head()
df_meta.groupby('category').count()[['use']]
# We have 20 calculated features, 16 car, 18 individual and 3 registration.
df_meta.groupby(['use','type']).count().reset_index().iloc[:,:3]
# NaNs are coded as -1; convert them back to np.nan
# so that we can impute them properly.
df = df.replace(-1,np.nan)
df.isna().sum().sum()
df_missing = df.bp.missing()
cols_drop = ['ps_car_03_cat']
cols_car = [i for i in cols_car if i not in cols_drop]
cols_cat = [i for i in cols_cat if i not in cols_drop]
df_meta.loc[cols_drop, 'preserve'] = False  # mark dropped columns in the metadata
df = df.drop(cols_drop,axis=1)
cols_missing_low = ['ps_car_05_cat',
'ps_reg_03', 'ps_car_14',
'ps_car_07_cat', 'ps_ind_05_cat', 'ps_car_09_cat',
'ps_ind_02_cat', 'ps_car_01_cat', 'ps_ind_04_cat',
'ps_car_02_cat',
'ps_car_11', 'ps_car_12']
cols_missing_num = ['ps_reg_03', 'ps_car_14','ps_car_11', 'ps_car_12' ]
for col in cols_missing_num:
    df[col] = df[col].fillna(df[col].mean())
cols_missing_cat = ['ps_car_05_cat',
'ps_car_07_cat', 'ps_ind_05_cat', 'ps_car_09_cat',
'ps_ind_02_cat', 'ps_car_01_cat', 'ps_ind_04_cat',
'ps_car_02_cat']
for col in cols_missing_cat:
    df[col] = df[col].fillna(df[col].mode()[0])
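As an alternative to the fillna loops above, scikit-learn's SimpleImputer does the same imputation and can reuse the fitted statistics on a test set later (a sketch of an equivalent approach, not run in this notebook):
from sklearn.impute import SimpleImputer
num_imputer = SimpleImputer(strategy='mean')           # mean for numeric columns
cat_imputer = SimpleImputer(strategy='most_frequent')  # mode for categorical columns
df[cols_missing_num] = num_imputer.fit_transform(df[cols_missing_num])
df[cols_missing_cat] = cat_imputer.fit_transform(df[cols_missing_cat])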
df.isna().sum().sum()
df.bp.plot_cat(target,figsize=(8,4))
reg 3
['ps_reg_01', 'ps_reg_02', 'ps_reg_03']
df[cols_reg].head()
df[cols_reg].bp.describe()
# reg 01 and reg 02 take values in steps of 0.1 (e.g. 0.1, 0.2, 0.3, ...).
# reg 01 has only the values 0.0 to 0.9 (no 1.0; only 10 unique values).
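We can verify these observations directly (a quick check of the claims above):
print(sorted(df['ps_reg_01'].dropna().unique()))   # expected: 0.0, 0.1, ..., 0.9
print(df['ps_reg_02'].nunique(), 'unique values in ps_reg_02')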
df[cols_reg].merge(df[target],left_index=True,right_index=True).bp.plot_corr()
car 15 (ps_car_03_cat was dropped above)
['ps_car_01_cat', 'ps_car_02_cat', 'ps_car_04_cat',
 'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat',
 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat',
 'ps_car_11_cat', 'ps_car_11', 'ps_car_12',
 'ps_car_13', 'ps_car_14', 'ps_car_15']
df[cols_car].head()
df[cols_car].bp.describe(sort_col='index')
# Note:
# ps_car_12 values are approximately the square roots of natural numbers, divided by 10.
# ps_car_15 values are the square roots of natural numbers.
sample = df.sample(frac=0.05,random_state=SEED)
var = ['ps_car_12', 'ps_car_15']
sample = sample[var].copy()
sample['ps_car_12_sq'] = sample['ps_car_12'] ** 2 * 10
sample['ps_car_15_sq'] = sample['ps_car_15'] ** 2
sample.head()
sample[['ps_car_12_sq','ps_car_15_sq']].nunique()
df[cols_car].merge(df[target],left_index=True,
right_index=True).bp.plot_corr(xrot=90,figsize=(18,8))
calc 20
['ps_calc_01', 'ps_calc_02', 'ps_calc_03',
'ps_calc_04', 'ps_calc_05', 'ps_calc_06',
'ps_calc_07', 'ps_calc_08', 'ps_calc_09',
'ps_calc_10', 'ps_calc_11', 'ps_calc_12',
'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin',
'ps_calc_16_bin', 'ps_calc_17_bin',
'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin']
df[cols_calc].head()
df[cols_calc].bp.describe()
df_meta.head()
# cols_real_all = df.select_dtypes('float').columns.to_list()
# cols_real = [ i for i in cols_real_all if not i.endswith('_cat')]
# len(cols_real), cols_real
# note: ps_car_11 was integer but after imputing nan, it became float
cols_real = df_meta.query(""" type == 'real' and preserve == True """).index.to_list()
print(len(cols_real), cols_real)
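If the original integer dtype of ps_car_11 matters downstream, it can be restored after the mean imputation (an optional step; rounding the few imputed values is an assumption):
df['ps_car_11'] = df['ps_car_11'].round().astype(int)  # cast back from float to int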
df.bp.compare_kde(cols_real,target,3,4)
# Features whose kde plots clearly differ between the two target classes:
# reg01 reg02 reg03
# car13 car14 car15
# The other real-valued features are not very useful for separating the classes.
df.sample(frac=0.1,random_state=SEED).bp.plot_num('ps_reg_02')
df.sample(frac=0.1,random_state=SEED).bp.plot_num('ps_car_15')
df.sample(frac=0.1,random_state=SEED).bp.plot_num_cat('ps_reg_02','target')
df.sample(frac=0.1,random_state=SEED).bp.plot_num_cat('ps_car_15','target')
df[cols_real].merge(df[target],left_index=True,
right_index=True).bp.plot_corr(xrot=90,figsize=(18,8))
# Look at highly correlated features
df_high_corr = df[cols_real].bp.corr_high(thr=0.4)
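For readers without the bhishan module, an equivalent check in plain pandas (a sketch; the 0.4 threshold matches the call above):
corr = df[cols_real].corr().abs()
mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)          # upper triangle, no diagonal
high_pairs = corr.where(mask).stack().loc[lambda s: s > 0.4]  # pairs above the threshold
print(high_pairs.sort_values(ascending=False))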
cols_high_corr = ['ps_reg_02', 'ps_reg_03', 'ps_car_14', 'ps_car_12',
'ps_reg_01', 'ps_car_13', 'ps_car_15'] + [target]
sample = df_small[cols_high_corr]
sample.head(2)
sns.pairplot(sample, hue='target', palette = 'Set1', diag_kind='kde')
cols_high_corr1 = ['ps_reg_03', 'ps_car_12', 'ps_car_14', 'ps_car_15', 'ps_reg_02', 'ps_car_14']
cols_high_corr2 = ['ps_reg_02', 'ps_car_13', 'ps_car_12', 'ps_car_13', 'ps_reg_01', 'ps_car_13']
print(cols_high_corr1)
print(cols_high_corr2)
# lmplot is a figure-level function; plt.figure() beforehand would only create
# an unused empty figure, so we size the plot via height/aspect instead.
sns.lmplot(x='ps_reg_02',y='ps_reg_03',data=sample,
           hue=target, palette='Set1',
           scatter_kws={'alpha':0.3},
           height=6, aspect=1.5)
plt.xlim(-0.1,2)
plt.tight_layout()
plt.show()
sample.bp.regplot_binn(cols_high_corr1,cols_high_corr2,target,2,3)
# reg02 and reg03 have almost the same relation with the target; one of them may be dropped.
# car12 and car14 have almost the same relation with the target; one of them may be dropped.
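If we act on this, one feature from each redundant pair could be dropped; which member to drop is a judgment call (the choice below is an assumption, and df itself is left untouched):
df_reduced = df.drop(['ps_reg_03', 'ps_car_14'], axis=1)  # keep ps_reg_02 and ps_car_12
print(df_reduced.shape)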
cols_bin = df.filter(regex='_bin$').columns.to_list()
print(len(cols_bin), cols_bin)
df[cols_bin].head(2)
df.bp.plot_cat_cat(cols_bin[0],target)
(df[cols_bin]
.apply(lambda x: x.value_counts(normalize=True))
.T.plot(kind='bar', stacked=True,color=['tomato','teal'])
)
plt.xlabel('Binary Features')
plt.tight_layout()
plt.title('Proportion of Binary Features')
plt.show()
# df.bp.plot_cat_stacked(cols_bin)
len(cols_bin)
df.bp.compare_kde(cols_bin,target,6,3,figsize=(24,18))
cols_cat = df.filter(regex='_cat$').columns.to_list()
print(len(cols_cat), cols_cat)
df[cols_cat].head(2)
df[cols_cat].nunique().sort_values()
cols_cat_small = df[cols_cat].nunique().sort_values().loc[lambda x: x<=10].index.to_list()
print(len(cols_cat_small), cols_cat_small)
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
df.bp.countplot(cols_cat_small,3,4)
df[cols_cat_small].head(2)
We can look at an overview of the data (histograms, missing values, correlations, skewness) using the pandas_profiling module. Bear in mind that it may take a long time to produce results, so a good approach is to save the reports to output files and generate them only once.
import pandas_profiling
out_dir = '../images/Porto/'
if ENV_COLAB:
    out_dir = 'drive/My Drive/Colab Notebooks/outputs'
    out_dir = out_dir + '/Porto/'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
out_dir
df.head(2)
# profile = df_misc.profile_report(html={'style': {'full_width':True}})
# profile.to_file(out_dir + 'df_profile.html')
dfs = [df[cols_bin + [target]],
df[cols_cat + [target]],
df[cols_num + [target]],
]
ofiles = ['df_bin', 'df_cat', 'df_num']
ofiles = [out_dir + i +'_profile.html' for i in ofiles]
ofiles[0]
profile = dfs[0].profile_report(html={'style': {'full_width':True}})
# profile
profile = dfs[0].profile_report(explorative=True)
# profile
%%time
for dfx, ofile in zip(dfs, ofiles):
    # the target column is already included in each dfx (see the dfs list above),
    # so no extra merge is needed here
    if not os.path.isfile(ofile):
        profile = dfx.profile_report(html={'style': {'full_width':True}})
        profile.to_file(ofile)
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))