Description

In this project we use multiclass classification to predict one of the 8 possible values of Response.

The data is taken from the Kaggle Prudential Life Insurance Assessment competition.

Only about 40% of households in the USA have a life insurance policy. Based on an applicant's attributes, one of 8 different quote categories is assigned to each applicant.

Here category 8 has the highest count, so I assume it is the quote that is granted.

Records: ~60k (59,381)
Features: 127
Target: Response (8 categories, 1-8)

Features:

1 Misc             : Ins_Age, Ht, Wt, BMI        4
2 Product Info     : Product_Info_1 to 7         7
3 Employment Info  : Employment_Info_1 to 6      6
4 Insured Info     : InsuredInfo_1 to 7           7
5 Insurance History: Insurance_History_1 to 9     9
6 Family History   : Family_Hist_1 to 5           5
7 Medical History  : Medical_History_1 to 41     41
8 Medical Keywords : Medical_Keyword_1 to 48     48
Target: Response                                  1
ID    : Id                                        1
---------------------------------------------------
Total Features: 127
Dependent Variable: 1 (Response)

Imports

In [278]:
import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True,font_scale=1.5)

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline


# random state
SEED = 100
time_start_notebook = time.time()

# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 50)

import scipy
from scipy import stats
import IPython
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
In [279]:
# Google colab
In [280]:
%%capture
# capture will not print in notebook

import os
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    ### mount google drive
    from google.colab import drive
    drive.mount('/content/drive')

    ### load the data dir
    dat_dir = 'drive/My Drive/Colab Notebooks/data/'
    sys.path.append(dat_dir)

    ### Image dir
    img_dir = 'drive/My Drive/Colab Notebooks/images/'
    if not os.path.isdir(img_dir): os.makedirs(img_dir)
    sys.path.append(img_dir)

    ### Output dir
    out_dir = 'drive/My Drive/Colab Notebooks/outputs/'
    if not os.path.isdir(out_dir): os.makedirs(out_dir)
    sys.path.append(out_dir)

    ### Also install my custom module
    module_dir = 'drive/My Drive/Colab Notebooks/Bhishan_Modules/' 
    sys.path.append(module_dir)
    # each `!` spawns its own shell, so a bare `!cd` does not persist;
    # quote the path (it contains spaces) and install in a single call
    !pip install -e "drive/My Drive/Colab Notebooks/Bhishan_Modules/bhishan"

    #### print
    print('Environment: Google Colaboratory.')

# NOTE: If we update modules in Google Colab, we need to restart the runtime.
In [281]:
if ENV_COLAB:
    out_dir = 'drive/My Drive/Colab Notebooks/outputs'
else:
    out_dir = '../outputs'
    
out_dir = out_dir + '/Prudential/'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

out_dir
Out[281]:
'../outputs/Prudential/'
In [282]:
import gc
from tqdm import tqdm
import functools # functools.lru_cache
from sklearn.mixture import GaussianMixture
from sklearn.cluster import Birch
In [283]:
import bhishan
from bhishan import bp
In [284]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999
In [285]:
%load_ext autoreload
%autoreload 2
%load_ext watermark
%watermark -a "Bhishan Poudel" -dvm
%watermark -iv
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Bhishan Poudel 2020-06-22 

CPython 3.7.7
IPython 7.13.0

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.5.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit
pandas     1.0.3
IPython    7.13.0
scipy      1.4.1
json       2.0.9
autopep8   1.5.2
matplotlib 3.2.1
bhishan    0.3.1
seaborn    0.10.1
numpy      1.18.4

Load the data

In [286]:
df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/Prudential_Insurance/raw/train.csv.zip?raw=true',compression='zip')
print(df.shape)
df.head()
(59381, 128)
Out[286]:
[5 rows × 128 columns: Id, Product_Info_1-7, Ins_Age, Ht, Wt, BMI, Employment_Info_1-6, InsuredInfo_1-7, Insurance_History_1-9, Family_Hist_1-5, Medical_History_1-41, Medical_Keyword_1-48, Response]
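As a quick sanity check of the feature groups listed in the Description, we can count the loaded columns by name prefix. A minimal sketch, assuming `df` from the cell above:

prefix_groups = ['Product_Info', 'Employment_Info', 'InsuredInfo',
                 'Insurance_History', 'Family_Hist',
                 'Medical_History', 'Medical_Keyword']
for p in prefix_groups:
    # count columns whose name starts with the group prefix
    print(f"{p:<18}: {df.filter(regex='^' + p).shape[1]}")
print(f"{'Misc':<18}: {df.columns.isin(['Ins_Age', 'Ht', 'Wt', 'BMI']).sum()}")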
In [287]:
target = 'Response'

Train-Test split with stratify

In [288]:
from sklearn.model_selection import train_test_split

target = 'Response'

df_Xtrain_orig, df_Xtest, ser_ytrain_orig, ser_ytest = train_test_split(
    df.drop(target,axis=1), df[target],
    test_size=0.2, random_state=SEED, stratify=df[target])

df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
    df_Xtrain_orig, ser_ytrain_orig,
    test_size=0.2, random_state=SEED, stratify=ser_ytrain_orig)

print(f"df             : {df.shape}")

print(f"\ndf_Xtrain_orig : {df_Xtrain_orig.shape}")
print(f"ser_ytrain_orig: {ser_ytrain_orig.shape}")

print(f"\ndf_Xtrain      : {df_Xtrain.shape}")
print(f"ser_ytrain     : {ser_ytrain.shape}")

print(f"\ndf_Xvalid      : {df_Xvalid.shape}")
print(f"ser_yvalid     : {ser_yvalid.shape}")

print(f"\ndf_Xtest       : {df_Xtest.shape}")
print(f"ser_ytest      : {ser_ytest.shape}")

df_Xtrain_orig.head(2)
df             : (59381, 128)

df_Xtrain_orig : (47504, 127)
ser_ytrain_orig: (47504,)

df_Xtrain      : (38003, 127)
ser_ytrain     : (38003,)

df_Xvalid      : (9501, 127)
ser_yvalid     : (9501,)

df_Xtest       : (11877, 127)
ser_ytest      : (11877,)
Out[288]:
[2 rows × 127 columns: the same features as above, minus Response]

Target Distribution

In [289]:
df.bp.plot_cat(target)
In [290]:
df.bp.plot_pareto(target)
In [291]:
"""
We can see that class 8 has the highest share (about 33%); we can assume it
represents a clean, accepted insurance policy.

Other classes (1-7) can be assumed to be rejected, or accepted with some conditions.


""";
In [292]:
df_desc = df.bp.get_column_descriptions(transpose=True)
df_desc
Out[292]:
[transposed column-description table, one column per feature: Feature, Type, Count, Unique, Missing, MissingPct, Zeros, ZerosPct plus the usual describe() statistics. Highlights: Medical_History_10 is 99.06% missing, Medical_History_32 98.14%, Medical_History_24 93.60%, Medical_History_15 75.10%; Family_Hist_2-5, Insurance_History_5, Employment_Info_1/4/6 and Medical_History_1 also contain missing values; most Medical_Keyword_* columns are more than 90% zeros.]

Create Binary Classification Variable

Our target has 8 classes. Looking at the class distribution, class 8 has the highest count, so we can treat it as 1 and all the other classes as 0.

In [293]:
cond = df[target] == 8
df['Response8'] = np.where(cond,1,0)

df.filter(regex='Resp').head()
Out[293]:
Response Response8
0 8 1
1 4 0
2 8 1
3 8 1
4 8 1

Feature Engineering

Data Types

In [294]:
df.shape
Out[294]:
(59381, 129)
In [295]:
df.select_dtypes('number').shape
Out[295]:
(59381, 128)
In [296]:
df.select_dtypes('object').head()
Out[296]:
Product_Info_2
0 D3
1 A1
2 E1
3 D4
4 D2
In [297]:
"""
One of the features, Product_Info_2, is of object dtype.
We create new features from it.

""";
In [298]:
df['Product_Info_2_char'] = df['Product_Info_2'].str[0]
df['Product_Info_2_num'] = df['Product_Info_2'].str[1].astype(int)
In [299]:
df.Product_Info_2_num.isna().sum() # good: we don't have NaNs.
Out[299]:
0
In [300]:
df.filter(regex='_Info_2').head(2)
Out[300]:
Product_Info_2 Employment_Info_2 Product_Info_2_char Product_Info_2_num
0 D3 12 D 3
1 A1 1 A 1
In [301]:
df = df.drop('Product_Info_2',axis=1)
df.filter(regex='_Info_2').head(2)
Out[301]:
Employment_Info_2 Product_Info_2_char Product_Info_2_num
0 12 D 3
1 1 A 1

Continuous Variable: square, cube, multiplication

In [302]:
"""
Some important continuous features:
- Ins_Age, BMI, Ht, Wt

"""

df['Ins_Age_sq'] = df['Ins_Age'] * df['Ins_Age']
df['Ht_sq'] = df['Ht'] * df['Ht']
df['Wt_sq'] = df['Wt'] * df['Wt']
df['BMI_sq'] = df['BMI'] * df['BMI']

df['Ins_Age_cu'] = df['Ins_Age'] * df['Ins_Age'] * df['Ins_Age']
df['Ht_cu'] = df['Ht'] * df['Ht'] * df['Ht']
df['Wt_cu'] = df['Wt'] * df['Wt'] * df['Wt']
df['BMI_cu'] = df['BMI'] * df['BMI'] * df['BMI']

df['Age_Ht'] = df['Ins_Age'] * df['Ht']
df['Age_Wt'] = df['Ins_Age'] * df['Wt']
df['Age_BMI'] = df['Ins_Age'] * df['BMI']
In [303]:
df.iloc[:2,-10:]
Out[303]:
Ht_sq Wt_sq BMI_sq Ins_Age_cu Ht_cu Wt_cu BMI_cu Age_Ht Age_Wt Age_BMI
0 0.338512 0.022063 0.104334 0.264351 0.196953 0.003277 0.033701 0.373406 0.095329 0.207304
1 0.360000 0.017371 0.074141 0.000213 0.216000 0.002289 0.020188 0.035821 0.007869 0.016256

Continuous Variable: Quantile Binning

In [304]:
def quantile_binning(ser):
    conditions = [
        (ser <= ser.quantile(0.25)),
        (ser > ser.quantile(0.25)) & (ser <= ser.quantile(0.75)),
        (ser > ser.quantile(0.75))]

    choices = ['low', 'medium', 'high']
    return np.select(conditions, choices)
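For reference, pd.qcut gives an equivalent three-way split in one call; a sketch (not used below), with cut points at the 25th and 75th percentiles mirroring quantile_binning:

def quantile_binning_qcut(ser):
    # (0, .25] -> low, (.25, .75] -> medium, (.75, 1] -> high
    return pd.qcut(ser, q=[0, 0.25, 0.75, 1.0],
                   labels=['low', 'medium', 'high'])

One caveat: pd.qcut raises on duplicate quantile edges, which the hand-rolled np.select version tolerates.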
In [305]:
df['Age_cat'] = quantile_binning(df['Ins_Age'])
df['Ht_cat'] = quantile_binning(df['Ht'])
df['Wt_cat'] = quantile_binning(df['Wt'])
df['BMI_cat'] = quantile_binning(df['BMI'])
In [306]:
quantile_cat = df.filter(regex='_cat$').columns.to_list()
print(quantile_cat)
df.filter(regex='_cat$').head(2)
['Age_cat', 'Ht_cat', 'Wt_cat', 'BMI_cat']
Out[306]:
Age_cat Ht_cat Wt_cat BMI_cat
0 high low low low
1 low low low low
In [307]:
df_resp8_0 = df.loc[df['Response8']==0]
df_resp8_1 = df.loc[df['Response8']==1]
mapping = {'low':0, 'medium':1,'high':2}
target8 = "Response8"

fig, axes = plt.subplots(2,2,figsize=(24,18))
for i,col in enumerate(quantile_cat):
    i+=1
    plt.subplot(2,2,i)

    x0 = df_resp8_0[col].replace(mapping).to_numpy()
    x1 = df_resp8_1[col].replace(mapping).to_numpy()

    # an overlap measure of the two class distributions is
    # sketched after this cell

    sns.kdeplot(x0, bw=0.5,label=f"{target8} = 0",shade=1)
    sns.kdeplot(x1, bw=0.5,label=f"{target8} = 1",shade=1)

    plt.xlabel(col, fontsize=18)
    plt.legend(loc='upper right',fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()
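The overlap measure referenced in the loop above can be made concrete as the shared area of the two normalized histograms; a sketch (the function name and bin count are my choices):

def overlap_coefficient(x0, x1, bins=20):
    """Shared area of two normalized histograms; a value near 1.0 means
    the class distributions are indistinguishable on this feature."""
    lo, hi = min(x0.min(), x1.min()), max(x0.max(), x1.max())
    h0, edges = np.histogram(x0, bins=bins, range=(lo, hi), density=True)
    h1, _ = np.histogram(x1, bins=bins, range=(lo, hi), density=True)
    return np.minimum(h0, h1).sum() * (edges[1] - edges[0])

# e.g. overlap_coefficient(x0, x1) inside the loop above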
In [308]:
sns.scatterplot(data=df,x='BMI',y='Wt',hue='Response8',alpha=1)
# when both BMI and Wt are high, the applicant is most likely rejected.
Out[308]:
<matplotlib.axes._subplots.AxesSubplot at 0x7febb0c85f90>
In [309]:
# df.bp.plot_cat_cat('Age_cat','Response8')
# df.bp.plot_cat_cat('Ht_cat','Response8')
# df.bp.plot_cat_cat('Wt_cat','Response8')
df.bp.plot_cat_cat('BMI_cat','Response8')
==================================================
Feature: **BMI_cat**
Overall Count: 
    medium: 50.04%
    low: 25.01%
    high: 24.94%

Total  **Response8_1** distribution:
    medium: 53.51%
    low: 45.08%
    high: 1.41%

Per BMI_cat  **Response8_1** distribution:
    low: 59.15%
    medium: 35.09%
    high: 1.86%
In [310]:
"""
Looking at the four figures above:

Age: when age is high/medium ==> more rejection

Ht : height category is not very useful; no single category separates the target cleanly.

Wt : Wt==high is an important feature (almost all rejected)

BMI: BMI==high is an important feature (almost all rejected)
""";
In [311]:
cat = 'BMI_cat'
df1 = df[[cat,'Response8']]
df1 = df1[df1[cat]=='high']
df1['Response8'].value_counts(normalize=True)
Out[311]:
0    0.981433
1    0.018567
Name: Response8, dtype: float64
In [312]:
pd.crosstab(df['BMI_cat'],df['Response8'],normalize='index')
Out[312]:
Response8 0 1
BMI_cat
high 0.981433 0.018567
low 0.408470 0.591530
medium 0.649090 0.350910
In [313]:
df.filter(regex='_cat$').head(2)
Out[313]:
Age_cat Ht_cat Wt_cat BMI_cat
0 high low low low
1 low low low low
In [314]:
# when a feature falls in the medium category, the applicant may be less risky.
def risk_medium_bool(key):
    cond = (df[key] == 'medium')
    return np.where(cond,0,1)
In [315]:
df['risk_Age_medium_bool'] = risk_medium_bool('Age_cat')
df['risk_Ht_medium_bool'] = risk_medium_bool('Ht_cat')
df['risk_Wt_medium_bool'] = risk_medium_bool('Wt_cat')
df['risk_BMI_medium_bool'] = risk_medium_bool('BMI_cat')
In [316]:
def risk_at_least_one(key):
    cond = ((df['Age_cat'] == key) | 
            (df['Ht_cat']  == key) |
            (df['Wt_cat']  == key) |
            (df['BMI_cat'] == key))

    return np.where(cond,1,0)
In [317]:
df['risk_at_least_one_low'] = risk_at_least_one('low') # possible risk
df['risk_at_least_one_high'] = risk_at_least_one('high') # possible risk
In [318]:
cond = ((df['Age_cat'] == 'high') |
        (df['Wt_cat']  == 'high') |
        (df['BMI_cat'] == 'high'))

df['risk_extreme'] = np.where(cond,1,0)
In [319]:
df.filter(regex='_cat$|^risk_').head()
Out[319]:
Age_cat Ht_cat Wt_cat BMI_cat risk_Age_medium_bool risk_Ht_medium_bool risk_Wt_medium_bool risk_BMI_medium_bool risk_at_least_one_low risk_at_least_one_high risk_extreme
0 high low low low 1 1 1 1 1 1 1
1 low low low low 1 1 1 1 1 0 0
2 low medium medium medium 1 0 0 0 1 0 0
3 low medium low low 1 0 1 1 1 0 0
4 medium low medium medium 0 1 0 0 1 0 0
In [320]:
cols_risk_bool = df.filter(regex='^risk_.*_bool$').columns.to_list()
cols_risk_bool
Out[320]:
['risk_Age_medium_bool',
 'risk_Ht_medium_bool',
 'risk_Wt_medium_bool',
 'risk_BMI_medium_bool']
In [321]:
df_resp8_0 = df.loc[df['Response8']==0]
df_resp8_1 = df.loc[df['Response8']==1]
target8 = "Response8"

fig, axes = plt.subplots(2,2,figsize=(24,18))
for i,col in enumerate(cols_risk_bool):
    i+=1
    plt.subplot(2,2,i)

    x0 = df_resp8_0[col]
    x1 = df_resp8_1[col]

    sns.kdeplot(x0, bw=0.5,label=f"{target8} = 0",shade=1)
    sns.kdeplot(x1, bw=0.5,label=f"{target8} = 1",shade=1)

    plt.xlabel(col, fontsize=18)
    plt.legend(loc='upper right',fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()
In [322]:
# drop unwanted columns
In [323]:
# risk_Age_medium_bool, risk_Ht_medium_bool, and risk_Wt_medium_bool are not useful
df = df.drop(['risk_Age_medium_bool','risk_Ht_medium_bool', 'risk_Wt_medium_bool'],axis=1)

Medical Keyword Features

In [324]:
df.filter(regex='Med').head(2)
Out[324]:
[2 rows × 89 columns: Medical_History_1-41 and Medical_Keyword_1-48]
In [325]:
df.filter(regex='Medical_Key').head(2)
Out[325]:
[2 rows × 48 columns: Medical_Keyword_1-48, all 0 for these two rows]
In [326]:
"""
There are 48 medical keyword features.
We will create new features from a few row-wise statistics such as
count, min, max, mean, std, skew, kurtosis, and median.

""";
In [327]:
cols_med_kw = df.filter(regex='Medical_Keyword').columns
cols_med_kw
Out[327]:
Index(['Medical_Keyword_1', 'Medical_Keyword_2', 'Medical_Keyword_3',
       'Medical_Keyword_4', 'Medical_Keyword_5', 'Medical_Keyword_6',
       'Medical_Keyword_7', 'Medical_Keyword_8', 'Medical_Keyword_9',
       'Medical_Keyword_10', 'Medical_Keyword_11', 'Medical_Keyword_12',
       'Medical_Keyword_13', 'Medical_Keyword_14', 'Medical_Keyword_15',
       'Medical_Keyword_16', 'Medical_Keyword_17', 'Medical_Keyword_18',
       'Medical_Keyword_19', 'Medical_Keyword_20', 'Medical_Keyword_21',
       'Medical_Keyword_22', 'Medical_Keyword_23', 'Medical_Keyword_24',
       'Medical_Keyword_25', 'Medical_Keyword_26', 'Medical_Keyword_27',
       'Medical_Keyword_28', 'Medical_Keyword_29', 'Medical_Keyword_30',
       'Medical_Keyword_31', 'Medical_Keyword_32', 'Medical_Keyword_33',
       'Medical_Keyword_34', 'Medical_Keyword_35', 'Medical_Keyword_36',
       'Medical_Keyword_37', 'Medical_Keyword_38', 'Medical_Keyword_39',
       'Medical_Keyword_40', 'Medical_Keyword_41', 'Medical_Keyword_42',
       'Medical_Keyword_43', 'Medical_Keyword_44', 'Medical_Keyword_45',
       'Medical_Keyword_46', 'Medical_Keyword_47', 'Medical_Keyword_48'],
      dtype='object')
In [328]:
df['Medical_Keyword_count'] = df[cols_med_kw].sum(axis=1)
df['Medical_Keyword_min'] = df[cols_med_kw].min(axis=1)
df['Medical_Keyword_max'] = df[cols_med_kw].max(axis=1)
df['Medical_Keyword_mean'] = df[cols_med_kw].mean(axis=1)
df['Medical_Keyword_std'] = df[cols_med_kw].std(axis=1)
df['Medical_Keyword_skew'] = df[cols_med_kw].skew(axis=1)
df['Medical_Keyword_kurtosis'] = df[cols_med_kw].kurtosis(axis=1)
df['Medical_Keyword_median'] = df[cols_med_kw].median(axis=1)
In [329]:
cols_med_kw_agg = df.filter(regex=r'^Medical_Keyword_\D').columns.to_list()
cols_med_kw_agg
Out[329]:
['Medical_Keyword_count',
 'Medical_Keyword_min',
 'Medical_Keyword_max',
 'Medical_Keyword_mean',
 'Medical_Keyword_std',
 'Medical_Keyword_skew',
 'Medical_Keyword_kurtosis',
 'Medical_Keyword_median']
In [330]:
df[cols_med_kw_agg].describe()
Out[330]:
Medical_Keyword_count Medical_Keyword_min Medical_Keyword_max Medical_Keyword_mean Medical_Keyword_std Medical_Keyword_skew Medical_Keyword_kurtosis Medical_Keyword_median
count 59381.000000 59381.0 59381.000000 59381.000000 59381.000000 59381.000000 59381.000000 59381.0
mean 1.264765 0.0 0.603931 0.026349 0.118569 3.201537 18.044050 0.0
std 1.480236 0.0 0.489083 0.030838 0.105694 2.872417 19.718855 0.0
min 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 -1.532609 0.0
25% 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.0
50% 1.000000 0.0 1.000000 0.020833 0.144338 3.732451 12.448567 0.0
75% 2.000000 0.0 1.000000 0.041667 0.201941 6.928203 48.000000 0.0
max 16.000000 0.0 1.000000 0.333333 0.476393 6.928203 48.000000 0.0
In [331]:
df[cols_med_kw_agg].nunique()
Out[331]:
Medical_Keyword_count        16
Medical_Keyword_min           1
Medical_Keyword_max           2
Medical_Keyword_mean         16
Medical_Keyword_std          42
Medical_Keyword_skew         99
Medical_Keyword_kurtosis    122
Medical_Keyword_median        1
dtype: int64
In [332]:
# drop features that have only one value
cols_drop = df[cols_med_kw_agg].nunique().loc[lambda x: x==1].index.to_list()
cols_drop
Out[332]:
['Medical_Keyword_min', 'Medical_Keyword_median']
In [333]:
df = df.drop(cols_drop, axis=1)

Target Based Features

There is a high risk of data leakage when using target-based features; however, we can always try them and see. A leakage-aware sketch is given below.
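If target-based features are tried, the standard guard against leakage is out-of-fold encoding: each row is encoded with a target mean computed on the other folds only. A generic sketch (the function name is mine; this is not what the notebook does):

from sklearn.model_selection import KFold

def oof_target_mean(frame, col, target, n_splits=5, seed=SEED):
    """Encode `col` with the target mean computed out-of-fold."""
    enc = pd.Series(np.nan, index=frame.index)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for fit_idx, enc_idx in kf.split(frame):
        means = frame.iloc[fit_idx].groupby(col)[target].mean()
        enc.iloc[enc_idx] = frame[col].iloc[enc_idx].map(means).values
    # levels unseen in a fold fall back to the global mean
    return enc.fillna(frame[target].mean())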

In [334]:
df[cols_med_kw].head(2)
Out[334]:
[2 rows × 48 columns: Medical_Keyword_1-48, all 0 for these two rows]
In [335]:
# df[cols_med_kw].apply(lambda x: x.nunique())
# all these features are binary.
In [336]:
# df['target_min_med_kw'] = df.groupby(cols_med_kw.to_list())[target].transform('min')
# df['target_mean_med_kw'] = df.groupby(cols_med_kw.to_list())[target].transform('mean')

# I don't like target-based features.

Missing Values Imputation

In [337]:
df.isna().sum().sum()
Out[337]:
393103
In [338]:
df['nan_count'] = df.isna().sum(axis=1)
df.iloc[:2,-2:]
Out[338]:
Medical_Keyword_kurtosis nan_count
0 0.0 6
1 0.0 5
In [339]:
df.bp.missing()
Number of missing values features: 13
cols_missing_high = ['Medical_History_10', 'Medical_History_32', 'Medical_History_24']
cols_missing_low = ['Medical_History_15', 'Family_Hist_5', 'Family_Hist_3', 'Family_Hist_2', 'Insurance_History_5', 'Family_Hist_4', 'Employment_Info_6', 'Medical_History_1', 'Employment_Info_4', 'Employment_Info_1']
Out[339]:
Feature Type Count Missing Zeros Unique MissingPct ZerosPct count mean std min 25% 50% 75% max
46 Medical_History_10 float64 59381 58824 75 103 99.060000 0.130000 557.000000 141.118492 107.759559 0.000000 8.000000 229.000000 240.000000 240.000000
68 Medical_History_32 float64 59381 58274 744 95 98.140000 1.250000 1107.000000 11.965673 38.718774 0.000000 0.000000 0.000000 2.000000 240.000000
60 Medical_History_24 float64 59381 55580 769 227 93.600000 1.300000 3801.000000 50.635622 78.149069 0.000000 1.000000 8.000000 64.000000 240.000000
51 Medical_History_15 float64 59381 44596 2135 241 75.100000 3.600000 14785.000000 123.760974 98.516206 0.000000 17.000000 117.000000 240.000000 240.000000
36 Family_Hist_5 float64 59381 41811 3 90 70.410000 0.010000 17570.000000 0.484635 0.129200 0.000000 0.401786 0.508929 0.580357 1.000000
34 Family_Hist_3 float64 59381 34241 4 90 57.660000 0.010000 25140.000000 0.497737 0.140187 0.000000 0.401961 0.519608 0.598039 1.000000
33 Family_Hist_2 float64 59381 28656 1 68 48.260000 0.000000 30725.000000 0.474550 0.154959 0.000000 0.362319 0.463768 0.579710 1.000000
28 Insurance_History_5 float64 59381 25396 19 2265 42.770000 0.030000 33985.000000 0.001733 0.007338 0.000000 0.000400 0.000973 0.002000 1.000000
35 Family_Hist_4 float64 59381 19184 9 68 32.310000 0.020000 40197.000000 0.444890 0.163012 0.000000 0.323944 0.422535 0.563380 0.943662
16 Employment_Info_6 float64 59381 10854 4042 992 18.280000 6.810000 48527.000000 0.361469 0.349551 0.000000 0.060000 0.250000 0.550000 1.000000
37 Medical_History_1 float64 59381 8889 4789 171 14.970000 8.060000 50492.000000 7.962172 13.027697 0.000000 2.000000 4.000000 9.000000 240.000000
14 Employment_Info_4 float64 59381 6779 44659 871 11.420000 75.210000 52602.000000 0.006283 0.032816 0.000000 0.000000 0.000000 0.000000 1.000000
11 Employment_Info_1 float64 59381 19 3688 1936 0.030000 6.210000 59362.000000 0.077582 0.082347 0.000000 0.035000 0.060000 0.100000 1.000000
In [340]:
# remove features with >= 80% missing values
cols_missing_high = ['Medical_History_10', 
                     'Medical_History_32',
                     'Medical_History_24']
df = df.drop(cols_missing_high,axis=1)
In [341]:
cols_missing_low = ['Medical_History_15', 'Family_Hist_5',
                    'Family_Hist_3', 'Family_Hist_2',
                    'Insurance_History_5', 'Family_Hist_4',
                    'Employment_Info_6', 'Medical_History_1',
                    'Employment_Info_4', 'Employment_Info_1']
In [342]:
df = df.fillna(-1)
In [343]:
df.isna().sum().sum()
Out[343]:
0
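The blanket fillna(-1) is a reasonable sentinel for tree models. An alternative for the low-missingness continuous columns is a median fill; a sketch (it would have to run before the fillna(-1) cell above, while the NaNs are still present):

# median-impute the low-missingness columns, keeping a sentinel
# only where missingness itself might be informative
medians = df[cols_missing_low].median()
df[cols_missing_low] = df[cols_missing_low].fillna(medians)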

Categorical Encoding

The following variables are all categorical (nominal):

Product_Info_1, Product_Info_2, Product_Info_3, Product_Info_5, Product_Info_6, Product_Info_7, Employment_Info_2, Employment_Info_3, Employment_Info_5, InsuredInfo_1, InsuredInfo_2, InsuredInfo_3, InsuredInfo_4, InsuredInfo_5, InsuredInfo_6, InsuredInfo_7, Insurance_History_1, Insurance_History_2, Insurance_History_3, Insurance_History_4, Insurance_History_7, Insurance_History_8, Insurance_History_9, Family_Hist_1, Medical_History_2, Medical_History_3, Medical_History_4, Medical_History_5, Medical_History_6, Medical_History_7, Medical_History_8, Medical_History_9, Medical_History_11, Medical_History_12, Medical_History_13, Medical_History_14, Medical_History_16, Medical_History_17, Medical_History_18, Medical_History_19, Medical_History_20, Medical_History_21, Medical_History_22, Medical_History_23, Medical_History_25, Medical_History_26, Medical_History_27, Medical_History_28, Medical_History_29, Medical_History_30, Medical_History_31, Medical_History_33, Medical_History_34, Medical_History_35, Medical_History_36, Medical_History_37, Medical_History_38, Medical_History_39, Medical_History_40, Medical_History_41

The following variables are continuous:

Product_Info_4, Ins_Age, Ht, Wt, BMI, Employment_Info_1, Employment_Info_4, Employment_Info_6, Insurance_History_5, Family_Hist_2, Family_Hist_3, Family_Hist_4, Family_Hist_5

The following variables are discrete:

Medical_History_1, Medical_History_10, Medical_History_15, Medical_History_24, Medical_History_32

Medical_Keyword_1-48 are dummy variables.

Quantitative variables can be classified as discrete or continuous.

Categorical variable definition

Categorical variables contain a finite number of categories or distinct groups. Categorical data might not have a logical order. For example, categorical predictors include gender, material type, and payment method.

Discrete variable definition
Discrete variables are numeric variables that have a countable number of values between any two values. A discrete variable is always numeric. For example, the number of customer complaints or the number of flaws or defects.
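The grouping above can be roughly sanity-checked from dtypes and cardinalities; a heuristic sketch (the cardinality threshold is my assumption, not a rule):

def guess_kind(ser, max_levels=40):
    # object dtype -> categorical; low-cardinality numeric -> categorical/discrete
    if ser.dtype == object:
        return 'categorical'
    if ser.nunique() <= max_levels:
        return 'categorical/discrete'
    return 'continuous' if ser.dtype == float else 'discrete'

print(df.drop(['Id', 'Response', 'Response8'], axis=1)
        .apply(guess_kind).value_counts())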

In [344]:
# df.dtypes.loc[lambda x: x=='object']
In [345]:
df.select_dtypes('object').head(2)
Out[345]:
Product_Info_2_char Age_cat Ht_cat Wt_cat BMI_cat
0 D high low low low
1 A low low low low
In [346]:
df['Product_Info_2_char'].nunique() # with this few categories we can use one-hot encoding
Out[346]:
5
In [347]:
cols_discrete = ['Medical_History_1', 'Medical_History_10',
                     'Medical_History_15', 'Medical_History_24',
                     'Medical_History_32']

cols_discrete = [i for i in cols_discrete if i not in cols_missing_high]
df[cols_discrete].head()
Out[347]:
Medical_History_1 Medical_History_15
0 4.0 240.0
1 5.0 0.0
2 10.0 -1.0
3 0.0 -1.0
4 -1.0 -1.0
In [348]:
df[cols_discrete].nunique()
Out[348]:
Medical_History_1     172
Medical_History_15    242
dtype: int64
In [349]:
# df_copy = df.copy()
In [350]:
col = 'Medical_History_1'
# df[col].value_counts()
In [351]:
col = 'Medical_History_1'
top_col = df[col].value_counts()[lambda x: x> 10].index

df.loc[~df[col].isin(top_col),col] = -2
In [352]:
col = 'Medical_History_15'
# df[col].value_counts().head(200).reset_index()
In [353]:
# the value count is steadily decreasing, there is no obvious cut-off point.
In [354]:
cols_cat = ['Product_Info_1', 'Product_Info_2',
            'Product_Info_3', 'Product_Info_5',
            'Product_Info_6', 'Product_Info_7',
            'Employment_Info_2', 'Employment_Info_3',
            'Employment_Info_5', 'InsuredInfo_1',
            'InsuredInfo_2', 'InsuredInfo_3',
            'InsuredInfo_4', 'InsuredInfo_5',
            'InsuredInfo_6', 'InsuredInfo_7', 
            'Insurance_History_1', 'Insurance_History_2', 
            'Insurance_History_3', 'Insurance_History_4',
            'Insurance_History_7', 'Insurance_History_8',
            'Insurance_History_9', 'Family_Hist_1',
            'Medical_History_2', 'Medical_History_3',
            'Medical_History_4', 'Medical_History_5', 
            'Medical_History_6', 'Medical_History_7',
            'Medical_History_8', 'Medical_History_9', 
            'Medical_History_11', 'Medical_History_12',
            'Medical_History_13', 'Medical_History_14', 
            'Medical_History_16', 'Medical_History_17', 
            'Medical_History_18', 'Medical_History_19',
            'Medical_History_20', 'Medical_History_21',
            'Medical_History_22', 'Medical_History_23',
            'Medical_History_25', 'Medical_History_26', 
            'Medical_History_27', 'Medical_History_28', 
            'Medical_History_29', 'Medical_History_30', 
            'Medical_History_31', 'Medical_History_33',
            'Medical_History_34', 'Medical_History_35', 
            'Medical_History_36', 'Medical_History_37', 
            'Medical_History_38', 'Medical_History_39', 
            'Medical_History_40', 'Medical_History_41']

cols_cat = [i for i in cols_cat if i not in ['Product_Info_2']]
# df[cols_cat].nunique()
In [355]:
df[cols_cat].nunique().loc[lambda x: x>10]
Out[355]:
Product_Info_3        34
Employment_Info_2     36
InsuredInfo_3         11
Medical_History_2    579
dtype: int64
In [356]:
# df['Medical_History_2'].value_counts().head(200)
In [357]:
col = 'Medical_History_2'
top_col = df[col].value_counts()[lambda x: x> 100].index
df.loc[~df[col].isin(top_col),col] = -2
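This rare-level grouping is now done twice (Medical_History_1 with a cutoff of 10, Medical_History_2 with 100), so it could be factored into a helper; a sketch mirroring the cells above (the name and the -2 sentinel are taken from them):

def group_rare_levels(frame, col, min_count, sentinel=-2):
    """Replace levels occurring min_count times or fewer with a sentinel."""
    keep = frame[col].value_counts()[lambda x: x > min_count].index
    frame.loc[~frame[col].isin(keep), col] = sentinel
    return frame

# df = group_rare_levels(df, 'Medical_History_1', 10)
# df = group_rare_levels(df, 'Medical_History_2', 100)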
In [358]:
# df.nunique().loc[lambda x: x>90]
In [359]:
cols = df.nunique().loc[lambda x: x>90].index.to_list()
# df[cols].dtypes
df[cols].head(2)
Out[359]:
Id Product_Info_4 Wt BMI Employment_Info_1 Employment_Info_4 Employment_Info_6 Insurance_History_5 Family_Hist_3 Family_Hist_5 Medical_History_15 Wt_sq BMI_sq Wt_cu BMI_cu Age_Ht Age_Wt Age_BMI Medical_Keyword_skew Medical_Keyword_kurtosis
0 2 0.076923 0.148536 0.323008 0.028 0.0 -1.0000 0.000667 0.598039 0.526786 240.0 0.022063 0.104334 0.003277 0.033701 0.373406 0.095329 0.207304 0.0 0.0
1 5 0.076923 0.131799 0.272288 0.000 0.0 0.0018 0.000133 -1.000000 -1.000000 0.0 0.017371 0.074141 0.002289 0.020188 0.035821 0.007869 0.016256 0.0 0.0

One hot encode categorical features

In [360]:
risk_cat = ['Age_cat', 'Ht_cat', 'Wt_cat', 'BMI_cat']
cols_cat_all = cols_cat + cols_discrete + risk_cat + ['Product_Info_2_char']
print(f"Number of categorical features: {len(cols_cat_all)}")
Number of categorical features: 66
In [361]:
# df[cols_cat].nunique()
In [362]:
print(f"Shape of df before encoding: {df.shape}")
Shape of df before encoding: (59381, 153)
In [363]:
df_encoded = pd.get_dummies(df,columns=cols_cat_all,drop_first=True)
In [364]:
print(f"Shape of df after encoding: {df_encoded.shape}")
Shape of df after encoding: (59381, 640)
In [365]:
df_encoded.select_dtypes('object').shape
Out[365]:
(59381, 0)
In [366]:
df_encoded.sum().sum() # this needs to be a number (all columns are now numeric)
Out[366]:
2351103715.742992

Normalize Numerical Features

The numerical features in this dataset are already normalized to the [0, 1] range, so I will not log-transform or rescale them; see the sketch below for how scaling would look if it were needed.
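If rescaling were needed (e.g. for linear models), the scalers imported at the top would be fit on the training split only to avoid leakage; an illustrative sketch, not applied here:

scaler = StandardScaler()  # or MinMaxScaler / RobustScaler
cols_scale = ['Ins_Age', 'Ht', 'Wt', 'BMI']
df_Xtrain[cols_scale] = scaler.fit_transform(df_Xtrain[cols_scale])
df_Xvalid[cols_scale] = scaler.transform(df_Xvalid[cols_scale])
df_Xtest[cols_scale]  = scaler.transform(df_Xtest[cols_scale])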

Correlations

In [367]:
df.filter(regex='Resp').head(2)
Out[367]:
Response Response8
0 8 1
1 4 0
In [368]:
df.head(2)
Out[368]:
[2 rows × 153 columns: the original features plus the engineered columns (Response8, Product_Info_2_char, Product_Info_2_num, squared/cubed and interaction terms, *_cat bins, risk_* flags, Medical_Keyword aggregates, nan_count)]
In [369]:
cols_drop = ['Id','Response','Response8']

(df.drop(cols_drop,axis=1)
  .corrwith(df[target8]).abs()
  .to_frame()
  .sort_values([0],ascending=False).T)
Out[369]:
BMI Wt BMI_sq risk_extreme Wt_sq risk_at_least_one_high BMI_cu Age_Wt Age_BMI Wt_cu Medical_History_23 Medical_Keyword_15 Medical_Keyword_std Medical_Keyword_mean Medical_Keyword_count Medical_History_4 Medical_Keyword_max risk_at_least_one_low Age_Ht Ins_Age Ins_Age_sq Ins_Age_cu InsuredInfo_6 Medical_Keyword_skew Product_Info_4 Family_Hist_2 Family_Hist_3 Family_Hist_5 Medical_Keyword_3 Family_Hist_4 Ht_sq Ht Ht_cu Medical_History_39 nan_count Medical_Keyword_25 Medical_Keyword_48 Medical_History_6 Medical_History_16 InsuredInfo_1 Medical_Keyword_23 Medical_History_33 Medical_Keyword_kurtosis Family_Hist_1 Product_Info_3 Medical_History_30 Employment_Info_5 Medical_History_29 Medical_History_40 Medical_History_13 Medical_History_15 Medical_Keyword_1 Insurance_History_2 Medical_Keyword_24 Medical_History_1 InsuredInfo_7 InsuredInfo_2 Medical_Keyword_22 Medical_Keyword_38 Medical_History_9 Medical_Keyword_10 Medical_Keyword_42 Medical_History_17 Medical_Keyword_33 Employment_Info_3 Medical_Keyword_35 InsuredInfo_5 Insurance_History_4 Medical_Keyword_40 Medical_History_18 Medical_History_21 Medical_Keyword_37 risk_BMI_medium_bool Medical_Keyword_46 Medical_History_12 Medical_History_27 Medical_History_34 Insurance_History_1 Insurance_History_9 Insurance_History_3 Medical_History_8 Medical_History_20 Insurance_History_5 Insurance_History_7 Medical_History_5 Medical_Keyword_27 Medical_Keyword_47 Medical_Keyword_31 Medical_History_38 Medical_History_28 InsuredInfo_4 Employment_Info_4 Medical_Keyword_4 Medical_Keyword_43 Medical_Keyword_13 Medical_Keyword_19 Medical_Keyword_36 Medical_Keyword_16 Medical_Keyword_28 Medical_History_14 Medical_History_31 Medical_History_41 Medical_History_22 Product_Info_2_num Employment_Info_2 Medical_Keyword_14 Medical_Keyword_18 Medical_Keyword_7 Product_Info_6 Insurance_History_8 Medical_Keyword_9 Medical_History_26 Medical_History_25 Medical_Keyword_21 Medical_Keyword_12 Medical_Keyword_5 Medical_History_35 Product_Info_1 Medical_Keyword_26 Product_Info_5 Medical_Keyword_34 Medical_Keyword_39 Medical_History_36 Medical_History_37 Medical_Keyword_44 Medical_History_11 Medical_Keyword_29 Medical_Keyword_11 Medical_Keyword_8 Medical_Keyword_2 Medical_Keyword_6 Employment_Info_6 Medical_Keyword_41 Medical_Keyword_17 Employment_Info_1 Medical_History_2 Medical_Keyword_30 Medical_History_7 Medical_Keyword_32 Medical_History_3 Medical_Keyword_20 Product_Info_7 InsuredInfo_3 Medical_Keyword_45 Medical_History_19
0 0.433625 0.41403 0.403152 0.396987 0.383789 0.361736 0.35841 0.356771 0.354639 0.333922 0.326246 0.316473 0.298751 0.279796 0.279796 0.276195 0.264136 0.245505 0.238643 0.221867 0.21643 0.204544 0.195071 0.177664 0.17469 0.170483 0.164334 0.162137 0.154817 0.14683 0.14485 0.14451 0.144213 0.141103 0.129232 0.126469 0.125934 0.125789 0.123336 0.108849 0.104032 0.103753 0.102677 0.102442 0.094939 0.093885 0.0879 0.082683 0.079914 0.077845 0.075528 0.075011 0.073219 0.069341 0.069146 0.063523 0.058789 0.057889 0.057558 0.056355 0.055024 0.053924 0.053666 0.05331 0.05218 0.051943 0.051066 0.049631 0.049363 0.049126 0.048663 0.048582 0.048403 0.048249 0.047336 0.047112 0.044968 0.044362 0.04308 0.042624 0.04262 0.041896 0.041216 0.040921 0.040693 0.03978 0.039269 0.039158 0.038987 0.037907 0.036938 0.036634 0.035512 0.035447 0.032698 0.031829 0.030685 0.030024 0.0296 0.028857 0.028353 0.028307 0.026915 0.025469 0.024966 0.023957 0.023309 0.02269 0.02265 0.022422 0.022342 0.02063 0.020348 0.019899 0.019804 0.019799 0.019696 0.019621 0.019282 0.018793 0.018254 0.018178 0.018109 0.017825 0.01772 0.01735 0.015668 0.014215 0.013843 0.012296 0.012061 0.009104 0.008965 0.00883 0.00795 0.007942 0.007336 0.005564 0.004622 0.00388 0.003785 0.00361 0.003321 0.002213 0.000294
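For a quicker read of this ranking, a bar-plot sketch of the 20 strongest correlations (plotting is not in the original run; plt is already imported):

# Sketch: horizontal bar plot of the top-20 absolute correlations
ser_corr = (df.drop(cols_drop, axis=1)
              .corrwith(df[target8]).abs()
              .sort_values(ascending=False)
              .head(20))
ser_corr.sort_values().plot.barh(figsize=(8, 6))
plt.xlabel('|correlation| with Response8')
plt.tight_layout()
plt.show()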

Create features using unsupervised clustering algorithms

In [370]:
cols_drop = ['Id','Response','Response8']
cols_high_corr = (df.drop(cols_drop,axis=1)
                    .corrwith(df[target8]).abs()
                    .sort_values(ascending=False)
                    .head(20)
                    .index.to_list()
                    )

print(cols_high_corr)
['BMI', 'Wt', 'BMI_sq', 'risk_extreme', 'Wt_sq', 'risk_at_least_one_high', 'BMI_cu', 'Age_Wt', 'Age_BMI', 'Wt_cu', 'Medical_History_23', 'Medical_Keyword_15', 'Medical_Keyword_std', 'Medical_Keyword_mean', 'Medical_Keyword_count', 'Medical_History_4', 'Medical_Keyword_max', 'risk_at_least_one_low', 'Age_Ht', 'Ins_Age']
In [371]:
df_high_corr = df[cols_high_corr]
df_high_corr.head(2)
Out[371]:
BMI Wt BMI_sq risk_extreme Wt_sq risk_at_least_one_high BMI_cu Age_Wt Age_BMI Wt_cu Medical_History_23 Medical_Keyword_15 Medical_Keyword_std Medical_Keyword_mean Medical_Keyword_count Medical_History_4 Medical_Keyword_max risk_at_least_one_low Age_Ht Ins_Age
0 0.323008 0.148536 0.104334 1 0.022063 1 0.033701 0.095329 0.207304 0.003277 3 0 0.0 0.0 0 1 0 1 0.373406 0.641791
1 0.272288 0.131799 0.074141 0 0.017371 0 0.020188 0.007869 0.016256 0.002289 3 0 0.0 0.0 0 1 0 1 0.035821 0.059701

Gaussian Mixture

In [372]:
from sklearn.mixture import GaussianMixture
In [373]:
# use one mixture component per selected feature (20 components here)
model = GaussianMixture(n_components=len(cols_high_corr), random_state=SEED, reg_covar=1e-3)
model.fit(df_high_corr)

df['cluster_gmix20'] = model.predict(df_high_corr)
In [374]:
df.filter(regex='cluster').head(2)
Out[374]:
cluster_gmix20
0 5
1 10
In [375]:
gc.collect()
Out[375]:
64529
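The choice of 20 components simply mirrors the number of selected features. A sketch of how one could sanity-check that choice with BIC (not run in the original notebook; lower BIC is better):

# Sketch: compare BIC across a few component counts
for k in (5, 10, 20, 30):
    gm = GaussianMixture(n_components=k, random_state=SEED, reg_covar=1e-3)
    gm.fit(df_high_corr)
    print(k, round(gm.bic(df_high_corr), 1))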

Birch Clustering

In [376]:
from sklearn.cluster import Birch
In [377]:
model = Birch(n_clusters=len(cols_high_corr))
model
Out[377]:
Birch(n_clusters=20)
In [378]:
df['cluster_birch20'] = model.fit(df_high_corr).labels_

df.filter(regex='cluster').head(2)
Out[378]:
cluster_gmix20 cluster_birch20
0 5 13
1 10 13
In [379]:
gc.collect()
Out[379]:
20

Two-Feature Birch Clusterings

In [380]:
model_birch2 = Birch(n_clusters=2)
model_birch2
Out[380]:
Birch(n_clusters=2)
In [381]:
bmi_cols = ['Ins_Age','Wt','Ht',
            'Medical_Keyword_15',
            'Medical_Keyword_count']

# split each (BMI, col) pair into 2 clusters
for col in bmi_cols:
    dfx = df[['BMI',col]].reset_index(drop=True)
    df[f'cluster2_BMI_vs_{col}'] = model_birch2.fit(dfx).labels_
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/cluster/_birch.py:649: ConvergenceWarning:

Number of subclusters found (1) by Birch is less than (2). Decrease the threshold.

(this warning was raised four times, once for each pair that collapsed to a single subcluster)

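The warning means Birch's CF-tree merged everything into a single subcluster for four of the five pairs, so those label columns are constant. Lowering the threshold parameter (default 0.5) makes the tree split more readily; a sketch on one pair:

# Sketch: a smaller threshold may avoid the degenerate single-cluster result
model_birch2_fine = Birch(n_clusters=2, threshold=0.1)
labels = model_birch2_fine.fit(df[['BMI', 'Ins_Age']]).labels_
print(pd.Series(labels).value_counts())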
In [382]:
cols_first = ['Medical_History_4','Product_Info_4','Age_BMI','Age_BMI']
cols_second = ['Medical_History_23','InsuredInfo_6','Age_Ht','Age_Wt']

# split each (col1, col2) pair into 2 clusters
for col1, col2 in zip(cols_first, cols_second):
    dfx = df[[col1,col2]].reset_index(drop=True)
    df[f'cluster2_{col1}_vs_{col2}'] = model_birch2.fit(dfx).labels_
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/cluster/_birch.py:649: ConvergenceWarning:

Number of subclusters found (1) by Birch is less than (2). Decrease the threshold.

(raised twice, once for each Age_BMI pair)
In [383]:
cols_risk_cluster = ['risk_at_least_one_low',
                     'risk_at_least_one_high',
                     'risk_extreme']

cols_risk_cluster_second = ['Medical_History_23', 'Medical_History_4',
                    'Medical_Keyword_15','Medical_Keyword_count',
                    'Product_Info_4','InsuredInfo_6','BMI']

# split every (risk flag, feature) pair into 2 clusters
for col1 in cols_risk_cluster:
    for col2 in cols_risk_cluster_second:
        dfx = df[[col1,col2]].reset_index(drop=True)
        df[f'cluster2_{col1}_vs_{col2}'] = model_birch2.fit(dfx).labels_
In [384]:
df.filter(regex='cluster').head(2)
Out[384]:
cluster_gmix20 cluster_birch20 cluster2_BMI_vs_Ins_Age cluster2_BMI_vs_Wt cluster2_BMI_vs_Ht cluster2_BMI_vs_Medical_Keyword_15 cluster2_BMI_vs_Medical_Keyword_count cluster2_Medical_History_4_vs_Medical_History_23 cluster2_Product_Info_4_vs_InsuredInfo_6 cluster2_Age_BMI_vs_Age_Ht cluster2_Age_BMI_vs_Age_Wt cluster2_risk_at_least_one_low_vs_Medical_History_23 cluster2_risk_at_least_one_low_vs_Medical_History_4 cluster2_risk_at_least_one_low_vs_Medical_Keyword_15 cluster2_risk_at_least_one_low_vs_Medical_Keyword_count cluster2_risk_at_least_one_low_vs_Product_Info_4 cluster2_risk_at_least_one_low_vs_InsuredInfo_6 cluster2_risk_at_least_one_low_vs_BMI cluster2_risk_at_least_one_high_vs_Medical_History_23 cluster2_risk_at_least_one_high_vs_Medical_History_4 cluster2_risk_at_least_one_high_vs_Medical_Keyword_15 cluster2_risk_at_least_one_high_vs_Medical_Keyword_count cluster2_risk_at_least_one_high_vs_Product_Info_4 cluster2_risk_at_least_one_high_vs_InsuredInfo_6 cluster2_risk_at_least_one_high_vs_BMI cluster2_risk_extreme_vs_Medical_History_23 cluster2_risk_extreme_vs_Medical_History_4 cluster2_risk_extreme_vs_Medical_Keyword_15 cluster2_risk_extreme_vs_Medical_Keyword_count cluster2_risk_extreme_vs_Product_Info_4 cluster2_risk_extreme_vs_InsuredInfo_6 cluster2_risk_extreme_vs_BMI
0 5 13 0 0 0 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1
1 10 13 0 0 0 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 0
In [385]:
# check whether any cluster feature is constant
# (these are exactly the pairs where Birch found a single subcluster above)
df.filter(regex='cluster').nunique().loc[lambda x: x==1]
Out[385]:
cluster2_BMI_vs_Ins_Age               1
cluster2_BMI_vs_Wt                    1
cluster2_BMI_vs_Ht                    1
cluster2_BMI_vs_Medical_Keyword_15    1
cluster2_Age_BMI_vs_Age_Ht            1
cluster2_Age_BMI_vs_Age_Wt            1
dtype: int64
In [386]:
cols_drop = df.filter(regex='cluster').nunique().loc[lambda x: x==1].index.to_list()
print(cols_drop)
['cluster2_BMI_vs_Ins_Age', 'cluster2_BMI_vs_Wt', 'cluster2_BMI_vs_Ht', 'cluster2_BMI_vs_Medical_Keyword_15', 'cluster2_Age_BMI_vs_Age_Ht', 'cluster2_Age_BMI_vs_Age_Wt']
In [387]:
df = df.drop(cols_drop,axis=1)
df.nunique().loc[lambda x: x==1]
Out[387]:
Series([], dtype: int64)

Remove Highly Correlated Features

In [388]:
import json
In [389]:
df.shape
Out[389]:
(59381, 179)
In [390]:
def get_high_correlated_features(df, thr=0.95):
    """Return the set of columns whose absolute correlation with an
    earlier, kept column is at least `thr`."""
    cols_corr = set()
    df_corr = df.corr()
    for i in range(len(df_corr.columns)):
        for j in range(i):
            # flag column i only if its partner j has not itself been flagged
            if (abs(df_corr.iloc[i, j]) >= thr) and (df_corr.columns[j] not in cols_corr):
                colname = df_corr.columns[i]
                cols_corr.add(colname)

    return cols_corr
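A vectorized sketch of nearly the same idea, using only the upper triangle of the correlation matrix (np is already imported); note it does not skip already-flagged partners, so it can flag slightly more columns than the loop above:

# Sketch: flag any column whose |corr| with an earlier column is >= thr
corr_abs = df.corr().abs()
upper = corr_abs.where(np.triu(np.ones(corr_abs.shape, dtype=bool), k=1))
cols_flagged = [c for c in upper.columns if (upper[c] >= 0.95).any()]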
In [391]:
# note: the name cols_high_corr is reused here; it now holds the features to drop
cols_high_corr = get_high_correlated_features(df,thr=0.95)
print(cols_high_corr)
{'Medical_History_36', 'Insurance_History_5', 'cluster2_risk_extreme_vs_Medical_History_4', 'cluster2_Product_Info_4_vs_InsuredInfo_6', 'cluster2_risk_at_least_one_high_vs_BMI', 'Medical_History_26', 'Medical_Keyword_23', 'cluster2_risk_at_least_one_low_vs_Product_Info_4', 'cluster2_Medical_History_4_vs_Medical_History_23', 'Age_BMI', 'Ht_cu', 'Age_Ht', 'cluster2_risk_at_least_one_low_vs_BMI', 'cluster2_risk_at_least_one_high_vs_Product_Info_4', 'cluster2_risk_extreme_vs_Product_Info_4', 'cluster2_risk_extreme_vs_Medical_History_23', 'Ins_Age_sq', 'cluster2_risk_at_least_one_high_vs_InsuredInfo_6', 'Medical_Keyword_mean', 'cluster2_risk_extreme_vs_Medical_Keyword_15', 'Medical_Keyword_11', 'cluster2_risk_at_least_one_low_vs_Medical_History_23', 'Ht_sq', 'cluster2_risk_extreme_vs_Medical_Keyword_count', 'cluster2_risk_extreme_vs_BMI', 'cluster2_risk_at_least_one_high_vs_Medical_History_4', 'cluster2_risk_at_least_one_high_vs_Medical_Keyword_15', 'cluster2_risk_at_least_one_low_vs_InsuredInfo_6', 'BMI_sq', 'Insurance_History_7', 'cluster2_risk_at_least_one_high_vs_Medical_History_23', 'Wt_sq', 'cluster2_risk_extreme_vs_InsuredInfo_6', 'cluster2_risk_at_least_one_low_vs_Medical_History_4', 'Medical_Keyword_48', 'Medical_Keyword_kurtosis', 'Insurance_History_9', 'cluster2_risk_at_least_one_low_vs_Medical_Keyword_15'}
In [392]:
print(len(cols_high_corr))
38
In [393]:
out_json = out_dir + 'correlated_features.json'
print(out_json)
../outputs/Prudential/correlated_features.json
In [394]:
with open(out_json,'w') as fo:
    # save the feature names themselves (the original dumped the filename string);
    # a set is not JSON-serializable, so convert to a sorted list
    json.dump(sorted(cols_high_corr), fo)
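A quick round-trip check of what was written (assumes the dump of feature names above):

# Sketch: reload and verify the saved list
with open(out_json) as fi:
    assert set(json.load(fi)) == set(cols_high_corr)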
In [395]:
df = df.drop(cols_high_corr,axis=1)
df.shape
Out[395]:
(59381, 141)

Time Taken

In [396]:
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 1 min 34 secs
In [ ]: