In this project we use multiclass classification to predict one of the 8 possible values of Response.
The data is taken from the Kaggle Prudential Life Insurance project.
Only about 40% of households in the USA have a life insurance policy. Based on different attributes of an applicant, one of 8 different quotes is granted.
Here category 8 has the highest count; I assume it is the quote that is granted.
Records: 60k
Features: 127
Target: Response (has 8 categories, 1-8)
Features:
1 Misc             : Ins_Age, Ht, Wt, BMI                        (4)
2 Product Info     : Product_Info_1 to Product_Info_7            (7)
3 Employment Info  : Employment_Info_1 to Employment_Info_6      (6)
4 Insured Info     : InsuredInfo_1 to InsuredInfo_7              (7)
5 Insurance History: Insurance_History_1 to Insurance_History_9  (9)
6 Family History   : Family_Hist_1 to Family_Hist_5              (5)
7 Medical History  : Medical_History_1 to Medical_History_41     (41)
8 Medical Keywords : Medical_Keyword_1 to Medical_Keyword_48     (48)
Target: Response (1)
ID    : Id (1)
---------------------------------------------------
Total Features: 127
Dependent Variable: 1 (Response)
import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True,font_scale=1.5)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# random state
SEED = 100
time_start_notebook = time.time()
# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 50)
import scipy
from scipy import stats
import IPython
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
# Google colab
%%capture
# capture will not print in notebook
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ### mount google drive
    from google.colab import drive
    drive.mount('/content/drive')
    ### load the data dir
    dat_dir = 'drive/My Drive/Colab Notebooks/data/'
    sys.path.append(dat_dir)
    ### Image dir
    img_dir = 'drive/My Drive/Colab Notebooks/images/'
    if not os.path.isdir(img_dir): os.makedirs(img_dir)
    sys.path.append(img_dir)
    ### Output dir
    out_dir = 'drive/My Drive/Colab Notebooks/outputs/'
    if not os.path.isdir(out_dir): os.makedirs(out_dir)
    sys.path.append(out_dir)
    ### Also install my custom module
    # each ! command runs in its own subshell, so a separate cd would not persist;
    # install the module with a single quoted path instead
    module_dir = 'drive/My Drive/Colab Notebooks/Bhishan_Modules/'
    sys.path.append(module_dir)
    !pip install -e "drive/My Drive/Colab Notebooks/Bhishan_Modules/bhishan"
    #### print
    print('Environment: Google Colaboratory.')
# NOTE: If we update modules in gcolab, we need to restart runtime.
if ENV_COLAB:
    out_dir = 'drive/My Drive/Colab Notebooks/outputs'
else:
    out_dir = '../outputs'
out_dir = out_dir + '/Prudential/'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
out_dir
import gc
from tqdm import tqdm
import functools # functools.lru_cache
from sklearn.mixture import GaussianMixture
from sklearn.cluster import Birch
import bhishan
from bhishan import bp
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999
%load_ext autoreload
%autoreload 2
%load_ext watermark
%watermark -a "Bhishan Poudel" -dvm
%watermark -iv
df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/Prudential_Insurance/raw/train.csv.zip?raw=true',compression='zip')
print(df.shape)
df.head()
target = 'Response'
from sklearn.model_selection import train_test_split
target = 'Response'
df_Xtrain_orig, df_Xtest, ser_ytrain_orig, ser_ytest = train_test_split(
df.drop(target,axis=1), df[target],
test_size=0.2, random_state=SEED, stratify=df[target])
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
df_Xtrain_orig, ser_ytrain_orig,
test_size=0.2, random_state=SEED, stratify=ser_ytrain_orig)
print(f"df : {df.shape}")
print(f"\ndf_Xtrain_orig : {df_Xtrain_orig.shape}")
print(f"ser_ytrain_orig: {ser_ytrain_orig.shape}")
print(f"\ndf_Xtrain : {df_Xtrain.shape}")
print(f"ser_ytrain : {ser_ytrain.shape}")
print(f"\ndf_Xvalid : {df_Xvalid.shape}")
print(f"ser_yvalid : {ser_yvalid.shape}")
print(f"\ndf_Xtest : {df_Xtest.shape}")
print(f"ser_ytest : {ser_ytest.shape}")
df_Xtrain_orig.head(2)
df.bp.plot_cat(target)
df.bp.plot_pareto(target)
"""
We can see class 8 has the highest distribution (33%), we can assume this as
the clean and accepted policy of insurance.
Other classes 1-7 can be assumed rejected or accepted with some conditions.
""";
df_desc = df.bp.get_column_descriptions(transpose=True)
df_desc
Here our target has 8 classes. Looking at the class distribution, we see that class 8 has the highest count, so we can treat it as 1 and all the other classes as 0.
cond = df[target] == 8
df['Response8'] = np.where(cond,1,0)
df.filter(regex='Resp').head()
df.shape
df.select_dtypes('number').shape
df.select_dtypes('object').head()
"""
One of the feature Product_Info_2 is object.
Create new features from this.
""";
df['Product_Info_2_char'] = df['Product_Info_2'].str[0]
df['Product_Info_2_num'] = df['Product_Info_2'].str[1].astype(int)
df.Product_Info_2_num.isna().sum() # it's good we don't have NaNs.
df.filter(regex='_Info_2').head(2)
df = df.drop('Product_Info_2',axis=1)
df.filter(regex='_Info_2').head(2)
"""
Some important continuous features:
- age bmi ht wt
"""
df['Ins_Age_sq'] = df['Ins_Age'] * df['Ins_Age']
df['Ht_sq'] = df['Ht'] * df['Ht']
df['Wt_sq'] = df['Wt'] * df['Wt']
df['BMI_sq'] = df['BMI'] * df['BMI']
df['Ins_Age_cu'] = df['Ins_Age'] * df['Ins_Age'] * df['Ins_Age']
df['Ht_cu'] = df['Ht'] * df['Ht'] * df['Ht']
df['Wt_cu'] = df['Wt'] * df['Wt'] * df['Wt']
df['BMI_cu'] = df['BMI'] * df['BMI'] * df['BMI']
df['Age_Ht'] = df['Ins_Age'] * df['Ht']
df['Age_Wt'] = df['Ins_Age'] * df['Wt']
df['Age_BMI'] = df['Ins_Age'] * df['BMI']
df.iloc[:2,-10:]
def quantile_binning(ser):
    conditions = [
        (ser <= ser.quantile(0.25)),
        (ser > ser.quantile(0.25)) & (ser <= ser.quantile(0.75)),
        (ser > ser.quantile(0.75))]
    choices = ['low', 'medium', 'high']
    return np.select(conditions, choices)
df['Age_cat'] = quantile_binning(df['Ins_Age'])
df['Ht_cat'] = quantile_binning(df['Ht'])
df['Wt_cat'] = quantile_binning(df['Wt'])
df['BMI_cat'] = quantile_binning(df['BMI'])
quantile_cat = df.filter(regex='_cat$').columns.to_list()
print(quantile_cat)
df.filter(regex='_cat$').head(2)
df_resp8_0 = df.loc[df['Response8']==0]
df_resp8_1 = df.loc[df['Response8']==1]
mapping = {'low':0, 'medium':1,'high':2}
target8 = "Response8"
plt.figure()
fig, axes = plt.subplots(2,2,figsize=(24,18))
for i,col in enumerate(quantile_cat):
    i+=1
    plt.subplot(2,2,i)
    x0 = df_resp8_0[col].replace(mapping).to_numpy()
    x1 = df_resp8_1[col].replace(mapping).to_numpy()
    a = x0 / np.dot(x0,x0)
    b = x1 / np.dot(x1,x1)
    # overlap = np.dot(a,b)
    sns.kdeplot(x0, bw=0.5, label=f"{target8} = 0", shade=1)
    sns.kdeplot(x1, bw=0.5, label=f"{target8} = 1", shade=1)
    plt.xlabel(col, fontsize=18)
    plt.legend(loc='upper right', fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()
sns.scatterplot(data=df,x='BMI',y='Wt',hue='Response8',alpha=1)
# when BMI is high and Wt is high, the applicant is most likely rejected.
# df.bp.plot_cat_cat('Age_cat','Response8')
# df.bp.plot_cat_cat('Ht_cat','Response8')
# df.bp.plot_cat_cat('Wt_cat','Response8')
df.bp.plot_cat_cat('BMI_cat','Response8')
"""
Look at the top right figure.
Age: when age is high/medium ==> more rejection
Ht : when height_cat is not useful. for all category of ht, distribution is different.
Wt : Wt==high is important feature (almost all rejected)
BMI: bmi==high is important feature (almost all rejected)
""";
cat = 'BMI_cat'
df1 = df[[cat,'Response8']]
df1 = df1[df1[cat]=='high']
df1['Response8'].value_counts(normalize=True)
pd.crosstab(df['BMI_cat'],df['Response8'],normalize='index')
df.filter(regex='_cat$').head(2)
# when a feature is medium, the applicant may be less risky.
def risk_medium_bool(key):
    cond = (df[key] == 'medium')
    return np.where(cond,0,1)
df['risk_Age_medium_bool'] = risk_medium_bool('Age_cat')
df['risk_Ht_medium_bool'] = risk_medium_bool('Ht_cat')
df['risk_Wt_medium_bool'] = risk_medium_bool('Wt_cat')
df['risk_BMI_medium_bool'] = risk_medium_bool('BMI_cat')
def risk_at_least_one(key):
    cond = ((df['Age_cat'] == key) |
            (df['Ht_cat'] == key) |
            (df['Wt_cat'] == key) |
            (df['BMI_cat'] == key))
    return np.where(cond,1,0)
df['risk_at_least_one_low'] = risk_at_least_one('low') # possible risk
df['risk_at_least_one_high'] = risk_at_least_one('high') # possible risk
cond = ((df['Age_cat'] == 'high') |
(df['Wt_cat'] == 'high') |
(df['BMI_cat'] == 'high'))
df['risk_extreme'] = np.where(cond,1,0)
df.filter(regex='_cat$|^risk_').head()
cols_risk_bool = df.filter(regex='^risk_.*_bool$').columns.to_list()
cols_risk_bool
df_resp8_0 = df.loc[df['Response8']==0]
df_resp8_1 = df.loc[df['Response8']==1]
target8 = "Response8"
plt.figure()
fig, axes = plt.subplots(2,2,figsize=(24,18))
for i,col in enumerate(cols_risk_bool):
    i+=1
    plt.subplot(2,2,i)
    x0 = df_resp8_0[col]
    x1 = df_resp8_1[col]
    sns.kdeplot(x0, bw=0.5, label=f"{target8} = 0", shade=1)
    sns.kdeplot(x1, bw=0.5, label=f"{target8} = 1", shade=1)
    plt.xlabel(col, fontsize=18)
    plt.legend(loc='upper right', fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()
# drop unwanted columns
# risk_Age_medium_bool, risk_Ht_medium_bool and risk_Wt_medium_bool are not useful
df = df.drop(['risk_Age_medium_bool','risk_Ht_medium_bool', 'risk_Wt_medium_bool'],axis=1)
df.filter(regex='Med').head(2)
df.filter(regex='Medical_Key').head(2)
"""
There are 48 medical keyword features.
We will create new features based on few statistics such as min, max, mean, count
""";
cols_med_kw = df.filter(regex='Medical_Keyword').columns
cols_med_kw
df['Medical_Keyword_count'] = df[cols_med_kw].sum(axis=1)
df['Medical_Keyword_min'] = df[cols_med_kw].min(axis=1)
df['Medical_Keyword_max'] = df[cols_med_kw].max(axis=1)
df['Medical_Keyword_mean'] = df[cols_med_kw].mean(axis=1)
df['Medical_Keyword_std'] = df[cols_med_kw].std(axis=1)
df['Medical_Keyword_skew'] = df[cols_med_kw].skew(axis=1)
df['Medical_Keyword_kurtosis'] = df[cols_med_kw].kurtosis(axis=1)
df['Medical_Keyword_median'] = df[cols_med_kw].median(axis=1)
cols_med_kw_agg = df.filter(regex=r'^Medical_Keyword_\D').columns.to_list()
cols_med_kw_agg
df[cols_med_kw_agg].describe()
df[cols_med_kw_agg].nunique()
# drop features that have only one value
cols_drop = df[cols_med_kw_agg].nunique().loc[lambda x: x==1].index.to_list()
cols_drop
df = df.drop(cols_drop, axis=1)
There is a high risk of data leakage when using target-based features; however, we can always try them and see.
df[cols_med_kw].head(2)
# df[cols_med_kw].apply(lambda x: x.nunique())
# all these features are binary.
# df['target_min_med_kw'] = df.groupby(cols_med_kw.to_list())[target].transform('min')
# df['target_mean_med_kw'] = df.groupby(cols_med_kw.to_list())[target].transform('mean')
# I don't like target-based features.
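If one did want to try such features, an out-of-fold target mean reduces (but does not eliminate) the leakage risk. A minimal sketch, assuming 5 folds of scikit-learn's KFold and the Medical_Keyword_count feature created above; oof_mean and fold_means are illustrative names only:
from sklearn.model_selection import KFold
# illustrative sketch only (not used in the rest of the notebook):
# out-of-fold target mean of Medical_Keyword_count
oof_mean = pd.Series(np.nan, index=df.index)
for tr_idx, val_idx in KFold(n_splits=5, shuffle=True, random_state=SEED).split(df):
    fold_means = df.iloc[tr_idx].groupby('Medical_Keyword_count')[target].mean()
    # values unseen in the training fold map to NaN
    oof_mean.iloc[val_idx] = df['Medical_Keyword_count'].iloc[val_idx].map(fold_means).to_numpy()
oof_mean.head()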
df.isna().sum().sum()
df['nan_count'] = df.isna().sum(axis=1)
df.iloc[:2,-2:]
df.bp.missing()
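Before hard-coding the high-missing columns in the next cell, the per-column missing fraction can be checked directly (a small sketch; missing_frac is an illustrative name and the 0.80 cutoff mirrors the comment below):
# sketch: fraction of missing values per column, highest first
missing_frac = df.isna().mean().sort_values(ascending=False)
missing_frac[missing_frac >= 0.80]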
# remove features with >= 80% missing values
cols_missing_high = ['Medical_History_10',
'Medical_History_32',
'Medical_History_24']
df = df.drop(cols_missing_high,axis=1)
# columns with a smaller fraction of missing values; these are kept and filled with -1 below
cols_missing_low = ['Medical_History_15', 'Family_Hist_5',
                    'Family_Hist_3', 'Family_Hist_2',
                    'Insurance_History_5', 'Family_Hist_4',
                    'Employment_Info_6', 'Medical_History_1',
                    'Employment_Info_4', 'Employment_Info_1']
df = df.fillna(-1)
df.isna().sum().sum()
The following variables are all categorical (nominal):
Product_Info_1, Product_Info_2, Product_Info_3, Product_Info_5, Product_Info_6, Product_Info_7, Employment_Info_2, Employment_Info_3, Employment_Info_5, InsuredInfo_1, InsuredInfo_2, InsuredInfo_3, InsuredInfo_4, InsuredInfo_5, InsuredInfo_6, InsuredInfo_7, Insurance_History_1, Insurance_History_2, Insurance_History_3, Insurance_History_4, Insurance_History_7, Insurance_History_8, Insurance_History_9, Family_Hist_1, Medical_History_2, Medical_History_3, Medical_History_4, Medical_History_5, Medical_History_6, Medical_History_7, Medical_History_8, Medical_History_9, Medical_History_11, Medical_History_12, Medical_History_13, Medical_History_14, Medical_History_16, Medical_History_17, Medical_History_18, Medical_History_19, Medical_History_20, Medical_History_21, Medical_History_22, Medical_History_23, Medical_History_25, Medical_History_26, Medical_History_27, Medical_History_28, Medical_History_29, Medical_History_30, Medical_History_31, Medical_History_33, Medical_History_34, Medical_History_35, Medical_History_36, Medical_History_37, Medical_History_38, Medical_History_39, Medical_History_40, Medical_History_41
The following variables are continuous:
Product_Info_4, Ins_Age, Ht, Wt, BMI, Employment_Info_1, Employment_Info_4, Employment_Info_6, Insurance_History_5, Family_Hist_2, Family_Hist_3, Family_Hist_4, Family_Hist_5
The following variables are discrete:
Medical_History_1, Medical_History_10, Medical_History_15, Medical_History_24, Medical_History_32
Medical_Keyword_1-48 are dummy variables.
Quantitative variables can be classified as discrete or continuous.
Categorical variable definition
Categorical variables contain a finite number of categories or distinct groups. Categorical data might not have a logical order. For example, categorical predictors include gender, material type, and payment method.
Discrete variable definition
Discrete variables are numeric variables that have a countable number of values between any two values. A discrete variable is always numeric. For example, the number of customer complaints or the number of flaws or defects.
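As a rough cross-check of these groupings, the numeric columns can be split by their number of distinct values (a heuristic sketch only; the cutoff of 10 is an arbitrary choice and the explicit lists above remain authoritative):
# heuristic sketch: split numeric columns by number of distinct values
n_unique = df.select_dtypes('number').nunique()
print("columns with <= 10 distinct values (categorical/discrete-like):", (n_unique <= 10).sum())
print("columns with  > 10 distinct values (continuous-like):", (n_unique > 10).sum())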
# df.dtypes.loc[lambda x: x=='object']
df.select_dtypes('object').head(2)
df['Product_Info_2_char'].nunique() # for a small number of categories we can use OHE (one-hot encoding)
cols_discrete = ['Medical_History_1', 'Medical_History_10',
'Medical_History_15', 'Medical_History_24',
'Medical_History_32']
cols_discrete = [i for i in cols_discrete if i not in cols_missing_high]
df[cols_discrete].head()
df[cols_discrete].nunique()
# df_copy = df.copy()
col = 'Medical_History_1'
# df[col].value_counts()
col = 'Medical_History_1'
# group rare values (count <= 10) into a single bucket coded as -2
top_col = df[col].value_counts()[lambda x: x> 10].index
df.loc[~df[col].isin(top_col),col] = -2
col = 'Medical_History_15'
# df[col].value_counts().head(200).reset_index()
# the value count is steadily decreasing, there is no obvious cut-off point.
cols_cat = ['Product_Info_1', 'Product_Info_2',
'Product_Info_3', 'Product_Info_5',
'Product_Info_6', 'Product_Info_7',
'Employment_Info_2', 'Employment_Info_3',
'Employment_Info_5', 'InsuredInfo_1',
'InsuredInfo_2', 'InsuredInfo_3',
'InsuredInfo_4', 'InsuredInfo_5',
'InsuredInfo_6', 'InsuredInfo_7',
'Insurance_History_1', 'Insurance_History_2',
'Insurance_History_3', 'Insurance_History_4',
'Insurance_History_7', 'Insurance_History_8',
'Insurance_History_9', 'Family_Hist_1',
'Medical_History_2', 'Medical_History_3',
'Medical_History_4', 'Medical_History_5',
'Medical_History_6', 'Medical_History_7',
'Medical_History_8', 'Medical_History_9',
'Medical_History_11', 'Medical_History_12',
'Medical_History_13', 'Medical_History_14',
'Medical_History_16', 'Medical_History_17',
'Medical_History_18', 'Medical_History_19',
'Medical_History_20', 'Medical_History_21',
'Medical_History_22', 'Medical_History_23',
'Medical_History_25', 'Medical_History_26',
'Medical_History_27', 'Medical_History_28',
'Medical_History_29', 'Medical_History_30',
'Medical_History_31', 'Medical_History_33',
'Medical_History_34', 'Medical_History_35',
'Medical_History_36', 'Medical_History_37',
'Medical_History_38', 'Medical_History_39',
'Medical_History_40', 'Medical_History_41']
cols_cat = [i for i in cols_cat if i not in ['Product_Info_2']]
# df[cols_cat].nunique()
df[cols_cat].nunique().loc[lambda x: x>10]
# df['Medical_History_2'].value_counts().head(200)
col = 'Medical_History_2'
# group rare values (count <= 100) into a single bucket coded as -2
top_col = df[col].value_counts()[lambda x: x> 100].index
df.loc[~df[col].isin(top_col),col] = -2
# df.nunique().loc[lambda x: x>90]
cols = df.nunique().loc[lambda x: x>90].index.to_list()
# df[cols].dtypes
df[cols].head(2)
risk_cat = ['Age_cat', 'Ht_cat', 'Wt_cat', 'BMI_cat']
cols_cat_all = cols_cat + cols_discrete + risk_cat + ['Product_Info_2_char']
print(f"Number of categorical features: {len(cols_cat_all)}")
# df[cols_cat].nunique()
print(f"Shape of df before encoding: {df.shape}")
df_encoded = pd.get_dummies(df,columns=cols_cat_all,drop_first=True)
print(f"Shape of df after encoding: {df_encoded.shape}")
df_encoded.select_dtypes('object').shape
df_encoded.sum().sum() # this needs to be a number (confirms all columns are numeric)
Here the numerical features are already normalized, so I will not log-transform or normalize them.
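A quick look at the min/max of a few continuous features (taken from the data description above) supports this; a small sketch:
# sketch: these continuous features are expected to lie roughly in [0, 1]
cols_cont = ['Product_Info_4', 'Ins_Age', 'Ht', 'Wt', 'BMI']
df[cols_cont].describe().loc[['min', 'max']]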
df.filter(regex='Resp').head(2)
df.head(2)
cols_drop = ['Id','Response','Response8']
(df.drop(cols_drop,axis=1)
.corrwith(df[target8]).abs()
.to_frame()
.sort_values([0],ascending=False).T)
cols_drop = ['Id','Response','Response8']
cols_high_corr = (df.drop(cols_drop,axis=1)
.corrwith(df[target8]).abs()
.sort_values(ascending=False)
.head(20)
.index.to_list()
)
print(cols_high_corr)
df_high_corr = df[cols_high_corr]
df_high_corr.head(2)
from sklearn.mixture import GaussianMixture
model = GaussianMixture(n_components=len(cols_high_corr), random_state=SEED, reg_covar=1e-3)
model.fit(df_high_corr)
df['cluster_gmix20'] = model.predict(df_high_corr)
df.filter(regex='cluster').head(2)
gc.collect()
from sklearn.cluster import Birch
model = Birch(n_clusters=len(cols_high_corr))
model
df['cluster_birch20'] = model.fit(df_high_corr).labels_
df.filter(regex='cluster').head(2)
gc.collect()
model_birch2 = Birch(n_clusters=2)
model_birch2
bmi_cols = ['Ins_Age','Wt','Ht',
'Medical_Keyword_15',
'Medical_Keyword_count']
for col in bmi_cols:
    dfx = df[['BMI',col]].reset_index(drop=True)
    dfx.columns = ['BMI',col]
    df[f'cluster2_BMI_vs_{col}'] = model_birch2.fit(dfx).labels_
cols_first = ['Medical_History_4','Product_Info_4','Age_BMI','Age_BMI']
cols_second = ['Medical_History_23','InsuredInfo_6','Age_Ht','Age_Wt']
for col1, col2 in zip(cols_first, cols_second):
    dfx = df[[col1,col2]].reset_index(drop=True)
    dfx.columns = [col1,col2]
    df[f'cluster2_{col1}_vs_{col2}'] = model_birch2.fit(dfx).labels_
cols_risk_cluster = ['risk_at_least_one_low',
'risk_at_least_one_high',
'risk_extreme']
cols_risk_cluster_second = ['Medical_History_23', 'Medical_History_4',
'Medical_Keyword_15','Medical_Keyword_count',
'Product_Info_4','InsuredInfo_6','BMI']
for col1 in cols_risk_cluster:
    for col2 in cols_risk_cluster_second:
        dfx = df[[col1,col2]].reset_index(drop=True)
        dfx.columns = [col1,col2]
        df[f'cluster2_{col1}_vs_{col2}'] = model_birch2.fit(dfx).labels_
df.filter(regex='cluster').head(2)
# check if the feature has only one value
df.filter(regex='cluster').nunique().loc[lambda x: x==1]
cols_drop = df.filter(regex='cluster').nunique().loc[lambda x: x==1].index.to_list()
print(cols_drop)
df = df.drop(cols_drop,axis=1)
df.nunique().loc[lambda x: x==1]
import json
df.shape
def get_high_correlated_features(df, thr=0.95):
    cols_corr = set()
    df_corr = df.corr()
    for i in range(len(df_corr.columns)):
        for j in range(i):
            if (abs(df_corr.iloc[i, j]) >= thr) and (df_corr.columns[j] not in cols_corr):
                colname = df_corr.columns[i]
                cols_corr.add(colname)
    return cols_corr
cols_high_corr = get_high_correlated_features(df,thr=0.95)
print(cols_high_corr)
print(len(cols_high_corr))
out_json = out_dir + 'correlated_features.json'
print(out_json)
with open(out_json,'w') as fo:
    # save the names of the correlated features (a set is not JSON-serializable, so convert it)
    json.dump(sorted(cols_high_corr), fo)
df = df.drop(cols_high_corr,axis=1)
df.shape
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))