Description

In this project we use multiclass classification to predict one of the 8 possible values of Response.

The data is taken from the Kaggle Prudential Life Insurance project.

Only about 40% of households in the USA have a life insurance policy. Based on various attributes of an applicant, one of 8 different quotes is granted.

Here category 8 has the highest count; I assume it is the quote that is granted.

Records: ~60k
Features: 127
Target: Response (8 categories, 1-8)

Features:

1 Misc             : Ins_Age, Ht, Wt, BMI        4
2 Product Info     : Product_Info_1 to 7         7
3 Employment Info  : Employment_Info_1 to 6      6
4 Insured Info     : InsuredInfo_1 to 7          7
5 Insurance History: Insurance_History_1 to 9    9
6 Family History   : Family_Hist_1 to 5          5
7 Medical History  : Medical_History_1 to 41    41
8 Medical Keywords : Medical_Keyword_1 to 48    48
Target: Response                                 1
Id    : Id                                       1
---------------------------------------------------
Total features: 127
Dependent variable: 1 (Response)
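
As a quick sanity check of the class balance, a minimal sketch (it loads the same zipped train.csv from GitHub that the cleaning cell below uses):

import pandas as pd

url = ('https://github.com/bhishanpdl/Datasets/blob/master/'
       'Prudential_Insurance/raw/train.csv.zip?raw=true')
df_raw = pd.read_csv(url, compression='zip')

# counts per Response category; category 8 has the highest count
print(df_raw['Response'].value_counts().sort_index())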
In [1]:
import time
time_start_notebook = time.time()
In [2]:
%%capture
# %%capture suppresses this cell's output in the notebook

import os
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    ### mount google drive
    from google.colab import drive
    drive.mount('/content/drive')

    ### load the data dir
    dat_dir = 'drive/My Drive/Colab Notebooks/data/'
    sys.path.append(dat_dir)

    ### Image dir
    img_dir = 'drive/My Drive/Colab Notebooks/images/'
    if not os.path.isdir(img_dir): os.makedirs(img_dir)
    sys.path.append(img_dir)

    ### Output dir
    out_dir = 'drive/My Drive/Colab Notebooks/outputs/'
    if not os.path.isdir(out_dir): os.makedirs(out_dir)
    sys.path.append(out_dir)

    ### Also install my custom module
    module_dir = 'drive/My Drive/Colab Notebooks/Bhishan_Modules/' 
    sys.path.append(module_dir)
    # %cd (not !cd) persists across lines and handles spaces in the path
    %cd "drive/My Drive/Colab Notebooks/Bhishan_Modules/"
    !pip install -e bhishan
    %cd -

    #### print
    print('Environment: Google Colaboratory.')

# NOTE: If we update modules in gcolab, we need to restart runtime.
In [3]:
if ENV_COLAB:
    out_dir = 'drive/My Drive/Colab Notebooks/outputs'
else:
    out_dir = '../outputs'
    
out_dir = out_dir + '/Prudential/'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

out_dir
Out[3]:
'../outputs/Prudential/'

Script for data cleaning

In [4]:
import numpy as np
import pandas as pd
import json
from sklearn.mixture import GaussianMixture
from sklearn.cluster import Birch

SEED = 100

# useful functions
def quantile_binning(ser):
    conditions = [
        (ser <= ser.quantile(0.25)),
        (ser > ser.quantile(0.25)) & (ser <= ser.quantile(0.75)),
        (ser > ser.quantile(0.75))]

    choices = ['low', 'medium', 'high']
    return np.select(conditions, choices)
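# Example (hypothetical values): quantile_binning(pd.Series([1, 2, 3, 10]))
# returns array(['low', 'medium', 'medium', 'high']):
# 1 <= q25 (1.75); 2 and 3 fall in (q25, q75]; 10 > q75 (4.75)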

# when a feature is in the 'medium' bin, the applicant may be less risky (0), else 1
def risk_medium_bool(df,col):
    cond = (df[col] == 'medium') 

    return np.where(cond,0,1)


# if at least one of the four bins equals val (e.g. 'high') ==> possible denial
def risk_at_least_one(df,val):
    cond = ((df['Age_cat'] == val) | 
            (df['Ht_cat']  == val) |
            (df['Wt_cat']  == val) |
            (df['BMI_cat'] == val)
    )

    return np.where(cond,1,0)
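# Example (hypothetical row): Age_cat='high' and the other three bins
# 'medium' -> risk_at_least_one(df, 'high') flags that row as 1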

#================== Variables =====================================
cols_missing_high = ['Medical_History_10', 
                     'Medical_History_32',
                     'Medical_History_24']

cols_discrete = ['Medical_History_1', 'Medical_History_10',
                     'Medical_History_15', 'Medical_History_24',
                     'Medical_History_32']

cols_discrete = [i for i in cols_discrete if i not in cols_missing_high]

cols_cat = ['Product_Info_1', 'Product_Info_2','Product_Info_3',
    'Product_Info_5','Product_Info_6', 'Product_Info_7',
    'Employment_Info_2','Employment_Info_3','Employment_Info_5',
    'InsuredInfo_1','InsuredInfo_2', 'InsuredInfo_3','InsuredInfo_4',
    'InsuredInfo_5','InsuredInfo_6', 'InsuredInfo_7', 
    'Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3',
    'Insurance_History_4','Insurance_History_7', 'Insurance_History_8',
    'Insurance_History_9',
    'Family_Hist_1',
    'Medical_History_2',
    'Medical_History_3','Medical_History_4', 'Medical_History_5', 
    'Medical_History_6', 'Medical_History_7','Medical_History_8',
    'Medical_History_9', 'Medical_History_11', 'Medical_History_12',
    'Medical_History_13', 'Medical_History_14', 'Medical_History_16',
    'Medical_History_17', 'Medical_History_18', 'Medical_History_19',
    'Medical_History_20', 'Medical_History_21','Medical_History_22',
    'Medical_History_23','Medical_History_25', 'Medical_History_26', 
    'Medical_History_27', 'Medical_History_28', 'Medical_History_29',
    'Medical_History_30', 'Medical_History_31', 'Medical_History_33',
    'Medical_History_34', 'Medical_History_35', 'Medical_History_36',
    'Medical_History_37', 'Medical_History_38', 'Medical_History_39', 
    'Medical_History_40', 'Medical_History_41']

cols_cat = [i for i in cols_cat if i not in ['Product_Info_2']]
risk_cat = ['Age_cat','Ht_cat','Wt_cat','BMI_cat']
cols_cat_all = cols_cat + cols_discrete + risk_cat + ['Product_Info_2_char']

#===================== Data cleaning =======================
def get_clean_data(df):
    df = df.copy()
    # create new features
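    # e.g. 'D3' is split into a letter part 'D' (Product_Info_2_char)
    # and a numeric part 3 (Product_Info_2_num)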
    df['Product_Info_2_char'] = df['Product_Info_2'].str[0]
    df['Product_Info_2_num'] = df['Product_Info_2'].str[1].astype(int)
    df = df.drop('Product_Info_2',axis=1)

    # continuous features
    df['Ins_Age_sq'] = df['Ins_Age'] * df['Ins_Age']
    df['Ht_sq'] = df['Ht'] * df['Ht']
    df['Wt_sq'] = df['Wt'] * df['Wt']
    df['BMI_sq'] = df['BMI'] * df['BMI']
    df['Ins_Age_cu'] = df['Ins_Age'] * df['Ins_Age'] * df['Ins_Age']
    df['Ht_cu'] = df['Ht'] * df['Ht'] * df['Ht']
    df['Wt_cu'] = df['Wt'] * df['Wt'] * df['Wt']
    df['BMI_cu'] = df['BMI'] * df['BMI'] * df['BMI']
    df['Age_Ht'] = df['Ins_Age'] * df['Ht']
    df['Age_Wt'] = df['Ins_Age'] * df['Wt']
    df['Age_BMI'] = df['Ins_Age'] * df['BMI']

    # quantile binning of continuous variables
    df['Age_cat'] = quantile_binning(df['Ins_Age'])
    df['Ht_cat'] = quantile_binning(df['Ht'])
    df['Wt_cat'] = quantile_binning(df['Wt'])
    df['BMI_cat'] = quantile_binning(df['BMI'])

    # aggregation statistics
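    # row-wise summaries over the 48 binary Medical_Keyword flags,
    # e.g. count = number of keywords present for an applicant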
    cols_med_kw = df.filter(regex='Medical_Keyword').columns
    df['Medical_Keyword_count'] = df[cols_med_kw].sum(axis=1)
    df['Medical_Keyword_max'] = df[cols_med_kw].max(axis=1)
    df['Medical_Keyword_mean'] = df[cols_med_kw].mean(axis=1)
    df['Medical_Keyword_std'] = df[cols_med_kw].std(axis=1)
    df['Medical_Keyword_skew'] = df[cols_med_kw].skew(axis=1)
    df['Medical_Keyword_kurtosis'] = df[cols_med_kw].kurtosis(axis=1)


    # risk is medium or not
    df['risk_BMI_medium_bool'] = risk_medium_bool(df, 'BMI_cat')

    # high risk
    df['risk_at_least_one_low'] = risk_at_least_one(df, 'low') # possible risk
    df['risk_at_least_one_high'] = risk_at_least_one(df, 'high') # possible risk

    # extreme risk
    cond = ((df['Age_cat'] == 'high') |
            (df['Wt_cat']  == 'high') |
            (df['BMI_cat'] == 'high'))
    df['risk_extreme'] = np.where(cond,1,0)

    #================== Missing values imputation =======================
    df['nan_count'] = df.isna().sum(axis=1)

    # remove features with >= 80% missing values
    cols_missing_high = ['Medical_History_10', 
                        'Medical_History_32',
                        'Medical_History_24']
    df = df.drop(cols_missing_high,axis=1)
    df = df.fillna(-1)

    #================== Categorical Feature encoding ====================
    col = 'Medical_History_1'
    top_col = df[col].value_counts()[lambda x: x> 10].index
    df.loc[~df[col].isin(top_col),col] = -2

    col = 'Medical_History_2'
    top_col = df[col].value_counts()[lambda x: x> 100].index
    df.loc[~df[col].isin(top_col),col] = -2

    # It is better to save the data without one-hot encoding.
    # We save the categorical feature names as JSON so we can apply OHE
    # later (see the sketch after the feature list is reloaded below).
    # df = pd.get_dummies(df,columns=cols_cat_all,drop_first=True)

    #================== Cluster Features ====================
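    # select the 20 features with the highest |correlation| to the
    # target and derive unsupervised cluster labels from them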
    target = 'Response'
    cols_drop = ['Id',target]
    cols_high_corr = (df.drop(cols_drop,axis=1)
                        .corrwith(df[target]).abs()
                        .sort_values(ascending=False)
                        .head(20)
                        .index.to_list()
                        )
    df_high_corr = df[cols_high_corr]

    ## Gaussian mixture
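    # reg_covar adds a small constant to each covariance diagonal to
    # keep it positive definite (helps with nearly constant features)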
    model = GaussianMixture(n_components=len(cols_high_corr),
                            random_state=SEED, reg_covar=1e-3)
    model.fit(df_high_corr)
    df['cluster_gmix20'] = model.predict(df_high_corr)

    ## Birch clustering
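    # Birch builds a compact CF-tree, then merges its subclusters
    # into n_clusters final clusters (20 here, one per feature)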
    model = Birch(n_clusters=len(cols_high_corr))
    df['cluster_birch20'] = model.fit(df_high_corr).labels_

    ## Two features clustering using Birch
    model_birch2 = Birch(n_clusters=2)

    bmi_cols = ['Ins_Age','Wt','Ht',
            'Medical_Keyword_15',
            'Medical_Keyword_count']

    for col in bmi_cols:
        dfx = df[['BMI',col]].reset_index(drop=True)
        dfx.columns = ['BMI',col]
        df[f'cluster2_BMI_vs_{col}'] = model_birch2.fit(dfx).labels_

    cols_first = ['Medical_History_4','Product_Info_4','Age_BMI','Age_BMI']
    cols_second = ['Medical_History_23','InsuredInfo_6','Age_Ht','Age_Wt']

    for col1, col2 in zip(cols_first, cols_second):
        dfx = df[[col1,col2]].reset_index(drop=True)
        dfx.columns = [col1,col2]

        df[f'cluster2_{col1}_vs_{col2}'] = model_birch2.fit(dfx).labels_

    cols_risk_cluster = ['risk_at_least_one_low',
                         'risk_at_least_one_high','risk_extreme']
    cols_risk_cluster_second = ['Medical_History_23', 'Medical_History_4',
                        'Medical_Keyword_15','Medical_Keyword_count',
                        'Product_Info_4','InsuredInfo_6','BMI']

    for col1 in cols_risk_cluster:
        for col2 in cols_risk_cluster_second:
            dfx = df[[col1,col2]].reset_index(drop=True)
            dfx.columns = [col1,col2]
            df[f'cluster2_{col1}_vs_{col2}'] = model_birch2.fit(dfx).labels_

    cols_drop = df.filter(regex='cluster').nunique().loc[lambda x: x==1].index.to_list()
    df = df.drop(cols_drop,axis=1)

    return df

#================ Save the clean data ========
# load the data
df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/Prudential_Insurance/raw/train.csv.zip?raw=true',compression='zip')
df_clean = get_clean_data(df)


# create directory
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# save categorical features
file_features = out_dir +'categorical_features.json'
with open(file_features,'w') as fo:
    json.dump(cols_cat_all,fo)


# save clean data
file_data = out_dir + 'clean_data.csv'
if os.path.isfile(file_data):
    os.remove(file_data)
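# NOTE: compression='zip' writes a zip archive even though the file
# name ends in .csv; read it back with compression='zip' as below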
df_clean.to_csv(file_data,index=False,compression='zip')
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/cluster/_birch.py:649: ConvergenceWarning: Number of subclusters found (1) by Birch is less than (2). Decrease the threshold.
  % (len(centroids), self.n_clusters), ConvergenceWarning)
(the same ConvergenceWarning is emitted six times, once per affected fit)

These warnings mean that some two-feature Birch fits found only a single subcluster, so their cluster labels are constant; such constant cluster columns are dropped at the end of get_clean_data by the nunique() == 1 filter.

Load the data from the saved path

In [5]:
pd.set_option('display.max_rows', None)
In [6]:
df_clean.head()
Out[6]:
Id Product_Info_1 Product_Info_3 Product_Info_4 Product_Info_5 Product_Info_6 Product_Info_7 Ins_Age Ht Wt ... cluster2_risk_at_least_one_high_vs_Product_Info_4 cluster2_risk_at_least_one_high_vs_InsuredInfo_6 cluster2_risk_at_least_one_high_vs_BMI cluster2_risk_extreme_vs_Medical_History_23 cluster2_risk_extreme_vs_Medical_History_4 cluster2_risk_extreme_vs_Medical_Keyword_15 cluster2_risk_extreme_vs_Medical_Keyword_count cluster2_risk_extreme_vs_Product_Info_4 cluster2_risk_extreme_vs_InsuredInfo_6 cluster2_risk_extreme_vs_BMI
0 2 1 10 0.076923 2 1 1 0.641791 0.581818 0.148536 ... 0 1 1 1 1 1 1 0 1 1
1 5 1 26 0.076923 2 3 1 0.059701 0.600000 0.131799 ... 1 1 0 1 1 1 1 1 1 0
2 6 1 26 0.076923 2 3 1 0.029851 0.745455 0.288703 ... 1 0 0 1 0 1 1 1 0 0
3 7 1 10 0.487179 2 3 1 0.164179 0.672727 0.205021 ... 1 1 0 1 0 1 1 1 1 0
4 8 1 26 0.230769 2 3 1 0.417910 0.654545 0.234310 ... 1 1 0 1 0 1 1 1 1 0

5 rows × 178 columns

In [7]:
df = pd.read_csv(file_data,compression='zip')
df.head()
Out[7]:
Id Product_Info_1 Product_Info_3 Product_Info_4 Product_Info_5 Product_Info_6 Product_Info_7 Ins_Age Ht Wt ... cluster2_risk_at_least_one_high_vs_Product_Info_4 cluster2_risk_at_least_one_high_vs_InsuredInfo_6 cluster2_risk_at_least_one_high_vs_BMI cluster2_risk_extreme_vs_Medical_History_23 cluster2_risk_extreme_vs_Medical_History_4 cluster2_risk_extreme_vs_Medical_Keyword_15 cluster2_risk_extreme_vs_Medical_Keyword_count cluster2_risk_extreme_vs_Product_Info_4 cluster2_risk_extreme_vs_InsuredInfo_6 cluster2_risk_extreme_vs_BMI
0 2 1 10 0.076923 2 1 1 0.641791 0.581818 0.148536 ... 0 1 1 1 1 1 1 0 1 1
1 5 1 26 0.076923 2 3 1 0.059701 0.600000 0.131799 ... 1 1 0 1 1 1 1 1 1 0
2 6 1 26 0.076923 2 3 1 0.029851 0.745455 0.288703 ... 1 0 0 1 0 1 1 1 0 0
3 7 1 10 0.487179 2 3 1 0.164179 0.672727 0.205021 ... 1 1 0 1 0 1 1 1 1 0
4 8 1 26 0.230769 2 3 1 0.417910 0.654545 0.234310 ... 1 1 0 1 0 1 1 1 1 0

5 rows × 178 columns

In [8]:
with open(file_features) as fi:
    cols_cat = json.load(fi)

print(cols_cat)
['Product_Info_1', 'Product_Info_3', 'Product_Info_5', 'Product_Info_6', 'Product_Info_7', 'Employment_Info_2', 'Employment_Info_3', 'Employment_Info_5', 'InsuredInfo_1', 'InsuredInfo_2', 'InsuredInfo_3', 'InsuredInfo_4', 'InsuredInfo_5', 'InsuredInfo_6', 'InsuredInfo_7', 'Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3', 'Insurance_History_4', 'Insurance_History_7', 'Insurance_History_8', 'Insurance_History_9', 'Family_Hist_1', 'Medical_History_2', 'Medical_History_3', 'Medical_History_4', 'Medical_History_5', 'Medical_History_6', 'Medical_History_7', 'Medical_History_8', 'Medical_History_9', 'Medical_History_11', 'Medical_History_12', 'Medical_History_13', 'Medical_History_14', 'Medical_History_16', 'Medical_History_17', 'Medical_History_18', 'Medical_History_19', 'Medical_History_20', 'Medical_History_21', 'Medical_History_22', 'Medical_History_23', 'Medical_History_25', 'Medical_History_26', 'Medical_History_27', 'Medical_History_28', 'Medical_History_29', 'Medical_History_30', 'Medical_History_31', 'Medical_History_33', 'Medical_History_34', 'Medical_History_35', 'Medical_History_36', 'Medical_History_37', 'Medical_History_38', 'Medical_History_39', 'Medical_History_40', 'Medical_History_41', 'Medical_History_1', 'Medical_History_15', 'Age_cat', 'Ht_cat', 'Wt_cat', 'BMI_cat', 'Product_Info_2_char']
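
The cleaning step deliberately skipped one-hot encoding; here is a minimal sketch of applying it now, using the feature names just loaded from JSON (assumes df and cols_cat from the cells above):

# one dummy column per category level; drop_first drops the redundant
# (perfectly collinear) baseline column for each feature
df_ohe = pd.get_dummies(df, columns=cols_cat, drop_first=True)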
In [9]:
cols_cat[0]
Out[9]:
'Product_Info_1'
In [10]:
df[cols_cat].head()
Out[10]:
Product_Info_1 Product_Info_3 Product_Info_5 Product_Info_6 Product_Info_7 Employment_Info_2 Employment_Info_3 Employment_Info_5 InsuredInfo_1 InsuredInfo_2 ... Medical_History_39 Medical_History_40 Medical_History_41 Medical_History_1 Medical_History_15 Age_cat Ht_cat Wt_cat BMI_cat Product_Info_2_char
0 1 10 2 1 1 12 1 3 1 2 ... 3 3 3 4.0 240.0 high low low low D
1 1 26 2 3 1 1 3 2 1 2 ... 3 3 1 5.0 0.0 low low low low A
2 1 26 2 3 1 9 1 2 1 2 ... 3 3 1 10.0 -1.0 low medium medium medium E
3 1 10 2 3 1 9 1 3 2 2 ... 3 3 1 0.0 -1.0 low medium low low D
4 1 26 2 3 1 9 1 2 1 2 ... 3 3 1 -1.0 -1.0 medium low medium medium D

5 rows × 66 columns

In [11]:
df.isna().sum().loc[lambda x: x >0]
Out[11]:
Series([], dtype: int64)

Time Taken

In [12]:
time_taken = time.time() - time_start_notebook
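# e.g. time_taken = 63 s: divmod(63, 3600) == (0, 63) -> hours and
# leftover seconds; divmod(63, 60) == (1, 3) -> minutes and seconds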
h, rem = divmod(time_taken, 60*60)
print('Time taken to run whole notebook: {:.0f} hr '
      '{:.0f} min {:.0f} secs'.format(h, *divmod(rem, 60)))
Time taken to run whole notebook: 0 hr 1 min 3 secs