In this project we use multiclass classification to predict one of the 8 possible values of Response.
The data is taken from the Kaggle Prudential Life Insurance Assessment competition.
Only about 40% of households in the USA have a life insurance policy. Based on an applicant's attributes, one of 8 different quotes is granted to each applicant.
Here category 8 has the highest count; I assume it is the quote that is granted.
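As a quick sanity check of the class balance (a minimal sketch; uses the same data URL as the loading cell near the end of this notebook):

import pandas as pd
url = ('https://github.com/bhishanpdl/Datasets/blob/master/'
       'Prudential_Insurance/raw/train.csv.zip?raw=true')
df_raw = pd.read_csv(url, compression='zip')
# counts per target class; category 8 should have the highest count
print(df_raw['Response'].value_counts().sort_index())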
Records: 60k
Features: 127
Target: Response (has 8 categories, 1-8)
Features:
1 Misc              : Ins_Age, Ht, Wt, BMI        (4)
2 Product Info      : Product_Info_1 to 7         (7)
3 Employment Info   : Employment_Info_1 to 6      (6)
4 Insured Info      : InsuredInfo_1 to 7          (7)
5 Insurance History : Insurance_History_1 to 9    (9)
6 Family History    : Family_Hist_1 to 5          (5)
7 Medical History   : Medical_History_1 to 41     (41)
8 Medical Keywords  : Medical_Keyword_1 to 48     (48)
Target              : Response                    (1)
ID                  : Id                          (1)
---------------------------------------------------
Total Features: 127
Dependent Variable: 1 (Response)
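These group counts can be cross-checked directly from the column names (a minimal sketch; assumes df_raw from the snippet above):

import re
from collections import Counter
# strip the trailing _<number> to recover each group's prefix
prefixes = [re.sub(r'_\d+$', '', c) for c in df_raw.columns]
print(sorted(Counter(prefixes).items()))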
import time
time_start_notebook = time.time()
%%capture
# capture will not print in notebook
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ### mount google drive
    from google.colab import drive
    drive.mount('/content/drive')

    ### load the data dir
    dat_dir = 'drive/My Drive/Colab Notebooks/data/'
    sys.path.append(dat_dir)

    ### Image dir
    img_dir = 'drive/My Drive/Colab Notebooks/images/'
    if not os.path.isdir(img_dir): os.makedirs(img_dir)
    sys.path.append(img_dir)

    ### Output dir
    out_dir = 'drive/My Drive/Colab Notebooks/outputs/'
    if not os.path.isdir(out_dir): os.makedirs(out_dir)
    sys.path.append(out_dir)

    ### Also install my custom module
    module_dir = 'drive/My Drive/Colab Notebooks/Bhishan_Modules/'
    sys.path.append(module_dir)
    # NOTE: each ! command runs in its own subshell, so a separate !cd
    # does not persist; quote the path (it contains spaces) and install
    # in a single command instead.
    !pip install -e "drive/My Drive/Colab Notebooks/Bhishan_Modules/bhishan"

    #### print
    print('Environment: Google Colaboratory.')
# NOTE: If we update modules in gcolab, we need to restart runtime.
if ENV_COLAB:
    out_dir = 'drive/My Drive/Colab Notebooks/outputs'
else:
    out_dir = '../outputs'

out_dir = out_dir + '/Prudential/'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
out_dir
import numpy as np
import pandas as pd
import json
from sklearn.mixture import GaussianMixture
from sklearn.cluster import Birch
SEED = 100
# useful functions
def quantile_binning(ser):
    """Bin a numeric series into low/medium/high at the 25th and 75th percentiles."""
    conditions = [
        (ser <= ser.quantile(0.25)),
        (ser > ser.quantile(0.25)) & (ser <= ser.quantile(0.75)),
        (ser > ser.quantile(0.75))]
    choices = ['low', 'medium', 'high']
    return np.select(conditions, choices)
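For example, binning ten toy values with the helper above (a minimal sketch):

import pandas as pd
ser = pd.Series(range(1, 11))
print(quantile_binning(ser))
# ['low' 'low' 'low' 'medium' 'medium' 'medium' 'medium' 'high' 'high' 'high']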
# when a feature's value is 'medium', the applicant may be less risky
def risk_medium_bool(df, col):
    """Return 0 where the binned value is 'medium' (lower risk), else 1."""
    cond = (df[col] == 'medium')
    return np.where(cond, 0, 1)
# if at least one body measure is in the given bin ==> possible denial
def risk_at_least_one(df, val):
    """Return 1 where any of the four binned body measures equals val, else 0."""
    cond = ((df['Age_cat'] == val) |
            (df['Ht_cat'] == val) |
            (df['Wt_cat'] == val) |
            (df['BMI_cat'] == val)
            )
    return np.where(cond, 1, 0)
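A quick illustration of both risk helpers on hypothetical binned values:

import pandas as pd
toy = pd.DataFrame({'Age_cat': ['low', 'medium', 'high'],
                    'Ht_cat':  ['medium', 'medium', 'medium'],
                    'Wt_cat':  ['medium', 'medium', 'low'],
                    'BMI_cat': ['medium', 'medium', 'high']})
print(risk_medium_bool(toy, 'BMI_cat'))  # [0 0 1]
print(risk_at_least_one(toy, 'high'))    # [0 0 1]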
#================== Variables =====================================
cols_missing_high = ['Medical_History_10',
'Medical_History_32',
'Medical_History_24']
cols_discrete = ['Medical_History_1', 'Medical_History_10',
'Medical_History_15', 'Medical_History_24',
'Medical_History_32']
cols_discrete = [i for i in cols_discrete if i not in cols_missing_high]
cols_cat = ['Product_Info_1', 'Product_Info_2','Product_Info_3',
'Product_Info_5','Product_Info_6', 'Product_Info_7',
'Employment_Info_2','Employment_Info_3','Employment_Info_5',
'InsuredInfo_1','InsuredInfo_2', 'InsuredInfo_3','InsuredInfo_4',
'InsuredInfo_5','InsuredInfo_6', 'InsuredInfo_7',
'Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3',
'Insurance_History_4','Insurance_History_7', 'Insurance_History_8',
'Insurance_History_9',
'Family_Hist_1',
'Medical_History_2',
'Medical_History_3','Medical_History_4', 'Medical_History_5',
'Medical_History_6', 'Medical_History_7','Medical_History_8',
'Medical_History_9', 'Medical_History_11', 'Medical_History_12',
'Medical_History_13', 'Medical_History_14', 'Medical_History_16',
'Medical_History_17', 'Medical_History_18', 'Medical_History_19',
'Medical_History_20', 'Medical_History_21','Medical_History_22',
'Medical_History_23','Medical_History_25', 'Medical_History_26',
'Medical_History_27', 'Medical_History_28', 'Medical_History_29',
'Medical_History_30', 'Medical_History_31', 'Medical_History_33',
'Medical_History_34', 'Medical_History_35', 'Medical_History_36',
'Medical_History_37', 'Medical_History_38', 'Medical_History_39',
'Medical_History_40', 'Medical_History_41']
cols_cat = [i for i in cols_cat if i not in ['Product_Info_2']]  # Product_Info_2 is split into char/num parts in get_clean_data
risk_cat = ['Age_cat','Ht_cat','Wt_cat','BMI_cat']
cols_cat_all = cols_cat + cols_discrete + risk_cat + ['Product_Info_2_char']
#===================== Data cleaning =======================
def get_clean_data(df):
    df = df.copy()

    # create new features
    df['Product_Info_2_char'] = df['Product_Info_2'].str[0]
    df['Product_Info_2_num'] = df['Product_Info_2'].str[1].astype(int)
    df = df.drop('Product_Info_2', axis=1)

    # continuous features: squares, cubes, and interactions
    df['Ins_Age_sq'] = df['Ins_Age'] * df['Ins_Age']
    df['Ht_sq'] = df['Ht'] * df['Ht']
    df['Wt_sq'] = df['Wt'] * df['Wt']
    df['BMI_sq'] = df['BMI'] * df['BMI']
    df['Ins_Age_cu'] = df['Ins_Age'] * df['Ins_Age'] * df['Ins_Age']
    df['Ht_cu'] = df['Ht'] * df['Ht'] * df['Ht']
    df['Wt_cu'] = df['Wt'] * df['Wt'] * df['Wt']
    df['BMI_cu'] = df['BMI'] * df['BMI'] * df['BMI']
    df['Age_Ht'] = df['Ins_Age'] * df['Ht']
    df['Age_Wt'] = df['Ins_Age'] * df['Wt']
    df['Age_BMI'] = df['Ins_Age'] * df['BMI']

    # continuous variable quantile binning
    df['Age_cat'] = quantile_binning(df['Ins_Age'])
    df['Ht_cat'] = quantile_binning(df['Ht'])
    df['Wt_cat'] = quantile_binning(df['Wt'])
    df['BMI_cat'] = quantile_binning(df['BMI'])

    # aggregation statistics over the 48 binary medical keywords
    cols_med_kw = df.filter(regex='Medical_Keyword').columns
    df['Medical_Keyword_count'] = df[cols_med_kw].sum(axis=1)
    df['Medical_Keyword_max'] = df[cols_med_kw].max(axis=1)
    df['Medical_Keyword_mean'] = df[cols_med_kw].mean(axis=1)
    df['Medical_Keyword_std'] = df[cols_med_kw].std(axis=1)
    df['Medical_Keyword_skew'] = df[cols_med_kw].skew(axis=1)
    df['Medical_Keyword_kurtosis'] = df[cols_med_kw].kurtosis(axis=1)

    # risk is medium or not
    df['risk_BMI_medium_bool'] = risk_medium_bool(df, 'BMI_cat')

    # at least one body measure in the low/high bin ==> possible risk
    df['risk_at_least_one_low'] = risk_at_least_one(df, 'low')
    df['risk_at_least_one_high'] = risk_at_least_one(df, 'high')

    # extreme risk
    cond = ((df['Age_cat'] == 'high') |
            (df['Wt_cat'] == 'high') |
            (df['BMI_cat'] == 'high'))
    df['risk_extreme'] = np.where(cond, 1, 0)
    #================== Missing values imputation =======================
    df['nan_count'] = df.isna().sum(axis=1)

    # remove features with >= 80% missing values
    cols_missing_high = ['Medical_History_10',
                         'Medical_History_32',
                         'Medical_History_24']
    df = df.drop(cols_missing_high, axis=1)
    df = df.fillna(-1)

    #================== Categorical Feature encoding ====================
    # collapse rare categories (frequency below the cutoff) into -2
    col = 'Medical_History_1'
    top_col = df[col].value_counts()[lambda x: x > 10].index
    df.loc[~df[col].isin(top_col), col] = -2

    col = 'Medical_History_2'
    top_col = df[col].value_counts()[lambda x: x > 100].index
    df.loc[~df[col].isin(top_col), col] = -2

    # It is better to save the data without one-hot encoding.
    # We save the feature names as JSON so that we can apply OHE later.
    # df = pd.get_dummies(df, columns=cols_cat_all, drop_first=True)
    #================== Cluster Features ====================
    target = 'Response'
    cols_drop = ['Id', target]

    # top-20 features by absolute correlation with the target
    # (numeric_only skips engineered string columns such as Age_cat)
    cols_high_corr = (df.drop(cols_drop, axis=1)
                        .corrwith(df[target], numeric_only=True).abs()
                        .sort_values(ascending=False)
                        .head(20)
                        .index.to_list()
                      )
    df_high_corr = df[cols_high_corr]

    ## Gaussian mixture
    model = GaussianMixture(n_components=len(cols_high_corr),
                            random_state=SEED, reg_covar=1e-3)
    model.fit(df_high_corr)
    df['cluster_gmix20'] = model.predict(df_high_corr)

    ## Birch clustering
    model = Birch(n_clusters=len(cols_high_corr))
    df['cluster_birch20'] = model.fit(df_high_corr).labels_

    ## Two-feature clustering using Birch
    model_birch2 = Birch(n_clusters=2)
    bmi_cols = ['Ins_Age', 'Wt', 'Ht',
                'Medical_Keyword_15',
                'Medical_Keyword_count']
    for col in bmi_cols:
        dfx = df[['BMI', col]].reset_index(drop=True)
        df[f'cluster2_BMI_vs_{col}'] = model_birch2.fit(dfx).labels_

    cols_first = ['Medical_History_4', 'Product_Info_4', 'Age_BMI', 'Age_BMI']
    cols_second = ['Medical_History_23', 'InsuredInfo_6', 'Age_Ht', 'Age_Wt']
    for col1, col2 in zip(cols_first, cols_second):
        dfx = df[[col1, col2]].reset_index(drop=True)
        df[f'cluster2_{col1}_vs_{col2}'] = model_birch2.fit(dfx).labels_

    cols_risk_cluster = ['risk_at_least_one_low',
                         'risk_at_least_one_high', 'risk_extreme']
    cols_risk_cluster_second = ['Medical_History_23', 'Medical_History_4',
                                'Medical_Keyword_15', 'Medical_Keyword_count',
                                'Product_Info_4', 'InsuredInfo_6', 'BMI']
    for col1 in cols_risk_cluster:
        for col2 in cols_risk_cluster_second:
            dfx = df[[col1, col2]].reset_index(drop=True)
            df[f'cluster2_{col1}_vs_{col2}'] = model_birch2.fit(dfx).labels_

    # drop degenerate cluster features (only one unique label)
    cols_drop = df.filter(regex='cluster').nunique().loc[lambda x: x == 1].index.to_list()
    df = df.drop(cols_drop, axis=1)
    return df
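The rare-category collapse inside get_clean_data relies on pandas' callable indexing of value_counts; a toy illustration of the idiom (hypothetical values):

import pandas as pd
ser = pd.Series(['a'] * 5 + ['b'] * 3 + ['c'])
# keep categories occurring more than twice, map the rest to -2
top = ser.value_counts()[lambda x: x > 2].index
print(ser.where(ser.isin(top), -2).tolist())
# ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', -2]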
#================ Save the clean data ========
# load the data
df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/Prudential_Insurance/raw/train.csv.zip?raw=true',compression='zip')
df_clean = get_clean_data(df)
# create directory
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# save categorical features
file_features = out_dir + 'categorical_features.json'
with open(file_features, 'w') as fo:
    json.dump(cols_cat_all, fo)
# save clean data (zip-compressed csv; the .csv.zip name makes the format explicit)
file_data = out_dir + 'clean_data.csv.zip'
if os.path.isfile(file_data):
    os.remove(file_data)
df_clean.to_csv(file_data, index=False, compression='zip')
pd.set_option('display.max_rows', None)
df_clean.head()
df = pd.read_csv(file_data,compression='zip')
df.head()
with open(file_features) as fi:
    cols_cat = json.load(fi)
print(cols_cat)
cols_cat[0]
df[cols_cat].head()
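With the saved feature list in hand, one-hot encoding can be applied on demand (a minimal sketch mirroring the commented-out call inside get_clean_data):

df_ohe = pd.get_dummies(df, columns=cols_cat, drop_first=True)
df_ohe.shape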
df.isna().sum().loc[lambda x: x > 0]  # should be empty after fillna(-1)
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))