In this project we use multiclass classification to predict one of the 8 possible values of Response.
The data is taken from the Kaggle Prudential Life Insurance project.
Only about 40% of households in the USA have a life insurance policy. Based on different attributes of an applicant, one of 8 different quotes is granted.
Here category 8 has the highest count; I assume it is the quote that is granted.
Records: 60k
Features: 127
Target: Response (has 8 categories, 1-8)
Features:
1 Misc             : Ins_Age, Ht, Wt, BMI                        (4)
2 Product Info     : Product_Info_1 to Product_Info_7            (7)
3 Employment Info  : Employment_Info_1 to Employment_Info_6      (6)
4 Insured Info     : InsuredInfo_1 to InsuredInfo_7              (7)
5 Insurance History: Insurance_History_1 to Insurance_History_9  (9)
6 Family History   : Family_Hist_1 to Family_Hist_5              (5)
7 Medical History  : Medical_History_1 to Medical_History_41     (41)
8 Medical Keywords : Medical_Keyword_1 to Medical_Keyword_48     (48)
Target: Response (1)
ID    : Id (1)
---------------------------------------------------
Total Features: 127
Dependent Variable: 1 (Response)
import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True,font_scale=1.5)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# random state
SEED = 100
time_start_notebook = time.time()
# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 50)
import scipy
from scipy import stats
import IPython
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
# Google colab
%%capture
# capture will not print in notebook
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ### mount google drive
    from google.colab import drive
    drive.mount('/content/drive')
    ### load the data dir
    dat_dir = 'drive/My Drive/Colab Notebooks/data/'
    sys.path.append(dat_dir)
    ### Image dir
    img_dir = 'drive/My Drive/Colab Notebooks/images/'
    if not os.path.isdir(img_dir): os.makedirs(img_dir)
    sys.path.append(img_dir)
    ### Output dir
    out_dir = 'drive/My Drive/Colab Notebooks/outputs/'
    if not os.path.isdir(out_dir): os.makedirs(out_dir)
    sys.path.append(out_dir)
    ### Also install my custom module
    # each ! command runs in its own subshell, so a separate cd would not persist;
    # install the module with a single quoted path instead
    module_dir = 'drive/My Drive/Colab Notebooks/Bhishan_Modules/'
    sys.path.append(module_dir)
    !pip install -e "drive/My Drive/Colab Notebooks/Bhishan_Modules/bhishan"
    #### print
    print('Environment: Google Colaboratory.')
# NOTE: If we update modules in gcolab, we need to restart runtime.
if ENV_COLAB:
    out_dir = 'drive/My Drive/Colab Notebooks/outputs'
else:
    out_dir = '../outputs'
out_dir = out_dir + '/Prudential/'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
out_dir
import gc
from tqdm import tqdm
import functools # functools.lru_cache
from sklearn.mixture import GaussianMixture
from sklearn.cluster import Birch
import bhishan
from bhishan import bp
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999
%load_ext autoreload
%autoreload 2
%load_ext watermark
%watermark -a "Bhishan Poudel" -dvm
%watermark -iv
df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/Prudential_Insurance/raw/train.csv.zip?raw=true',compression='zip')
print(df.shape)
df.head()
target = 'Response'
from sklearn.model_selection import train_test_split
target = 'Response'
df_Xtrain_orig, df_Xtest, ser_ytrain_orig, ser_ytest = train_test_split(
df.drop(target,axis=1), df[target],
test_size=0.2, random_state=SEED, stratify=df[target])
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
df_Xtrain_orig, ser_ytrain_orig,
test_size=0.2, random_state=SEED, stratify=ser_ytrain_orig)
print(f"df : {df.shape}")
print(f"\ndf_Xtrain_orig : {df_Xtrain_orig.shape}")
print(f"ser_ytrain_orig: {ser_ytrain_orig.shape}")
print(f"\ndf_Xtrain : {df_Xtrain.shape}")
print(f"ser_ytrain : {ser_ytrain.shape}")
print(f"\ndf_Xvalid : {df_Xvalid.shape}")
print(f"ser_yvalid : {ser_yvalid.shape}")
print(f"\ndf_Xtest : {df_Xtest.shape}")
print(f"ser_ytest : {ser_ytest.shape}")
df_Xtrain_orig.head(2)
df.bp.plot_cat(target)
df.bp.plot_pareto(target)
"""
We can see class 8 has the highest distribution (33%), we can assume this as
the clean and accepted policy of insurance.
Other classes 1-7 can be assumed rejected or accepted with some conditions.
""";
df_desc = df.bp.get_column_descriptions(transpose=True)
df_desc
Here our target has 8 classes. Looking at the class distribution, we see that class 8 has the highest count, so we can treat it as 1 and all the other classes as 0.
cond = df[target] == 8
df['Response8'] = np.where(cond,1,0)
df.filter(regex='Resp').head()
df.shape
df.select_dtypes('number').shape
df.select_dtypes('object').head()
"""
One of the feature Product_Info_2 is object.
Create new features from this.
""";
df['Product_Info_2_char'] = df['Product_Info_2'].str[0]
df['Product_Info_2_num'] = df['Product_Info_2'].str[1].astype(int)
df.Product_Info_2_num.isna().sum() # it's good we don't have NaNs.
df.filter(regex='_Info_2').head(2)
df = df.drop('Product_Info_2',axis=1)
df.filter(regex='_Info_2').head(2)
"""
Some important continuous features:
- age bmi ht wt
"""
df['Ins_Age_sq'] = df['Ins_Age'] * df['Ins_Age']
df['Ht_sq'] = df['Ht'] * df['Ht']
df['Wt_sq'] = df['Wt'] * df['Wt']
df['BMI_sq'] = df['BMI'] * df['BMI']
df['Ins_Age_cu'] = df['Ins_Age'] * df['Ins_Age'] * df['Ins_Age']
df['Ht_cu'] = df['Ht'] * df['Ht'] * df['Ht']
df['Wt_cu'] = df['Wt'] * df['Wt'] * df['Wt']
df['BMI_cu'] = df['BMI'] * df['BMI'] * df['BMI']
df['Age_Ht'] = df['Ins_Age'] * df['Ht']
df['Age_Wt'] = df['Ins_Age'] * df['Wt']
df['Age_BMI'] = df['Ins_Age'] * df['BMI']
df.iloc[:2,-10:]
def quantile_binning(ser):
    conditions = [
        (ser <= ser.quantile(0.25)),
        (ser > ser.quantile(0.25)) & (ser <= ser.quantile(0.75)),
        (ser > ser.quantile(0.75))]
    choices = ['low', 'medium', 'high']
    return np.select(conditions, choices)
df['Age_cat'] = quantile_binning(df['Ins_Age'])
df['Ht_cat'] = quantile_binning(df['Ht'])
df['Wt_cat'] = quantile_binning(df['Wt'])
df['BMI_cat'] = quantile_binning(df['BMI'])
quantile_cat = df.filter(regex='_cat$').columns.to_list()
print(quantile_cat)
df.filter(regex='_cat$').head(2)
df_resp8_0 = df.loc[df['Response8']==0]
df_resp8_1 = df.loc[df['Response8']==1]
mapping = {'low':0, 'medium':1,'high':2}
target8 = "Response8"
plt.figure()
fig, axes = plt.subplots(2,2,figsize=(24,18))
for i,col in enumerate(quantile_cat):
    i+=1
    plt.subplot(2,2,i)
    x0 = df_resp8_0[col].replace(mapping).to_numpy()
    x1 = df_resp8_1[col].replace(mapping).to_numpy()
    a = x0 / np.dot(x0,x0)
    b = x1 / np.dot(x1,x1)
    # overlap = np.dot(a,b)
    sns.kdeplot(x0, bw=0.5, label=f"{target8} = 0", shade=1)
    sns.kdeplot(x1, bw=0.5, label=f"{target8} = 1", shade=1)
    plt.xlabel(col, fontsize=18)
    plt.legend(loc='upper right', fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()
sns.scatterplot(data=df,x='BMI',y='Wt',hue='Response8',alpha=1)
# when BMI is high and Wt is high, the applicant is most likely rejected.
# df.bp.plot_cat_cat('Age_cat','Response8')
# df.bp.plot_cat_cat('Ht_cat','Response8')
# df.bp.plot_cat_cat('Wt_cat','Response8')
df.bp.plot_cat_cat('BMI_cat','Response8')
"""
Look at the top right figure.
Age: when age is high/medium ==> more rejection
Ht : when height_cat is not useful. for all category of ht, distribution is different.
Wt : Wt==high is important feature (almost all rejected)
BMI: bmi==high is important feature (almost all rejected)
""";
cat = 'BMI_cat'
df1 = df[[cat,'Response8']]
df1 = df1[df1[cat]=='high']
df1['Response8'].value_counts(normalize=True)
pd.crosstab(df['BMI_cat'],df['Response8'],normalize='index')
df.filter(regex='_cat$').head(2)
# when a feature is medium, the applicant may be less risky.
def risk_medium_bool(key):
    cond = (df[key] == 'medium')
    return np.where(cond,0,1)
df['risk_Age_medium_bool'] = risk_medium_bool('Age_cat')
df['risk_Ht_medium_bool'] = risk_medium_bool('Ht_cat')
df['risk_Wt_medium_bool'] = risk_medium_bool('Wt_cat')
df['risk_BMI_medium_bool'] = risk_medium_bool('BMI_cat')
def risk_at_least_one(key):
    cond = ((df['Age_cat'] == key) |
            (df['Ht_cat'] == key) |
            (df['Wt_cat'] == key) |
            (df['BMI_cat'] == key))
    return np.where(cond,1,0)
df['risk_at_least_one_low'] = risk_at_least_one('low') # possible risk
df['risk_at_least_one_high'] = risk_at_least_one('high') # possible risk
cond = ((df['Age_cat'] == 'high') |
(df['Wt_cat'] == 'high') |
(df['BMI_cat'] == 'high'))
df['risk_extreme'] = np.where(cond,1,0)
df.filter(regex='_cat$|^risk_').head()
cols_risk_bool = df.filter(regex='^risk_.*_bool$').columns.to_list()
cols_risk_bool
df_resp8_0 = df.loc[df['Response8']==0]
df_resp8_1 = df.loc[df['Response8']==1]
target8 = "Response8"
plt.figure()
fig, axes = plt.subplots(2,2,figsize=(24,18))
for i,col in enumerate(cols_risk_bool):
    i+=1
    plt.subplot(2,2,i)
    x0 = df_resp8_0[col]
    x1 = df_resp8_1[col]
    sns.kdeplot(x0, bw=0.5, label=f"{target8} = 0", shade=1)
    sns.kdeplot(x1, bw=0.5, label=f"{target8} = 1", shade=1)
    plt.xlabel(col, fontsize=18)
    plt.legend(loc='upper right', fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()
# drop unwanted columns
# risk_Age_medium_bool, risk_Ht_medium_bool and risk_Wt_medium_bool are not useful
df = df.drop(['risk_Age_medium_bool','risk_Ht_medium_bool', 'risk_Wt_medium_bool'],axis=1)
df.filter(regex='Med').head(2)
df.filter(regex='Medical_Key').head(2)
"""
There are 48 medical keyword features.
We will create new features based on few statistics such as min, max, mean, count
""";
cols_med_kw = df.filter(regex='Medical_Keyword').columns
cols_med_kw
df['Medical_Keyword_count'] = df[cols_med_kw].sum(axis=1)
df['Medical_Keyword_min'] = df[cols_med_kw].min(axis=1)
df['Medical_Keyword_max'] = df[cols_med_kw].max(axis=1)
df['Medical_Keyword_mean'] = df[cols_med_kw].mean(axis=1)
df['Medical_Keyword_std'] = df[cols_med_kw].std(axis=1)
df['Medical_Keyword_skew'] = df[cols_med_kw].skew(axis=1)
df['Medical_Keyword_kurtosis'] = df[cols_med_kw].kurtosis(axis=1)
df['Medical_Keyword_median'] = df[cols_med_kw].median(axis=1)
cols_med_kw_agg = df.filter(regex=r'^Medical_Keyword_\D').columns.to_list()
cols_med_kw_agg
df[cols_med_kw_agg].describe()
df[cols_med_kw_agg].nunique()
# drop features that have only one value
cols_drop = df[cols_med_kw_agg].nunique().loc[lambda x: x==1].index.to_list()
cols_drop
df = df.drop(cols_drop, axis=1)
There is a high risk of data leakage when using target-based features; however, we can always try them and see.
df[cols_med_kw].head(2)
# df[cols_med_kw].apply(lambda x: x.nunique())
# all these features are binary.
# df['target_min_med_kw'] = df.groupby(cols_med_kw.to_list())[target].transform('min')
# df['target_mean_med_kw'] = df.groupby(cols_med_kw.to_list())[target].transform('mean')
# I don't like target-based features.
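If one did want to try such features, an out-of-fold target mean reduces (but does not eliminate) the leakage risk. A minimal sketch, assuming 5 folds of scikit-learn's KFold and the Medical_Keyword_count feature created above; oof_mean and fold_means are illustrative names only:
from sklearn.model_selection import KFold
# illustrative sketch only (not used in the rest of the notebook):
# out-of-fold target mean of Medical_Keyword_count
oof_mean = pd.Series(np.nan, index=df.index)
for tr_idx, val_idx in KFold(n_splits=5, shuffle=True, random_state=SEED).split(df):
    fold_means = df.iloc[tr_idx].groupby('Medical_Keyword_count')[target].mean()
    # values unseen in the training fold map to NaN
    oof_mean.iloc[val_idx] = df['Medical_Keyword_count'].iloc[val_idx].map(fold_means).to_numpy()
oof_mean.head()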
df.isna().sum().sum()
df['nan_count'] = df.isna().sum(axis=1)
df.iloc[:2,-2:]
df.bp.missing()
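Before hard-coding the high-missing columns in the next cell, the per-column missing fraction can be checked directly (a small sketch; missing_frac is an illustrative name and the 0.80 cutoff mirrors the comment below):
# sketch: fraction of missing values per column, highest first
missing_frac = df.isna().mean().sort_values(ascending=False)
missing_frac[missing_frac >= 0.80]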
# remove features with >= 80% missing values
cols_missing_high = ['Medical_History_10',
'Medical_History_32',
'Medical_History_24']
df = df.drop(cols_missing_high,axis=1)
# columns with a smaller fraction of missing values; these are kept and filled with -1 below
cols_missing_low = ['Medical_History_15', 'Family_Hist_5',
                    'Family_Hist_3', 'Family_Hist_2',
                    'Insurance_History_5', 'Family_Hist_4',
                    'Employment_Info_6', 'Medical_History_1',
                    'Employment_Info_4', 'Employment_Info_1']
df = df.fillna(-1)
df.isna().sum().sum()
The following variables are all categorical (nominal):
Product_Info_1, Product_Info_2, Product_Info_3, Product_Info_5, Product_Info_6, Product_Info_7, Employment_Info_2, Employment_Info_3, Employment_Info_5, InsuredInfo_1, InsuredInfo_2, InsuredInfo_3, InsuredInfo_4, InsuredInfo_5, InsuredInfo_6, InsuredInfo_7, Insurance_History_1, Insurance_History_2, Insurance_History_3, Insurance_History_4, Insurance_History_7, Insurance_History_8, Insurance_History_9, Family_Hist_1, Medical_History_2, Medical_History_3, Medical_History_4, Medical_History_5, Medical_History_6, Medical_History_7, Medical_History_8, Medical_History_9, Medical_History_11, Medical_History_12, Medical_History_13, Medical_History_14, Medical_History_16, Medical_History_17, Medical_History_18, Medical_History_19, Medical_History_20, Medical_History_21, Medical_History_22, Medical_History_23, Medical_History_25, Medical_History_26, Medical_History_27, Medical_History_28, Medical_History_29, Medical_History_30, Medical_History_31, Medical_History_33, Medical_History_34, Medical_History_35, Medical_History_36, Medical_History_37, Medical_History_38, Medical_History_39, Medical_History_40, Medical_History_41
The following variables are continuous:
Product_Info_4, Ins_Age, Ht, Wt, BMI, Employment_Info_1, Employment_Info_4, Employment_Info_6, Insurance_History_5, Family_Hist_2, Family_Hist_3, Family_Hist_4, Family_Hist_5
The following variables are discrete:
Medical_History_1, Medical_History_10, Medical_History_15, Medical_History_24, Medical_History_32
Medical_Keyword_1-48 are dummy variables.
Quantitative variables can be classified as discrete or continuous.
Categorical variable definition
Categorical variables contain a finite number of categories or distinct groups. Categorical data might not have a logical order. For example, categorical predictors include gender, material type, and payment method.
Discrete variable definition
Discrete variables are numeric variables that have a countable number of values between any two values. A discrete variable is always numeric. For example, the number of customer complaints or the number of flaws or defects.
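As a rough cross-check of these groupings, the numeric columns can be split by their number of distinct values (a heuristic sketch only; the cutoff of 10 is an arbitrary choice and the explicit lists above remain authoritative):
# heuristic sketch: split numeric columns by number of distinct values
n_unique = df.select_dtypes('number').nunique()
print("columns with <= 10 distinct values (categorical/discrete-like):", (n_unique <= 10).sum())
print("columns with  > 10 distinct values (continuous-like):", (n_unique > 10).sum())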
# df.dtypes.loc[lambda x: x=='object']
df.select_dtypes('object').head(2)
df['Product_Info_2_char'].nunique() # for a small number of categories we can use OHE (one-hot encoding)
cols_discrete = ['Medical_History_1', 'Medical_History_10',
'Medical_History_15', 'Medical_History_24',
'Medical_History_32']
cols_discrete = [i for i in cols_discrete if i not in cols_missing_high]
df[cols_discrete].head()
df[cols_discrete].nunique()
# df_copy = df.copy()
col = 'Medical_History_1'
# df[col].value_counts()
col = 'Medical_History_1'
# group rare values (count <= 10) into a single bucket coded as -2
top_col = df[col].value_counts()[lambda x: x> 10].index
df.loc[~df[col].isin(top_col),col] = -2
col = 'Medical_History_15'
# df[col].value_counts().head(200).reset_index()
# the value count is steadily decreasing, there is no obvious cut-off point.
cols_cat = ['Product_Info_1', 'Product_Info_2',
'Product_Info_3', 'Product_Info_5',
'Product_Info_6', 'Product_Info_7',
'Employment_Info_2', 'Employment_Info_3',
'Employment_Info_5', 'InsuredInfo_1',
'InsuredInfo_2', 'InsuredInfo_3',
'InsuredInfo_4', 'InsuredInfo_5',
'InsuredInfo_6', 'InsuredInfo_7',
'Insurance_History_1', 'Insurance_History_2',
'Insurance_History_3', 'Insurance_History_4',
'Insurance_History_7', 'Insurance_History_8',
'Insurance_History_9', 'Family_Hist_1',
'Medical_History_2', 'Medical_History_3',
'Medical_History_4', 'Medical_History_5',
'Medical_History_6', 'Medical_History_7',
'Medical_History_8', 'Medical_History_9',
'Medical_History_11', 'Medical_History_12',
'Medical_History_13', 'Medical_History_14',
'Medical_History_16', 'Medical_History_17',
'Medical_History_18', 'Medical_History_19',
'Medical_History_20', 'Medical_History_21',
'Medical_History_22', 'Medical_History_23',
'Medical_History_25', 'Medical_History_26',
'Medical_History_27', 'Medical_History_28',
'Medical_History_29', 'Medical_History_30',
'Medical_History_31', 'Medical_History_33',
'Medical_History_34', 'Medical_History_35',
'Medical_History_36', 'Medical_History_37',
'Medical_History_38', 'Medical_History_39',
'Medical_History_40', 'Medical_History_41']
cols_cat = [i for i in cols_cat if i not in ['Product_Info_2']]
# df[cols_cat].nunique()
df[cols_cat].nunique().loc[lambda x: x>10]
# df['Medical_History_2'].value_counts().head(200)
col = 'Medical_History_2'
# group rare values (count <= 100) into a single bucket coded as -2
top_col = df[col].value_counts()[lambda x: x> 100].index
df.loc[~df[col].isin(top_col),col] = -2
# df.nunique().loc[lambda x: x>90]
cols = df.nunique().loc[lambda x: x>90].index.to_list()
# df[cols].dtypes
df[cols].head(2)
risk_cat = ['Age_cat', 'Ht_cat', 'Wt_cat', 'BMI_cat']
cols_cat_all = cols_cat + cols_discrete + risk_cat + ['Product_Info_2_char']
print(f"Number of categorical features: {len(cols_cat_all)}")
# df[cols_cat].nunique()
print(f"Shape of df before encoding: {df.shape}")
df_encoded = pd.get_dummies(df,columns=cols_cat_all,drop_first=True)
print(f"Shape of df after encoding: {df_encoded.shape}")
df_encoded.select_dtypes('object').shape
df_encoded.sum().sum() # this needs to be a number (confirms all columns are numeric)
Here the numerical features are already normalized, so I will not log-transform or normalize them.
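A quick look at the min/max of a few continuous features (taken from the data description above) supports this; a small sketch:
# sketch: these continuous features are expected to lie roughly in [0, 1]
cols_cont = ['Product_Info_4', 'Ins_Age', 'Ht', 'Wt', 'BMI']
df[cols_cont].describe().loc[['min', 'max']]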
df.filter(regex='Resp').head(2)
df.head(2)
cols_drop = ['Id','Response','Response8']
(df.drop(cols_drop,axis=1)
.corrwith(df[target8]).abs()
.to_frame()
.sort_values([0],ascending=False).T)
cols_drop = ['Id','Response','Response8']
cols_high_corr = (df.drop(cols_drop,axis=1)
.corrwith(df[target8]).abs()
.sort_values(ascending=False)
.head(20)
.index.to_list()
)
print(cols_high_corr)
df_high_corr = df[cols_high_corr]
df_high_corr.head(2)
from sklearn.mixture import GaussianMixture
model = GaussianMixture(n_components=len(cols_high_corr), random_state=SEED, reg_covar=1e-3)
model.fit(df_high_corr)
df['cluster_gmix20'] = model.predict(df_high_corr)
df.filter(regex='cluster').head(2)
gc.collect()
from sklearn.cluster import Birch
model = Birch(n_clusters=len(cols_high_corr))
model
df['cluster_birch20'] = model.fit(df_high_corr).labels_
df.filter(regex='cluster').head(2)
gc.collect()
model_birch2 = Birch(n_clusters=2)
model_birch2
bmi_cols = ['Ins_Age','Wt','Ht',
'Medical_Keyword_15',
'Medical_Keyword_count']
for col in bmi_cols:
    dfx = df[['BMI',col]].reset_index(drop=True)
    dfx.columns = ['BMI',col]
    df[f'cluster2_BMI_vs_{col}'] = model_birch2.fit(dfx).labels_
cols_first = ['Medical_History_4','Product_Info_4','Age_BMI','Age_BMI']
cols_second = ['Medical_History_23','InsuredInfo_6','Age_Ht','Age_Wt']
for col1, col2 in zip(cols_first, cols_second):
    dfx = df[[col1,col2]].reset_index(drop=True)
    dfx.columns = [col1,col2]
    df[f'cluster2_{col1}_vs_{col2}'] = model_birch2.fit(dfx).labels_
cols_risk_cluster = ['risk_at_least_one_low',
'risk_at_least_one_high',
'risk_extreme']
cols_risk_cluster_second = ['Medical_History_23', 'Medical_History_4',
'Medical_Keyword_15','Medical_Keyword_count',
'Product_Info_4','InsuredInfo_6','BMI']
for col1 in cols_risk_cluster:
    for col2 in cols_risk_cluster_second:
        dfx = df[[col1,col2]].reset_index(drop=True)
        dfx.columns = [col1,col2]
        df[f'cluster2_{col1}_vs_{col2}'] = model_birch2.fit(dfx).labels_
df.filter(regex='cluster').head(2)
# check if the feature has only one value
df.filter(regex='cluster').nunique().loc[lambda x: x==1]
cols_drop = df.filter(regex='cluster').nunique().loc[lambda x: x==1].index.to_list()
print(cols_drop)
df = df.drop(cols_drop,axis=1)
df.nunique().loc[lambda x: x==1]
import json
df.shape
def get_high_correlated_features(df, thr=0.95):
    cols_corr = set()
    df_corr = df.corr()
    for i in range(len(df_corr.columns)):
        for j in range(i):
            if (abs(df_corr.iloc[i, j]) >= thr) and (df_corr.columns[j] not in cols_corr):
                colname = df_corr.columns[i]
                cols_corr.add(colname)
    return cols_corr
cols_high_corr = get_high_correlated_features(df,thr=0.95)
print(cols_high_corr)
print(len(cols_high_corr))
out_json = out_dir + 'correlated_features.json'
print(out_json)
with open(out_json,'w') as fo:
    # save the names of the correlated features (a set is not JSON-serializable, so convert it)
    json.dump(sorted(cols_high_corr), fo)
df = df.drop(cols_high_corr,axis=1)
df.shape
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))