In this dataset, you are provided over a hundred variables describing attributes of life insurance applicants. The task is to predict the "Response" variable for each Id in the test set. "Response" is an ordinal measure of risk that has 8 levels.
Variable | Description
---------|------------
Id | A unique identifier associated with an application.
Product_Info_1-7 | A set of normalized variables relating to the product applied for.
Ins_Age | Normalized age of applicant.
Ht | Normalized height of applicant.
Wt | Normalized weight of applicant.
BMI | Normalized BMI of applicant.
Employment_Info_1-6 | A set of normalized variables relating to the employment history of the applicant.
InsuredInfo_1-7 | A set of normalized variables providing information about the applicant.
Insurance_History_1-9 | A set of normalized variables relating to the insurance history of the applicant.
Family_Hist_1-5 | A set of normalized variables relating to the family history of the applicant.
Medical_History_1-41 | A set of normalized variables relating to the medical history of the applicant.
Medical_Keyword_1-48 | A set of dummy variables relating to the presence or absence of a medical keyword being associated with the application.
Response | The target variable, an ordinal measure relating to the final decision associated with an application.
The following variables are all categorical (nominal):
Product_Info_1, Product_Info_2, Product_Info_3, Product_Info_5, Product_Info_6, Product_Info_7, Employment_Info_2, Employment_Info_3, Employment_Info_5, InsuredInfo_1, InsuredInfo_2, InsuredInfo_3, InsuredInfo_4, InsuredInfo_5, InsuredInfo_6, InsuredInfo_7, Insurance_History_1, Insurance_History_2, Insurance_History_3, Insurance_History_4, Insurance_History_7, Insurance_History_8, Insurance_History_9, Family_Hist_1, Medical_History_2, Medical_History_3, Medical_History_4, Medical_History_5, Medical_History_6, Medical_History_7, Medical_History_8, Medical_History_9, Medical_History_11, Medical_History_12, Medical_History_13, Medical_History_14, Medical_History_16, Medical_History_17, Medical_History_18, Medical_History_19, Medical_History_20, Medical_History_21, Medical_History_22, Medical_History_23, Medical_History_25, Medical_History_26, Medical_History_27, Medical_History_28, Medical_History_29, Medical_History_30, Medical_History_31, Medical_History_33, Medical_History_34, Medical_History_35, Medical_History_36, Medical_History_37, Medical_History_38, Medical_History_39, Medical_History_40, Medical_History_41
The following variables are continuous:
Product_Info_4, Ins_Age, Ht, Wt, BMI, Employment_Info_1, Employment_Info_4, Employment_Info_6, Insurance_History_5, Family_Hist_2, Family_Hist_3, Family_Hist_4, Family_Hist_5
The following variables are discrete:
Medical_History_1, Medical_History_10, Medical_History_15, Medical_History_24, Medical_History_32
Medical_Keyword_1-48 are dummy variables.
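For later preprocessing it can help to hold these groups in Python lists. A minimal sketch (the names cols_cat, cols_cont, cols_disc, and cols_dummy are introduced here for illustration; their contents simply mirror the lists above):

# feature groups, transcribed from the data dictionary above
cols_cat = ([f'Product_Info_{i}' for i in (1, 2, 3, 5, 6, 7)]
            + [f'Employment_Info_{i}' for i in (2, 3, 5)]
            + [f'InsuredInfo_{i}' for i in range(1, 8)]
            + [f'Insurance_History_{i}' for i in (1, 2, 3, 4, 7, 8, 9)]
            + ['Family_Hist_1']
            + [f'Medical_History_{i}' for i in range(2, 42)
               if i not in (10, 15, 24, 32)])
cols_cont = (['Product_Info_4', 'Ins_Age', 'Ht', 'Wt', 'BMI',
              'Employment_Info_1', 'Employment_Info_4', 'Employment_Info_6',
              'Insurance_History_5']
             + [f'Family_Hist_{i}' for i in range(2, 6)])
cols_disc = [f'Medical_History_{i}' for i in (1, 10, 15, 24, 32)]
cols_dummy = [f'Medical_Keyword_{i}' for i in range(1, 49)]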
import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True, font_scale=1.5)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# random state
SEED = 100
time_start_notebook = time.time()
# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 50)
import scipy
from scipy import stats
import IPython
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
# Google colab
%%capture
# capture will not print in notebook
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ### mount google drive
    from google.colab import drive
    drive.mount('/content/drive')

    ### data dir
    dat_dir = 'drive/My Drive/Colab Notebooks/data/'
    sys.path.append(dat_dir)

    ### image dir
    img_dir = 'drive/My Drive/Colab Notebooks/images/'
    if not os.path.isdir(img_dir): os.makedirs(img_dir)
    sys.path.append(img_dir)

    ### output dir
    out_dir = 'drive/My Drive/Colab Notebooks/outputs/'
    if not os.path.isdir(out_dir): os.makedirs(out_dir)
    sys.path.append(out_dir)

    ### also install my custom module
    module_dir = 'drive/My Drive/Colab Notebooks/Bhishan_Modules/'
    sys.path.append(module_dir)
    # a bare !cd runs in its own subshell and does not persist, so point
    # pip at the (space-containing, hence quoted) path directly
    !pip install -e "{module_dir}bhishan"

    # pandas profiling (upgrading requires a runtime restart)
    !pip install -U pandas-profiling
    import pandas_profiling

    print('Environment: Google Colaboratory.')
# NOTE: If we update modules in gcolab, we need to restart the runtime.
import sklearn
import pandas_profiling
import bhishan
from bhishan import bp
print(bhishan.__version__)
%load_ext watermark
%watermark -a "Bhishan Poudel" -dvm
%watermark -iv
%load_ext autoreload
%autoreload 2
df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/Prudential_Insurance/raw/train.csv.zip?raw=true',compression='zip')
print(df.shape)
df.head()
target = 'Response'
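As a quick sanity check (output not shown in the original), the eight ordinal levels of the target can be confirmed directly:

# counts per risk level, sorted by level; should show the 8 ordinal classes
print(df[target].value_counts().sort_index())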
df_misc = df[['Ins_Age','Ht','Wt','BMI']]
print(df_misc.shape)
df_misc.head()
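# The .bp accessor below comes from the custom bhishan module imported above.
# Judging from the method names (behavior assumed, not documented here):
# get_column_descriptions() summarizes each column, missing() tabulates
# null counts, and plot_corr() draws a correlation heatmap.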
df_misc.bp.get_column_descriptions()
df_misc.bp.missing()
df_misc.bp.plot_corr()
sns.scatterplot(x='Ins_Age', y='BMI', hue=target, data=df);
df_misc.merge(df[target],left_index=True,right_index=True).bp.plot_corr()
df_product = df[df.columns[df.columns.str.startswith('Product')]]
print(df_product.shape)
df_product.head()
df_product.bp.get_column_descriptions(style=True)
df_product.bp.missing()
df_product.merge(df[target],left_index=True,right_index=True).bp.plot_corr(xrot=75)
df_employment = df[df.columns[df.columns.str.startswith('Employment')]]
print(df_employment.shape)
df_employment.head()
df_employment.bp.get_column_descriptions()
df_employment.bp.missing()
df_employment.merge(df[target],left_index=True,right_index=True).bp.plot_corr(xrot=75)
df_insurance = df[df.columns[df.columns.str.startswith('Insurance')]]
print(df_insurance.shape)
df_insurance.head()
df_insurance.bp.get_column_descriptions()
df_insurance.bp.missing()
df_insurance.merge(df[target],left_index=True,right_index=True).bp.plot_corr(xrot=75)
df_family = df[df.columns[df.columns.str.startswith('Family')]]
print(df_family.shape)
df_family.head()
df_family.bp.get_column_descriptions()
df_family.bp.missing()
df_family.merge(df[target],left_index=True,right_index=True).bp.plot_corr(xrot=75)
df_medical = df[df.columns[df.columns.str.startswith('Medical')]]
print(df_medical.shape)
df_medical.head()
# df_medical.bp.get_column_descriptions()
df_medical.bp.missing()
df_medical.merge(df[target],left_index=True,right_index=True).bp.plotly_corr(target)
df_medical.merge(df[target], left_index=True, right_index=True).bp.plot_corr_sns(target=target)
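# Three views of the target distribution follow: a seaborn count plot,
# then the custom bp count plot and Pareto chart.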
sns.catplot(x='Response', kind='count', data=df)
df.bp.plot_cat('Response')
df.bp.plot_pareto('Response')
We can get an overview of the data, such as histograms, missing values, correlations, and skewness, using the pandas_profiling module. Bear in mind that it may take a long time to produce results, so a good approach, implemented below, is to save each report to an output file and generate it only once.
import pandas_profiling
if ENV_COLAB:
    out_dir = 'drive/My Drive/Colab Notebooks/outputs'
else:
    out_dir = '../outputs'
out_dir = out_dir + '/Prudential/'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
out_dir
df.head(2)
# profile = df_misc.profile_report(html={'style': {'full_width':True}})
# profile.to_file(out_dir + 'df_profile.html')
# profile every feature group analyzed above (including df_family)
dfs = [df_misc, df_product, df_employment, df_insurance, df_family, df_medical]
ofiles = ['df_misc', 'df_product', 'df_employment', 'df_insurance', 'df_family', 'df_medical']
ofiles = [out_dir + i +'_profile.html' for i in ofiles]
ofiles[0]
profile = df_misc.profile_report(html={'style': {'full_width':True}})
# profile
profile = df_misc.profile_report(explorative=True)
profile
%%time
# profile each feature group together with the target, but only if the
# report file does not already exist (profiling is slow)
for dfx, ofile in zip(dfs, ofiles):
    dfx = dfx.merge(df[target], left_index=True, right_index=True)
    if not os.path.isfile(ofile):
        profile = dfx.profile_report(html={'style': {'full_width': True}})
        profile.to_file(ofile)
time_taken = time.time() - time_start_notebook
h, rem = divmod(time_taken, 60*60)
m, s = divmod(rem, 60)
print('Time taken to run whole notebook: '
      '{:.0f} hr {:.0f} min {:.0f} secs'.format(h, m, s))