In this project, we predict the probability that an auto insurance policyholder files a claim. This is a binary classification problem.
We have more than half a million records and 59 features (including pre-calculated features).
binary features: _bin
categorical features: _cat
continuous or ordinal features: ind, reg, car, calc
missing values: -1
Full forms:
ind = individual
reg = registration
car = car
calc = calculated
The target column signifies whether or not a claim was filed for that policyholder.
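As a quick illustration of these conventions, the feature groups can be selected by suffix and the -1 placeholders counted (a minimal sketch, assuming df holds the training data that is loaded later in this notebook):
cols_bin = [c for c in df.columns if c.endswith('_bin')]  # binary features
cols_cat = [c for c in df.columns if c.endswith('_cat')]  # categorical features
print(len(cols_bin), 'binary and', len(cols_cat), 'categorical features')
print('cells encoded as missing (-1):', (df == -1).sum().sum())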
import os
import time
import numpy as np
import pandas as pd
import scipy
from scipy import stats
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
time_start_notebook = time.time()
SEED=100
print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
# Google colab
%%capture
# %%capture suppresses the cell output in the notebook
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ### mount google drive
    from google.colab import drive
    drive.mount('/content/drive')

    ### load the data dir
    dat_dir = 'drive/My Drive/Colab Notebooks/data/'
    sys.path.append(dat_dir)

    ### Image dir
    img_dir = 'drive/My Drive/Colab Notebooks/images/'
    if not os.path.isdir(img_dir): os.makedirs(img_dir)
    sys.path.append(img_dir)

    ### Output dir
    out_dir = 'drive/My Drive/Colab Notebooks/outputs/'
    if not os.path.isdir(out_dir): os.makedirs(out_dir)
    sys.path.append(out_dir)

    ### Also install my custom module
    module_dir = 'drive/My Drive/Colab Notebooks/Bhishan_Modules/'
    sys.path.append(module_dir)
    # each ! command runs in its own shell, so a separate !cd has no effect;
    # quote the path (it contains spaces) and install in a single command
    !pip install -e "drive/My Drive/Colab Notebooks/Bhishan_Modules/bhishan"

    ### update pandas-profiling (the runtime must be restarted afterwards)
    ### profile = df_misc.profile_report(html={'style': {'full_width':True}})
    ### profile.to_file(out_dir + 'df_profile.html')
    ### profile.to_widgets() # not supported in Colab; just display `profile`
    !pip install -U pandas-profiling
    import pandas_profiling

    ### print
    print('Environment: Google Colaboratory.')
# NOTE: If we update modules in Google Colab, we need to restart the runtime.
import sklearn
import pandas_profiling
print([(x.__name__, x.__version__) for x in [sklearn, pandas_profiling]])
import bhishan
from bhishan import bp
print(bhishan.__version__)
%load_ext autoreload
%autoreload 2
# note: the kaggle zip file also contains a __MACOSX directory, and pandas fails
# to read such a zip file. To avoid that, unzip the data, then re-zip it from the terminal.
# good: zip train.csv.zip train.csv
# bad: right-click the file and compress (this creates the __MACOSX directory).
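If a zip with the __MACOSX directory inside has already been downloaded, the member file can also be opened explicitly with the standard library (a workaround sketch; 'train.csv.zip' here is a hypothetical local copy):
import zipfile
with zipfile.ZipFile('train.csv.zip') as zf:
    with zf.open('train.csv') as f:   # read only the member we want
        df = pd.read_csv(f)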
df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/'
'Porto_seguro_safe_driver_prediction/train.csv.zip?raw=true',compression='zip')
print(df.shape)
df.head()
df = df.drop('id',axis=1)
"""
Comment about file size:
The data is large: it has 595k records and 59 features.
ps = porto seguro
_bin = binary feature
_cat = categorical feature
continuous or ordinal: ind, reg, car, calc
""";
target = 'target'
df_small = df.sample(frac=0.05,random_state=SEED)
cols_bin = df.filter(regex='_bin$').columns.to_list()
cols_cat = df.filter(regex='_cat$').columns.to_list()
cols_num = [i for i in df.columns if i not in cols_bin+cols_cat+['target'] ]
cols_float = [i for i in cols_num if df[i].dtype== float ]
cols_int = [i for i in cols_num if df[i].dtype== int ]
cols_reg = df.filter(regex='_reg').columns.to_list()
cols_car = df.filter(regex='_car').columns.to_list()
cols_calc = df.filter(regex='_calc').columns.to_list()
print('bin', len(cols_bin), cols_bin)
print('cat', len(cols_cat), cols_cat)
print('num', len(cols_num), cols_num)
print('float', len(cols_float), cols_float)
print('int', len(cols_int), cols_int)
print('reg', len(cols_reg), cols_reg)
print('car', len(cols_car), cols_car)
print('calc', len(cols_calc), cols_calc)
# sanity check: reg+car+calc excludes the 18 ind features and the target
df.shape[1], len(cols_reg + cols_car + cols_calc), len(cols_bin + cols_cat + cols_num)
def create_df_meta(df):
    """Create a metadata dataframe describing each column.

    Reference: https://www.kaggle.com/gpreda/porto-seguro-exploratory-analysis-and-prediction
    """
    data = []
    for col in df.columns:
        # Defining the role
        if col == 'target':
            use = 'target'
        elif col == 'id':
            use = 'id'
        else:
            use = 'input'

        # Defining the type (vartype avoids shadowing the builtin `type`)
        if 'bin' in col or col == 'target':
            vartype = 'binary'
        elif 'cat' in col or col == 'id':
            vartype = 'categorical'
        elif df[col].dtype == float:
            vartype = 'real'
        elif df[col].dtype == int:
            vartype = 'integer'

        # Initialize preserve to True for all variables except id
        preserve = col != 'id'

        # Defining the data type
        dtype = df[col].dtype

        # Defining the category
        category = 'none'
        if 'ind' in col:
            category = 'individual'
        elif 'reg' in col:
            category = 'registration'
        elif 'car' in col:
            category = 'car'
        elif 'calc' in col:
            category = 'calculated'

        # Creating a dict that contains all the metadata for the variable
        col_dict = {
            'varname': col,
            'use': use,
            'type': vartype,
            'preserve': preserve,
            'dtype': dtype,
            'category': category,
        }
        data.append(col_dict)

    df_meta = pd.DataFrame(data, columns=['varname', 'use', 'type',
                                          'preserve', 'dtype', 'category'])
    df_meta.set_index('varname', inplace=True)
    return df_meta
df_meta = create_df_meta(df)
df_meta.head()
df_meta.groupby('category').count()[['use']]
# We have 20 calculated features, 16 car, 18 individual and 3 registration.
df_meta.groupby(['use','type']).count().reset_index().iloc[:,:3]
# NaNs are coded as -1; convert them back to np.nan
# so that we can impute them properly.
df = df.replace(-1,np.nan)
df.isna().sum().sum()
df_missing = df.bp.missing()
cols_drop = ['ps_car_03_cat']
cols_car = [i for i in cols_car if i not in cols_drop]
cols_cat = [i for i in cols_cat if i not in cols_drop]
df_meta.loc[cols_drop, 'preserve'] = False  # mark dropped columns in the metadata
df = df.drop(cols_drop,axis=1)
cols_missing_low = ['ps_car_05_cat',
'ps_reg_03', 'ps_car_14',
'ps_car_07_cat', 'ps_ind_05_cat', 'ps_car_09_cat',
'ps_ind_02_cat', 'ps_car_01_cat', 'ps_ind_04_cat',
'ps_car_02_cat',
'ps_car_11', 'ps_car_12']
cols_missing_num = ['ps_reg_03', 'ps_car_14','ps_car_11', 'ps_car_12' ]
for col in cols_missing_num:
    df[col] = df[col].fillna(df[col].mean())
cols_missing_cat = ['ps_car_05_cat',
'ps_car_07_cat', 'ps_ind_05_cat', 'ps_car_09_cat',
'ps_ind_02_cat', 'ps_car_01_cat', 'ps_ind_04_cat',
'ps_car_02_cat']
for col in cols_missing_cat:
    df[col] = df[col].fillna(df[col].mode()[0])
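As an alternative to the fillna loops above, scikit-learn's SimpleImputer does the same imputation and can reuse the fitted statistics on a test set later (a sketch of an equivalent approach, not run in this notebook):
from sklearn.impute import SimpleImputer
num_imputer = SimpleImputer(strategy='mean')           # mean for numeric columns
cat_imputer = SimpleImputer(strategy='most_frequent')  # mode for categorical columns
df[cols_missing_num] = num_imputer.fit_transform(df[cols_missing_num])
df[cols_missing_cat] = cat_imputer.fit_transform(df[cols_missing_cat])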
df.isna().sum().sum()
df.bp.plot_cat(target,figsize=(8,4))
reg 3
['ps_reg_01', 'ps_reg_02', 'ps_reg_03']
df[cols_reg].head()
df[cols_reg].bp.describe()
# reg 01 and reg 02 take values in steps of 0.1 (e.g. 0.1, 0.2, 0.3, ...).
# reg 01 has only the values 0.0 to 0.9 (no 1.0; only 10 unique values).
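We can verify these observations directly (a quick check of the claims above):
print(sorted(df['ps_reg_01'].dropna().unique()))   # expected: 0.0, 0.1, ..., 0.9
print(df['ps_reg_02'].nunique(), 'unique values in ps_reg_02')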
df[cols_reg].merge(df[target],left_index=True,right_index=True).bp.plot_corr()
car 15 (ps_car_03_cat was dropped above)
['ps_car_01_cat', 'ps_car_02_cat', 'ps_car_04_cat',
 'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat',
 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat',
 'ps_car_11_cat', 'ps_car_11', 'ps_car_12',
 'ps_car_13', 'ps_car_14', 'ps_car_15']
df[cols_car].head()
df[cols_car].bp.describe(sort_col='index')
# Note:
# ps_car_12 values are approximately the square roots of natural numbers, divided by 10.
# ps_car_15 values are the square roots of natural numbers.
sample = df.sample(frac=0.05,random_state=SEED)
var = ['ps_car_12', 'ps_car_15']
sample = sample[var].copy()
sample['ps_car_12_sq'] = sample['ps_car_12'] ** 2 * 10
sample['ps_car_15_sq'] = sample['ps_car_15'] ** 2
sample.head()
sample[['ps_car_12_sq','ps_car_15_sq']].nunique()
df[cols_car].merge(df[target],left_index=True,
right_index=True).bp.plot_corr(xrot=90,figsize=(18,8))
calc 20
['ps_calc_01', 'ps_calc_02', 'ps_calc_03',
'ps_calc_04', 'ps_calc_05', 'ps_calc_06',
'ps_calc_07', 'ps_calc_08', 'ps_calc_09',
'ps_calc_10', 'ps_calc_11', 'ps_calc_12',
'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin',
'ps_calc_16_bin', 'ps_calc_17_bin',
'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin']
df[cols_calc].head()
df[cols_calc].bp.describe()
df_meta.head()
# cols_real_all = df.select_dtypes('float').columns.to_list()
# cols_real = [ i for i in cols_real_all if not i.endswith('_cat')]
# len(cols_real), cols_real
# note: ps_car_11 was integer but after imputing nan, it became float
cols_real = df_meta.query(""" type == 'real' and preserve == True """).index.to_list()
print(len(cols_real), cols_real)
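If the original integer dtype of ps_car_11 matters downstream, it can be restored after the mean imputation (an optional step; rounding the few imputed values is an assumption):
df['ps_car_11'] = df['ps_car_11'].round().astype(int)  # cast back from float to int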
df.bp.compare_kde(cols_real,target,3,4)
# Features whose kde plots clearly differ between the two target classes:
# reg01 reg02 reg03
# car13 car14 car15
# The other real-valued features are not very useful for separating the classes.
df.sample(frac=0.1,random_state=SEED).bp.plot_num('ps_reg_02')
df.sample(frac=0.1,random_state=SEED).bp.plot_num('ps_car_15')
df.sample(frac=0.1,random_state=SEED).bp.plot_num_cat('ps_reg_02','target')
df.sample(frac=0.1,random_state=SEED).bp.plot_num_cat('ps_car_15','target')
df[cols_real].merge(df[target],left_index=True,
right_index=True).bp.plot_corr(xrot=90,figsize=(18,8))
# Look at highly correlated features
df_high_corr = df[cols_real].bp.corr_high(thr=0.4)
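For readers without the bhishan module, an equivalent check in plain pandas (a sketch; the 0.4 threshold matches the call above):
corr = df[cols_real].corr().abs()
mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)          # upper triangle, no diagonal
high_pairs = corr.where(mask).stack().loc[lambda s: s > 0.4]  # pairs above the threshold
print(high_pairs.sort_values(ascending=False))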
cols_high_corr = ['ps_reg_02', 'ps_reg_03', 'ps_car_14', 'ps_car_12',
'ps_reg_01', 'ps_car_13', 'ps_car_15'] + [target]
sample = df_small[cols_high_corr]
sample.head(2)
sns.pairplot(sample, hue='target', palette = 'Set1', diag_kind='kde')
cols_high_corr1 = ['ps_reg_03', 'ps_car_12', 'ps_car_14', 'ps_car_15', 'ps_reg_02', 'ps_car_14']
cols_high_corr2 = ['ps_reg_02', 'ps_car_13', 'ps_car_12', 'ps_car_13', 'ps_reg_01', 'ps_car_13']
print(cols_high_corr1)
print(cols_high_corr2)
# lmplot is a figure-level function; plt.figure() beforehand would only create
# an unused empty figure, so we size the plot via height/aspect instead.
sns.lmplot(x='ps_reg_02',y='ps_reg_03',data=sample,
           hue=target, palette='Set1',
           scatter_kws={'alpha':0.3},
           height=6, aspect=1.5)
plt.xlim(-0.1,2)
plt.tight_layout()
plt.show()
sample.bp.regplot_binn(cols_high_corr1,cols_high_corr2,target,2,3)
# reg02 and reg03 have almost the same relation with the target; one of them may be dropped.
# car12 and car14 have almost the same relation with the target; one of them may be dropped.
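If we act on this, one feature from each redundant pair could be dropped; which member to drop is a judgment call (the choice below is an assumption, and df itself is left untouched):
df_reduced = df.drop(['ps_reg_03', 'ps_car_14'], axis=1)  # keep ps_reg_02 and ps_car_12
print(df_reduced.shape)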
cols_bin = df.filter(regex='_bin$').columns.to_list()
print(len(cols_bin), cols_bin)
df[cols_bin].head(2)
df.bp.plot_cat_cat(cols_bin[0],target)
(df[cols_bin]
.apply(lambda x: x.value_counts(normalize=True))
.T.plot(kind='bar', stacked=True,color=['tomato','teal'])
)
plt.xlabel('Binary Features')
plt.tight_layout()
plt.title('Proportion of Binary Features')
plt.show()
# df.bp.plot_cat_stacked(cols_bin)
len(cols_bin)
df.bp.compare_kde(cols_bin,target,6,3,figsize=(24,18))
cols_cat = df.filter(regex='_cat$').columns.to_list()
print(len(cols_cat), cols_cat)
df[cols_cat].head(2)
df[cols_cat].nunique().sort_values()
cols_cat_small = df[cols_cat].nunique().sort_values().loc[lambda x: x<=10].index.to_list()
print(len(cols_cat_small), cols_cat_small)
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
df.bp.countplot(cols_cat_small,3,4)
df[cols_cat_small].head(2)
We can look at an overview of the data (histograms, missing values, correlations, skewness) using the pandas_profiling module. Bear in mind that it may take a long time to produce results, so a good approach is to save the reports to output files and generate them only once.
import pandas_profiling
out_dir = '../images/Porto/'
if ENV_COLAB:
    out_dir = 'drive/My Drive/Colab Notebooks/outputs'
    out_dir = out_dir + '/Porto/'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
out_dir
df.head(2)
# profile = df_misc.profile_report(html={'style': {'full_width':True}})
# profile.to_file(out_dir + 'df_profile.html')
dfs = [df[cols_bin + [target]],
df[cols_cat + [target]],
df[cols_num + [target]],
]
ofiles = ['df_bin', 'df_cat', 'df_num']
ofiles = [out_dir + i +'_profile.html' for i in ofiles]
ofiles[0]
profile = dfs[0].profile_report(html={'style': {'full_width':True}})
# profile
profile = dfs[0].profile_report(explorative=True)
# profile
%%time
for dfx, ofile in zip(dfs, ofiles):
    # the target column is already included in each dfx (see the dfs list above),
    # so no extra merge is needed here
    if not os.path.isfile(ofile):
        profile = dfx.profile_report(html={'style': {'full_width':True}})
        profile.to_file(ofile)
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))