In this project we use multiclass classification to predict one of the 8 possible values of the Response variable.
The data is taken from the Kaggle Prudential Life Insurance Assessment project.
Only about 40% of households in the USA have a life insurance policy. Based on various attributes of an applicant, one of 8 different quotes is granted.
Here, category 8 has the highest count; I assume it is the quote that is most commonly granted.
Records: 60k
Features: 127
Target: Response (has 8 categories, 1-8)
Features:
1 Misc              : Ins_Age, Ht, Wt, BMI                       (4)
2 Product Info      : Product_Info_1 to Product_Info_7           (7)
3 Employment Info   : Employment_Info_1 to Employment_Info_6     (6)
4 Insured Info      : InsuredInfo_1 to InsuredInfo_7             (7)
5 Insurance History : Insurance_History_1 to Insurance_History_9 (9)
6 Family History    : Family_Hist_1 to Family_Hist_5             (5)
7 Medical History   : Medical_History_1 to Medical_History_41    (41)
8 Medical Keywords  : Medical_Keyword_1 to Medical_Keyword_48    (48)

Target : Response (1)
ID     : Id       (1)
---------------------------------------------------
Total Features: 127
Dependent Variable: 1 (Response)
Method Used: XGBoost multiclass classification (multi:softprob objective)
Metric Used: Quadratic weighted kappa (Cohen's kappa with quadratic weights)
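The quadratic weighted kappa measures agreement between the true and predicted ordinal ratings, penalizing predictions far from the true class much more heavily than near misses. A minimal sketch of computing it with scikit-learn on made-up labels (the numbers below are purely illustrative):

from sklearn.metrics import cohen_kappa_score

y_true = [1, 2, 8, 5, 3]   # hypothetical true Response ratings (1-8)
y_pred = [1, 3, 7, 5, 2]   # hypothetical predicted ratings

# quadratic weights penalize a 2-class miss 4 times more than a 1-class miss
print(cohen_kappa_score(y_true, y_pred, weights='quadratic'))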
%%capture
# %%capture suppresses the output of this cell
import os
import sys

ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    ### mount google drive
    from google.colab import drive
    drive.mount('/content/drive')

    ### data dir
    dat_dir = 'drive/My Drive/Colab Notebooks/data/'
    sys.path.append(dat_dir)

    ### image dir
    img_dir = 'drive/My Drive/Colab Notebooks/images/'
    if not os.path.isdir(img_dir): os.makedirs(img_dir)
    sys.path.append(img_dir)

    ### output dir
    out_dir = 'drive/My Drive/Colab Notebooks/outputs/'
    if not os.path.isdir(out_dir): os.makedirs(out_dir)
    sys.path.append(out_dir)

    ### also install my custom module
    # note: each ! command runs in its own shell, so `!cd` does not persist;
    # install with the full (quoted) path instead.
    module_dir = 'drive/My Drive/Colab Notebooks/Bhishan_Modules/'
    sys.path.append(module_dir)
    !pip install -e "drive/My Drive/Colab Notebooks/Bhishan_Modules/bhishan"
    import bhishan
    from bhishan import bp

    ## install/upgrade extra packages
    !pip install watermark
    !pip install -U xgboost

    #### print
    print('Environment: Google Colaboratory.')

# NOTE: If we update modules in Colab, we need to restart the runtime.
import time
import numpy as np
import pandas as pd
import seaborn as sns
import os
import json
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot')
# random state
SEED=100
time_start_notebook = time.time()
home = os.path.expanduser('~')
[(x.__name__,x.__version__) for x in [np,pd,sns]]
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from mlxtend.feature_selection import ColumnSelector
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
import xgboost as xgb
xgb.__version__
def data_cleaning():
    df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/Prudential_Insurance/raw/train.csv.zip?raw=true',
                     compression='zip')
    df = df.copy()

    # drop the Id column and two medical history columns
    columns_to_drop = ['Id', 'Medical_History_10', 'Medical_History_24']
    df = df.drop(columns_to_drop, axis=1)

    # split Product_Info_2 into its character and number parts
    df['Product_Info_2_char'] = df.Product_Info_2.str[0]
    df['Product_Info_2_num'] = df.Product_Info_2.str[1]

    # factorize categorical variables
    df['Product_Info_2'] = pd.factorize(df['Product_Info_2'])[0]
    df['Product_Info_2_char'] = pd.factorize(df['Product_Info_2_char'])[0]
    df['Product_Info_2_num'] = pd.factorize(df['Product_Info_2_num'])[0]

    # simple feature engineering
    df['BMI_Age'] = df['BMI'] * df['Ins_Age']
    med_keyword_columns = df.columns[df.columns.str.startswith('Medical_Keyword_')]
    df['Med_Keywords_Count'] = df[med_keyword_columns].sum(axis=1)

    # impute missing values with a sentinel value
    df = df.fillna(-1)
    return df
df = data_cleaning()
print(df.shape)
df.isna().sum().sum(), df.sum().sum()
def get_already_cleaned_data():
    file_data = out_dir + 'Prudential/' + 'clean_data.csv'
    df = pd.read_csv(file_data, compression='zip')
    df = df.drop('Id', axis=1)

    # one-hot encode the saved list of categorical features
    file_features = out_dir + 'Prudential/' + 'categorical_features.json'
    cols_cat = json.load(open(file_features))
    df = pd.get_dummies(df, columns=cols_cat, drop_first=True)
    return df

# this gives a better result.
# df = get_already_cleaned_data()
# print(df.shape)
# df.isna().sum().sum(), df.sum().sum()

# make sure all values are numeric and there are no NaNs
df.isna().sum().sum()
from sklearn.model_selection import train_test_split
target = 'Response'
df_Xtrain_orig, df_Xtest, ser_ytrain_orig, ser_ytest = train_test_split(
    df.drop(target, axis=1), df[target],
    test_size=0.2, random_state=SEED, stratify=df[target])

df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
    df_Xtrain_orig, ser_ytrain_orig,
    test_size=0.2, random_state=SEED, stratify=ser_ytrain_orig)
ytrain = ser_ytrain.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()
print(f"df : {df.shape}")
print(f"\ndf_Xtrain_orig : {df_Xtrain_orig.shape}")
print(f"ser_ytrain_orig: {ser_ytrain_orig.shape}")
print(f"\ndf_Xtrain : {df_Xtrain.shape}")
print(f"ser_ytrain : {ser_ytrain.shape}")
print(f"\ndf_Xvalid : {df_Xvalid.shape}")
print(f"ser_yvalid : {ser_yvalid.shape}")
print(f"\ndf_Xtest : {df_Xtest.shape}")
print(f"ser_ytest : {ser_ytest.shape}")
df_Xtrain_orig.head(2)
# dtrain = xgb.DMatrix(df_Xtrain, label=ser_ytrain)
# dvalid = xgb.DMatrix(df_Xvalid, label=ser_yvalid)
# dtest = xgb.DMatrix(df_Xtest, label=ser_ytest)
# https://www.kaggle.com/chenglongchen/customized-softkappa-loss-in-xgboost
def softmax(score):
    # row-wise softmax: convert raw scores to class probabilities
    score = np.asarray(score, dtype=float)
    score = np.exp(score - np.max(score))
    score /= np.sum(score, axis=1)[:, np.newaxis]
    return score
def evalerror(preds, dtrain):
    # xgboost multiclass uses labels 0 to 7, but our labels are 1 to 8,
    # so we add 1 back before scoring.
    labels = dtrain.get_label() + 1
    preds = softmax(preds)                       # class probabilities
    pred_labels = np.argmax(preds, axis=1) + 1   # most probable class, shifted to 1-8
    # xgboost minimizes the eval metric, so return negative kappa
    kappa = -metrics.cohen_kappa_score(labels, pred_labels, weights='quadratic')
    return 'kappa', kappa
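A minimal sketch of plugging this custom metric into the native xgboost API, assuming the commented-out DMatrix objects above are built with labels shifted down to 0-7 (the booster parameters are illustrative, not tuned; depending on the xgboost version the predictions passed to the metric may be raw margins or probabilities, but the argmax, and hence the kappa, is the same):

# hypothetical usage sketch (not run in this notebook)
dtrain = xgb.DMatrix(df_Xtrain, label=ser_ytrain - 1)   # native API needs labels 0..7
dvalid = xgb.DMatrix(df_Xvalid, label=ser_yvalid - 1)

params = {'objective': 'multi:softprob', 'num_class': 8, 'seed': SEED}
bst = xgb.train(params, dtrain,
                num_boost_round=100,
                evals=[(dvalid, 'valid')],
                feval=evalerror,             # custom eval metric defined above
                early_stopping_rounds=10)    # stop when -kappa stops improving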
NOTE
Unlike CatBoost or LightGBM, XGBoost cannot handle categorical features by itself; like Random Forest, it only accepts numerical values. Therefore, one has to apply an encoding such as label encoding, mean encoding, or one-hot encoding before feeding categorical data to XGBoost, as in the sketch below.
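For illustration, a minimal sketch of label encoding and one-hot encoding a nominal column with pandas; the small frame and its values are hypothetical (this notebook uses pd.factorize in data_cleaning and pd.get_dummies in get_already_cleaned_data):

# hypothetical demo frame, not part of the pipeline above
demo = pd.DataFrame({'Product_Info_2': ['A1', 'D3', 'A1', 'E1']})

# label encoding: map each category to an integer code
demo['Product_Info_2_label'] = pd.factorize(demo['Product_Info_2'])[0]

# one-hot encoding: one 0/1 column per category (drop_first avoids a redundant column)
demo_ohe = pd.get_dummies(demo, columns=['Product_Info_2'], drop_first=True)
print(demo_ohe)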
Multiclass classification:
“multi:softmax” – set XGBoost to do multiclass classification using the softmax objective; you also need to set num_class (the number of classes).
“multi:softprob” – same as softmax, but outputs a vector of ndata * nclass values, which can be reshaped into an ndata x nclass matrix. The result contains the predicted probability of each data point belonging to each class.
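A minimal sketch of the difference between the two objectives on a toy problem with the native booster API (the toy data, number of classes, and boosting rounds are made up for illustration):

# hypothetical toy data: 3 classes, 6 samples, 2 features
X_toy = np.array([[0, 1], [1, 0], [2, 2], [0, 2], [1, 1], [2, 0]], dtype=float)
y_toy = np.array([0, 1, 2, 0, 1, 2])
dtoy = xgb.DMatrix(X_toy, label=y_toy)

# multi:softmax -> predict() returns the class index directly
bst_sm = xgb.train({'objective': 'multi:softmax', 'num_class': 3}, dtoy, num_boost_round=5)
print(bst_sm.predict(dtoy))                      # shape (6,), values in {0, 1, 2}

# multi:softprob -> predict() returns ndata * nclass probabilities
bst_sp = xgb.train({'objective': 'multi:softprob', 'num_class': 3}, dtoy, num_boost_round=5)
probs = bst_sp.predict(dtoy).reshape(-1, 3)      # reshape to (ndata, nclass)
print(probs.argmax(axis=1))                      # same class indices as above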
import xgboost
from xgboost import XGBClassifier
xgboost.__version__
clf_xgb = XGBClassifier(objective= 'multi:softprob', random_state=SEED,n_jobs=-1)
clf_xgb
%%time
def run_default():
    clf_xgb = XGBClassifier(objective='multi:softprob', random_state=SEED, n_jobs=-1)

    # default eval metric
    clf_xgb.fit(df_Xtrain, ser_ytrain,
                eval_set=[(df_Xvalid, ser_yvalid)],
                early_stopping_rounds=10,
                )

    ypreds = clf_xgb.predict(df_Xtest)
    score = metrics.cohen_kappa_score(ytest, ypreds, weights='quadratic')
    print(score)

# run_default()
"""
For simple cleaned data
=========================
0.5377897181694622
CPU times: user 1min 13s, sys: 50.9 ms, total: 1min 13s
Wall time: 1min 13s
For already cleaned data
========================
0.5438646999324421
CPU times: user 7min 15s, sys: 245 ms, total: 7min 15s
Wall time: 7min 15s
""";
%%time
def run_custom():
    clf_xgb = XGBClassifier(objective='multi:softprob', random_state=SEED, n_jobs=-1)

    # custom eval metric (negative quadratic weighted kappa)
    clf_xgb.fit(df_Xtrain, ser_ytrain,
                eval_set=[(df_Xvalid, ser_yvalid)],
                eval_metric=evalerror,
                early_stopping_rounds=10,
                verbose=10,   # print the eval result every 10 rounds
                )

    ypreds = clf_xgb.predict(df_Xtest)
    score = metrics.cohen_kappa_score(ytest, ypreds, weights='quadratic')
    print(score)
"""
For simple cleaned data
=========================
0.5530053047703208
CPU times: user 2min 33s, sys: 115 ms, total: 2min 33s
Wall time: 2min 33s
For already cleaned data
========================
kappa = 0.5407784634778012
""";
# run_custom()
"""
For simple cleaned data
======================================================
objectiveFunction evalMetric resultKappa
softmax default 0.5377897181694622
softmax custom 0.5530053047703208
Here, custom metric gives the best result.
For already cleaned data
======================================================
objectiveFunction evalMetric resultKappa
softmax default 0.5438646999324421
softmax custom 0.5407784634778012
Surprisingly, the default metric gives better validation result. It might be
due to our objective function is softmax but eval metric is kappa instead of
default eval metric provided by xgboost.
I have got different results for using custom/default metric and
simple/detail data cleaning methods.
We can not determine which features are important until we do further
feature engineering. Also, we need to tune the hyperparameters.
""";
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))