In this project we predict one of the 8 possible values of Response (an ordinal multiclass target).
The data comes from the Kaggle Prudential Life Insurance Assessment competition.
Only about 40% of households in the USA have a life insurance policy. Based on an applicant's attributes, one of 8 different quotes is granted.
Category 8 has the highest count (see the quick check after the feature table below); I assume it is the quote that is granted most often.
Records: 60k
Features: 127
Target: Response (has 8 categories, 1-8)
Features:
1. Misc              : Ins_Age, Ht, Wt, BMI                        (4)
2. Product Info      : Product_Info_1 to Product_Info_7            (7)
3. Employment Info   : Employment_Info_1 to Employment_Info_6      (6)
4. Insured Info      : InsuredInfo_1 to InsuredInfo_7              (7)
5. Insurance History : Insurance_History_1 to Insurance_History_9  (9)
6. Family History    : Family_Hist_1 to Family_Hist_5              (5)
7. Medical History   : Medical_History_1 to Medical_History_41     (41)
8. Medical Keywords  : Medical_Keyword_1 to Medical_Keyword_48     (48)
---------------------------------------------------
Total Features: 127
Dependent Variable: Response (1 column)
Id Column: Id (1 column, dropped during cleaning)
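A quick look at the target distribution (reading the same zipped train.csv that data_cleaning() loads below) backs up the claim that category 8 is the most common:

# Quick check of the class distribution; assumes the same train.csv
# that data_cleaning() uses later in this notebook.
import pandas as pd
url = ('https://github.com/bhishanpdl/Datasets/blob/master/'
       'Prudential_Insurance/raw/train.csv.zip?raw=true')
pd.read_csv(url, compression='zip')['Response'].value_counts()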
Method Used: XGBoost regression (objectives reg:squarederror and count:poisson), offset optimization of the class cut points with scipy's fmin_powell, and a VotingRegressor ensemble of the two models.
Metric Used: quadratic weighted kappa (Cohen's kappa with quadratic weights).
Notes about offset
In this project the evaluation metric is quadratic weighted kappa. But when we fit a regression with XGBoost, the loss function is squared error (MSE). Predictions obtained by minimizing MSE are not necessarily optimal for kappa.
For an ordinal ranking metric such as kappa, we assume there is a set of cutoff offsets under which the continuous MSE predictions map to better class labels than naive rounding. For example, a prediction of 1.6 from the regressor rounds to class 2. But if we add an offset of 1, then 1.6 + 1 = 2.6, which rounds to class 3. By shifting the class boundaries this way we may achieve a better kappa.
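As a toy illustration (the cut points here are made up, not the optimized ones used later), naive rounding and offset binning can assign the same raw prediction to different classes:

import numpy as np

preds = np.array([1.6, 3.4, 7.2])           # hypothetical raw regression outputs
print(np.clip(np.round(preds), 1, 8))       # naive rounding -> [2. 3. 7.]

cuts = [1.5, 2.5, 3.1, 4.5, 5.5, 6.5, 7.0]  # 7 made-up cut points for 8 classes
print(np.digitize(preds, cuts) + 1)         # offset binning  -> [2 4 8]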
import time
import numpy as np
import pandas as pd
import seaborn as sns
import os
import json
from tqdm.notebook import tqdm  # tqdm_notebook is deprecated in newer tqdm
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot')
SEED=100
home = os.path.expanduser('~')
time_start_notebook = time.time()
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from scipy.optimize import fmin_powell
import xgboost as xgb
xgb.__version__
def data_cleaning():
    # load the raw training data from the Datasets repo (zipped csv)
    df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/Prudential_Insurance/raw/train.csv.zip?raw=true',
                     compression='zip')

    # drop the id column and two mostly-missing medical history columns
    columns_to_drop = ['Id', 'Medical_History_10', 'Medical_History_24']
    df = df.drop(columns_to_drop, axis=1)

    # split Product_Info_2 (e.g. 'D3') into its letter and digit parts
    df['Product_Info_2_char'] = df.Product_Info_2.str[0]
    df['Product_Info_2_num'] = df.Product_Info_2.str[1]

    # factorize categorical variables
    df['Product_Info_2'] = pd.factorize(df['Product_Info_2'])[0]
    df['Product_Info_2_char'] = pd.factorize(df['Product_Info_2_char'])[0]
    df['Product_Info_2_num'] = pd.factorize(df['Product_Info_2_num'])[0]

    # interaction feature and count of medical keyword flags
    df['BMI_Age'] = df['BMI'] * df['Ins_Age']
    med_keyword_columns = df.columns[df.columns.str.startswith('Medical_Keyword_')]
    df['Med_Keywords_Count'] = df[med_keyword_columns].sum(axis=1)

    # fill remaining missing values with a sentinel
    df = df.fillna(-1)
    return df
df = data_cleaning()
print(df.shape)
df.isna().sum().sum(), df.sum().sum()  # no missing values left; crude checksum of the data
target = 'Response'
df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = train_test_split(
    df.drop(target, axis=1), df[target],
    test_size=0.2, random_state=SEED, stratify=df[target])
ytrain = ser_ytrain.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()
dtrain = xgb.DMatrix(df_Xtrain, label=ser_ytrain)
dtest = xgb.DMatrix(df_Xtest, label=ser_ytest)
def eval_wrapper(y, yhat):
    # Cohen's kappa is symmetric: swapping y and yhat gives the same result.
    y = np.array(y).astype(int)
    yhat = np.array(yhat)
    # round the continuous predictions and clip them to the valid label range
    yhat = np.clip(np.round(yhat), np.min(y), np.max(y)).astype(int)
    return metrics.cohen_kappa_score(y, yhat, weights='quadratic')
def apply_offsets(offsets, preds):
    # map a continuous prediction y to a class 1-8 using 7 cut points:
    # y < x1 -> 1, x1 <= y < x2 -> 2, ..., y >= x7 -> 8
    x1, x2, x3, x4, x5, x6, x7 = offsets
    res = []
    for y in list(preds):
        if y < x1:
            res.append(1)
        elif y < x2:
            res.append(2)
        elif y < x3:
            res.append(3)
        elif y < x4:
            res.append(4)
        elif y < x5:
            res.append(5)
        elif y < x6:
            res.append(6)
        elif y < x7:
            res.append(7)
        else:
            res.append(8)
    return res

def digitize_train(guess_lst, train_preds):
    # identical binning logic; kept as a separate name for the optimizer below
    return apply_offsets(list(guess_lst), train_preds)

def quadratic_weighted_kappa(y_true, y_pred):
    return metrics.cohen_kappa_score(y_true, y_pred, weights='quadratic')

def get_offsets_minimizing_train_preds_kappa(guess_lst):
    # objective for fmin_powell: negative kappa of the digitized train preds
    # (reads the global train_preds, which must hold raw regression output)
    res = digitize_train(guess_lst, train_preds)
    return -quadratic_weighted_kappa(ytrain, res)
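A quick sanity check of the metric on toy labels: identical labels give kappa = 1, and because the weights are quadratic, near misses cost far less than distant ones (the labels below are illustrative, not from the model):

# toy labels, purely illustrative
y_toy = [1, 2, 3, 8]
print(quadratic_weighted_kappa(y_toy, y_toy))         # 1.0: perfect agreement
print(quadratic_weighted_kappa(y_toy, [1, 2, 3, 7]))  # one near miss -> still high
print(quadratic_weighted_kappa(y_toy, [8, 8, 8, 1]))  # far misses -> very low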
df_eval = pd.DataFrame({'Model': [],
                        'TrainKappa': [],
                        'TestKappa': []})
params_dict = {'objective': 'reg:squarederror',
               'eta': 0.05,
               'min_child_weight': 240,
               'subsample': 0.9,
               'colsample_bytree': 0.67,
               'max_depth': 6
               }
xgb_num_rounds = 800
%%time
bst = xgb.train(params_dict, dtrain, xgb_num_rounds)
# get preds (no early stopping was used, so predict with all trees;
# best_iteration/ntree_limit only apply when early stopping is on)
train_preds = bst.predict(dtrain)
test_preds = bst.predict(dtest)
train_kappa = eval_wrapper(ytrain,train_preds)
test_kappa = eval_wrapper(ytest, test_preds)
row = ['xgb reg', train_kappa, test_kappa ]
df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates()
df_eval
%%time
"""
Here, we already have train predictions.
For these train predictions, if we compare them with original train labels,
we get some kappa value. But we want to change the train predictions such
that when comparing this changed train prediction with original train labels
we get better kappa.
For that we use scipy function "fmin_powell". The function needs some initial
guess so that it can give better offset next time. The default guess is 0.5.
For 8 classes (1-8) we can start with (1.5,2.5,...,8.5) then use the result
and run the function again.
""";
x0 = (1.5, 2.9, 3.1, 4.5, 5.5, 6.1, 7.1)  # initial guess
# offsets = fmin_powell(get_offsets_minimizing_train_preds_kappa, x0, disp=True)
# hard-coded result of a previous fmin_powell run, to save time:
offsets = [3.117688855474597, 3.574261600706765, 4.347222327043992,
           4.919148133534166, 5.529077199779955, 6.1623013715330766,
           6.826617448466462]
print(list(offsets))
train_preds = apply_offsets(offsets,train_preds)
train_kappa = quadratic_weighted_kappa(ytrain,train_preds)
test_preds = apply_offsets(offsets,test_preds)
test_kappa = quadratic_weighted_kappa(ytest,test_preds)
row = ['xgb reg + offset', train_kappa, test_kappa ]
df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates()
df_eval
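If you want to rerun the optimizer instead of reusing the hard-coded cut points, note that apply_offsets above just overwrote train_preds with class labels, while the objective function reads the raw regression output from that global. A minimal sketch:

# restore the raw (continuous) predictions before re-optimizing, since
# get_offsets_minimizing_train_preds_kappa reads the global train_preds
train_preds = bst.predict(dtrain)
# offsets = fmin_powell(get_offsets_minimizing_train_preds_kappa, x0, disp=True)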
objective = "count:poisson"
Who buys life insurance?
Poisson regression is used with COUNT data. And COUNT data describes frequencies of occurrence of a given event/element. In case of this competition, as I understand it, "Response" column should be treated more like a categorical one. Or, if we treat it as a numerical one, some generalized linear models (regressions in general) seem to be more intuitive than Poisson distribution.
One of Poisson distribution's assumption is that the mean and variance of the distribution are equal. If we take a look at the mean and variance for "Response" in train set, they are NOT close. Its counter intutive but still works good.
ytrain.mean(), ytrain.var()  # Poisson assumes mean == variance
params_dict = {'objective': 'count:poisson',
               'eta': 0.05,
               'min_child_weight': 240,
               'subsample': 0.9,
               'colsample_bytree': 0.67,
               'max_depth': 6
               }
xgb_num_rounds = 800
%%time
bst = xgb.train(params_dict, dtrain, xgb_num_rounds)
# model evaluation for the regressor (no early stopping, so use all trees)
train_preds = bst.predict(dtrain)
test_preds = bst.predict(dtest)
train_kappa = eval_wrapper(ytrain,train_preds)
test_kappa = eval_wrapper(ytest, test_preds)
row = ['xgb poisson', train_kappa, test_kappa ]
df_eval.loc[len(df_eval)] = row
# model evaluation with offsets (these cut points were tuned on the
# squared-error model's train predictions, not the Poisson model's)
train_preds = apply_offsets(offsets,train_preds)
train_kappa = eval_wrapper(ytrain,train_preds)
test_preds = apply_offsets(offsets,test_preds)
test_kappa = eval_wrapper(ytest,test_preds)
row = ['xgb poisson + offset', train_kappa, test_kappa ]
df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates()
df_eval
A voting regressor is an ensemble meta-estimator that fits several base regressors, each on the whole dataset. Then it averages the individual predictions to form a final prediction.
from sklearn.ensemble import VotingRegressor
params = {'learning_rate': 0.05,  # sklearn-API alias of eta
          'n_estimators': 800,    # the sklearn API takes the number of boosting rounds here
          'min_child_weight': 240,
          'subsample': 0.9,
          'colsample_bytree': 0.67,
          'max_depth': 6,
          'random_state': SEED
          }

model_xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', **params)
model_xgb_psn = xgb.XGBRegressor(objective='count:poisson', **params)

estimators = [('model1', model_xgb_reg),
              ('model2', model_xgb_psn)]
ensemble = VotingRegressor(estimators=estimators,n_jobs=-1,weights=[1,1])
ensemble.fit(df_Xtrain, ser_ytrain)
# model evaluation for regressor
train_preds = ensemble.predict(df_Xtrain)
test_preds = ensemble.predict(df_Xtest)
train_kappa = eval_wrapper(ytrain,train_preds)
test_kappa = eval_wrapper(ytest, test_preds)
row = ['ensemble', train_kappa, test_kappa ]
df_eval.loc[len(df_eval)] = row
# model evaluation with offsets (reusing the squared-error cut points)
train_preds = apply_offsets(offsets,train_preds)
train_kappa = eval_wrapper(ytrain,train_preds)
test_preds = apply_offsets(offsets,test_preds)
test_kappa = eval_wrapper(ytest,test_preds)
row = ['ensemble + offset', train_kappa, test_kappa ]
df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates()
df_eval.style.background_gradient(subset=['TestKappa'])
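As a sanity check on the averaging claim above: with equal weights, the ensemble's prediction is just the mean of the two base models' predictions (a small check using scikit-learn's fitted estimators_ attribute):

# with weights=[1, 1], VotingRegressor.predict is the plain mean of the base preds
import numpy as np
p1 = ensemble.estimators_[0].predict(df_Xtest)
p2 = ensemble.estimators_[1].predict(df_Xtest)
print(np.allclose(ensemble.predict(df_Xtest), (p1 + p2) / 2))  # expect True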
time_taken = time.time() - time_start_notebook
h, rem = divmod(time_taken, 60*60)
m, s = divmod(rem, 60)
print('Time taken to run whole notebook: '
      '{:.0f} hr {:.0f} min {:.0f} secs'.format(h, m, s))