In this project we predict one of the 8 possible values of Response (an ordinal multiclass target).
The data comes from the Kaggle Prudential Life Insurance Assessment competition.
Only about 40% of households in the USA have a life insurance policy. Based on an applicant's attributes, one of 8 different quotes is granted.
Category 8 has the highest count (see the quick check after the feature table below); I assume it is the quote that is granted most often.
Records: 60k
Features: 127
Target: Response (has 8 categories, 1-8)
Features:
1. Misc              : Ins_Age, Ht, Wt, BMI                        (4)
2. Product Info      : Product_Info_1 to Product_Info_7            (7)
3. Employment Info   : Employment_Info_1 to Employment_Info_6      (6)
4. Insured Info      : InsuredInfo_1 to InsuredInfo_7              (7)
5. Insurance History : Insurance_History_1 to Insurance_History_9  (9)
6. Family History    : Family_Hist_1 to Family_Hist_5              (5)
7. Medical History   : Medical_History_1 to Medical_History_41     (41)
8. Medical Keywords  : Medical_Keyword_1 to Medical_Keyword_48     (48)
---------------------------------------------------
Total Features: 127
Dependent Variable: Response (1 column)
Id Column: Id (1 column, dropped during cleaning)
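A quick look at the target distribution (reading the same zipped train.csv that data_cleaning() loads below) backs up the claim that category 8 is the most common:

# Quick check of the class distribution; assumes the same train.csv
# that data_cleaning() uses later in this notebook.
import pandas as pd
url = ('https://github.com/bhishanpdl/Datasets/blob/master/'
       'Prudential_Insurance/raw/train.csv.zip?raw=true')
pd.read_csv(url, compression='zip')['Response'].value_counts()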
Method Used: XGBoost regression (objectives reg:squarederror and count:poisson), offset optimization of the class cut points with scipy's fmin_powell, and a VotingRegressor ensemble of the two models.
Metric Used: quadratic weighted kappa (Cohen's kappa with quadratic weights).
Notes about offset
In this project the evaluation metric is quadratic weighted kappa. But when we fit a regression with XGBoost, the loss function is squared error (MSE). Predictions obtained by minimizing MSE are not necessarily optimal for kappa.
For an ordinal ranking metric such as kappa, we assume there is a set of cutoff offsets under which the continuous MSE predictions map to better class labels than naive rounding. For example, a prediction of 1.6 from the regressor rounds to class 2. But if we add an offset of 1, then 1.6 + 1 = 2.6, which rounds to class 3. By shifting the class boundaries this way we may achieve a better kappa.
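As a toy illustration (the cut points here are made up, not the optimized ones used later), naive rounding and offset binning can assign the same raw prediction to different classes:

import numpy as np

preds = np.array([1.6, 3.4, 7.2])           # hypothetical raw regression outputs
print(np.clip(np.round(preds), 1, 8))       # naive rounding -> [2. 3. 7.]

cuts = [1.5, 2.5, 3.1, 4.5, 5.5, 6.5, 7.0]  # 7 made-up cut points for 8 classes
print(np.digitize(preds, cuts) + 1)         # offset binning  -> [2 4 8]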
import time
import numpy as np
import pandas as pd
import seaborn as sns
import os
import json
from tqdm.notebook import tqdm  # tqdm_notebook is deprecated in newer tqdm
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot')
SEED=100
home = os.path.expanduser('~')
time_start_notebook = time.time()
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from scipy.optimize import fmin_powell
import xgboost as xgb
xgb.__version__
def data_cleaning():
    # load the raw training data from the Datasets repo (zipped csv)
    df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/Prudential_Insurance/raw/train.csv.zip?raw=true',
                     compression='zip')

    # drop the id column and two mostly-missing medical history columns
    columns_to_drop = ['Id', 'Medical_History_10', 'Medical_History_24']
    df = df.drop(columns_to_drop, axis=1)

    # split Product_Info_2 (e.g. 'D3') into its letter and digit parts
    df['Product_Info_2_char'] = df.Product_Info_2.str[0]
    df['Product_Info_2_num'] = df.Product_Info_2.str[1]

    # factorize categorical variables
    df['Product_Info_2'] = pd.factorize(df['Product_Info_2'])[0]
    df['Product_Info_2_char'] = pd.factorize(df['Product_Info_2_char'])[0]
    df['Product_Info_2_num'] = pd.factorize(df['Product_Info_2_num'])[0]

    # interaction feature and count of medical keyword flags
    df['BMI_Age'] = df['BMI'] * df['Ins_Age']
    med_keyword_columns = df.columns[df.columns.str.startswith('Medical_Keyword_')]
    df['Med_Keywords_Count'] = df[med_keyword_columns].sum(axis=1)

    # fill remaining missing values with a sentinel
    df = df.fillna(-1)
    return df
df = data_cleaning()
print(df.shape)
df.isna().sum().sum(), df.sum().sum()  # no missing values left; crude checksum of the data
target = 'Response'
df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = train_test_split(
    df.drop(target, axis=1), df[target],
    test_size=0.2, random_state=SEED, stratify=df[target])
ytrain = ser_ytrain.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()
dtrain = xgb.DMatrix(df_Xtrain, label=ser_ytrain)
dtest = xgb.DMatrix(df_Xtest, label=ser_ytest)
def eval_wrapper(y, yhat):
    # Cohen's kappa is symmetric: swapping y and yhat gives the same result.
    y = np.array(y).astype(int)
    yhat = np.array(yhat)
    # round the continuous predictions and clip them to the valid label range
    yhat = np.clip(np.round(yhat), np.min(y), np.max(y)).astype(int)
    return metrics.cohen_kappa_score(y, yhat, weights='quadratic')
def apply_offsets(offsets, preds):
    # map a continuous prediction y to a class 1-8 using 7 cut points:
    # y < x1 -> 1, x1 <= y < x2 -> 2, ..., y >= x7 -> 8
    x1, x2, x3, x4, x5, x6, x7 = offsets
    res = []
    for y in list(preds):
        if y < x1:
            res.append(1)
        elif y < x2:
            res.append(2)
        elif y < x3:
            res.append(3)
        elif y < x4:
            res.append(4)
        elif y < x5:
            res.append(5)
        elif y < x6:
            res.append(6)
        elif y < x7:
            res.append(7)
        else:
            res.append(8)
    return res

def digitize_train(guess_lst, train_preds):
    # identical binning logic; kept as a separate name for the optimizer below
    return apply_offsets(list(guess_lst), train_preds)

def quadratic_weighted_kappa(y_true, y_pred):
    return metrics.cohen_kappa_score(y_true, y_pred, weights='quadratic')

def get_offsets_minimizing_train_preds_kappa(guess_lst):
    # objective for fmin_powell: negative kappa of the digitized train preds
    # (reads the global train_preds, which must hold raw regression output)
    res = digitize_train(guess_lst, train_preds)
    return -quadratic_weighted_kappa(ytrain, res)
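A quick sanity check of the metric on toy labels: identical labels give kappa = 1, and because the weights are quadratic, near misses cost far less than distant ones (the labels below are illustrative, not from the model):

# toy labels, purely illustrative
y_toy = [1, 2, 3, 8]
print(quadratic_weighted_kappa(y_toy, y_toy))         # 1.0: perfect agreement
print(quadratic_weighted_kappa(y_toy, [1, 2, 3, 7]))  # one near miss -> still high
print(quadratic_weighted_kappa(y_toy, [8, 8, 8, 1]))  # far misses -> very low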
df_eval = pd.DataFrame({'Model': [],
                        'TrainKappa': [],
                        'TestKappa': []})
params_dict = {'objective': 'reg:squarederror',
               'eta': 0.05,
               'min_child_weight': 240,
               'subsample': 0.9,
               'colsample_bytree': 0.67,
               'max_depth': 6
               }
xgb_num_rounds = 800
%%time
bst = xgb.train(params_dict, dtrain, xgb_num_rounds)
# get preds (no early stopping was used, so predict with all trees;
# best_iteration/ntree_limit only apply when early stopping is on)
train_preds = bst.predict(dtrain)
test_preds = bst.predict(dtest)
train_kappa = eval_wrapper(ytrain,train_preds)
test_kappa = eval_wrapper(ytest, test_preds)
row = ['xgb reg', train_kappa, test_kappa ]
df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates()
df_eval
%%time
"""
Here, we already have train predictions.
For these train predictions, if we compare them with original train labels,
we get some kappa value. But we want to change the train predictions such
that when comparing this changed train prediction with original train labels
we get better kappa.
For that we use scipy function "fmin_powell". The function needs some initial
guess so that it can give better offset next time. The default guess is 0.5.
For 8 classes (1-8) we can start with (1.5,2.5,...,8.5) then use the result
and run the function again.
""";
x0 = (1.5, 2.9, 3.1, 4.5, 5.5, 6.1, 7.1)  # initial guess
# offsets = fmin_powell(get_offsets_minimizing_train_preds_kappa, x0, disp=True)
# hard-coded result of a previous fmin_powell run, to save time:
offsets = [3.117688855474597, 3.574261600706765, 4.347222327043992,
           4.919148133534166, 5.529077199779955, 6.1623013715330766,
           6.826617448466462]
print(list(offsets))
train_preds = apply_offsets(offsets,train_preds)
train_kappa = quadratic_weighted_kappa(ytrain,train_preds)
test_preds = apply_offsets(offsets,test_preds)
test_kappa = quadratic_weighted_kappa(ytest,test_preds)
row = ['xgb reg + offset', train_kappa, test_kappa ]
df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates()
df_eval
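If you want to rerun the optimizer instead of reusing the hard-coded cut points, note that apply_offsets above just overwrote train_preds with class labels, while the objective function reads the raw regression output from that global. A minimal sketch:

# restore the raw (continuous) predictions before re-optimizing, since
# get_offsets_minimizing_train_preds_kappa reads the global train_preds
train_preds = bst.predict(dtrain)
# offsets = fmin_powell(get_offsets_minimizing_train_preds_kappa, x0, disp=True)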
objective = "count:poisson"
Who buys life insurance?
Poisson regression is used with COUNT data. And COUNT data describes frequencies of occurrence of a given event/element. In case of this competition, as I understand it, "Response" column should be treated more like a categorical one. Or, if we treat it as a numerical one, some generalized linear models (regressions in general) seem to be more intuitive than Poisson distribution.
One of Poisson distribution's assumption is that the mean and variance of the distribution are equal. If we take a look at the mean and variance for "Response" in train set, they are NOT close. Its counter intutive but still works good.
ytrain.mean(), ytrain.var()  # Poisson assumes mean == variance
params_dict = {'objective': 'count:poisson',
               'eta': 0.05,
               'min_child_weight': 240,
               'subsample': 0.9,
               'colsample_bytree': 0.67,
               'max_depth': 6
               }
xgb_num_rounds = 800
%%time
bst = xgb.train(params_dict, dtrain, xgb_num_rounds)
# model evaluation for the regressor (no early stopping, so use all trees)
train_preds = bst.predict(dtrain)
test_preds = bst.predict(dtest)
train_kappa = eval_wrapper(ytrain,train_preds)
test_kappa = eval_wrapper(ytest, test_preds)
row = ['xgb poisson', train_kappa, test_kappa ]
df_eval.loc[len(df_eval)] = row
# model evaluation with offsets (these cut points were tuned on the
# squared-error model's train predictions, not the Poisson model's)
train_preds = apply_offsets(offsets,train_preds)
train_kappa = eval_wrapper(ytrain,train_preds)
test_preds = apply_offsets(offsets,test_preds)
test_kappa = eval_wrapper(ytest,test_preds)
row = ['xgb poisson + offset', train_kappa, test_kappa ]
df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates()
df_eval
A voting regressor is an ensemble meta-estimator that fits several base regressors, each on the whole dataset. Then it averages the individual predictions to form a final prediction.
from sklearn.ensemble import VotingRegressor
params = {'learning_rate': 0.05,  # sklearn-API alias of eta
          'n_estimators': 800,    # the sklearn API takes the number of boosting rounds here
          'min_child_weight': 240,
          'subsample': 0.9,
          'colsample_bytree': 0.67,
          'max_depth': 6,
          'random_state': SEED
          }

model_xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', **params)
model_xgb_psn = xgb.XGBRegressor(objective='count:poisson', **params)

estimators = [('model1', model_xgb_reg),
              ('model2', model_xgb_psn)]
ensemble = VotingRegressor(estimators=estimators,n_jobs=-1,weights=[1,1])
ensemble.fit(df_Xtrain, ser_ytrain)
# model evaluation for regressor
train_preds = ensemble.predict(df_Xtrain)
test_preds = ensemble.predict(df_Xtest)
train_kappa = eval_wrapper(ytrain,train_preds)
test_kappa = eval_wrapper(ytest, test_preds)
row = ['ensemble', train_kappa, test_kappa ]
df_eval.loc[len(df_eval)] = row
# model evaluation with offsets (reusing the squared-error cut points)
train_preds = apply_offsets(offsets,train_preds)
train_kappa = eval_wrapper(ytrain,train_preds)
test_preds = apply_offsets(offsets,test_preds)
test_kappa = eval_wrapper(ytest,test_preds)
row = ['ensemble + offset', train_kappa, test_kappa ]
df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates()
df_eval.style.background_gradient(subset=['TestKappa'])
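As a sanity check on the averaging claim above: with equal weights, the ensemble's prediction is just the mean of the two base models' predictions (a small check using scikit-learn's fitted estimators_ attribute):

# with weights=[1, 1], VotingRegressor.predict is the plain mean of the base preds
import numpy as np
p1 = ensemble.estimators_[0].predict(df_Xtest)
p2 = ensemble.estimators_[1].predict(df_Xtest)
print(np.allclose(ensemble.predict(df_Xtest), (p1 + p2) / 2))  # expect True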
time_taken = time.time() - time_start_notebook
h, rem = divmod(time_taken, 60*60)
m, s = divmod(rem, 60)
print('Time taken to run whole notebook: '
      '{:.0f} hr {:.0f} min {:.0f} secs'.format(h, m, s))