Description

In this project we use multiclass classification to predict one of the 8 possible values of Response.

The data is taken from the Kaggle Prudential Life Insurance project.

Only about 40% of households in the USA have a life insurance policy. Based on different attributes of an applicant, one of 8 different quotes is granted.

Here category 8 has the highest count; I assume it is the quote that is granted.

Records: ~60k (59,381)
Features: 127
Target: Response (8 ordinal categories, 1-8)

Features:

1 Misc             : Ins_Age, Ht, Wt, BMI        4
2 Product Info     : Product_Info_1 to 7         7
3 Employment Info  : Employment_Info_1 to 6      6
4 Insured Info     : InsuredInfo_1 to 7          7
5 Insurance History: Insurance_History_1 to 9    9
6 Family History   : Family_Hist_1 to 5          5
7 Medical History  : Medical_History_1 to 41    41
8 Medical Keywords : Medical_Keyword_1 to 48    48
Target: Response                                 1
ID    : Id                                       1
---------------------------------------------------
Total Features: 127
Dependent Variable: 1 (Response)

Methods Used:

  1. Linear Regression (not LogisticRegression): fit a linear regression, round the continuous predictions to the nearest integer label, and evaluate the weighted quadratic kappa (a sketch follows this list).
  2. Various sklearn classifiers.
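
A minimal sketch of method 1 on toy data (illustrative only; in this notebook the binning is actually done later with np.digitize inside the custom metric):

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score

# toy data: 100 samples, 3 features, ordinal labels 1-8 (not the real dataset)
rng = np.random.RandomState(100)
X = rng.rand(100, 3)
y = rng.randint(1, 9, size=100)

reg = LinearRegression().fit(X, y)
y_hat = np.clip(np.rint(reg.predict(X)), 1, 8).astype(int) # nearest label in 1..8
print(cohen_kappa_score(y, y_hat, weights='quadratic'))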

Imports

In [1]:
%%capture
# %%capture suppresses this cell's output in the notebook

import os
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:

    !pip install watermark
    !pip install -U lrcurve

    print('Environment: Google Colaboratory.')

# NOTE: If we update modules in Google Colab, we need to restart the runtime.
In [2]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
import os
import json
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot') 
# random state
SEED=100
time_start_notebook = time.time()
In [3]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from mlxtend.feature_selection import ColumnSelector
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
In [4]:
%load_ext watermark
%watermark -a "Bhishan Poudel" -dvm
%watermark -iv
Bhishan Poudel 2020-06-22 

CPython 3.7.7
IPython 7.13.0

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.5.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit
numpy    1.18.4
pandas   1.0.3
autopep8 1.5.2
seaborn  0.10.1
json     2.0.9

Notes

  • metrics.plot_roc_curve only works for a classifier; it does not work for a pipeline object.
  • For linear regression, normalizing the data is important, unlike for classification (a scaling-pipeline sketch follows).
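
Pipeline and StandardScaler are already imported above for this purpose; a minimal scaling pipeline (a sketch, not fitted anywhere in this notebook) would look like:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# scale the features, then fit the regression, as one estimator
pipe = Pipeline([('scaler', StandardScaler()),
                 ('reg', LinearRegression())])
# pipe.fit(df_Xtrain, ytrain) # assumes the train split created below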

Load the data

In [5]:
def data_cleaning():
    # load the zipped csv directly from GitHub
    df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/Prudential_Insurance/raw/train.csv.zip?raw=true',compression='zip')

    # drop the id column and two mostly-missing medical-history columns
    columns_to_drop = ['Id', 'Medical_History_10','Medical_History_24']
    df = df.drop(columns_to_drop,axis=1)

    # split Product_Info_2 (e.g. 'D3') into its letter and digit parts
    df['Product_Info_2_char'] = df.Product_Info_2.str[0]
    df['Product_Info_2_num'] = df.Product_Info_2.str[1]

    # factorize categorical variables (map each category to an integer code)
    df['Product_Info_2'] = pd.factorize(df['Product_Info_2'])[0]
    df['Product_Info_2_char'] = pd.factorize(df['Product_Info_2_char'])[0]
    df['Product_Info_2_num'] = pd.factorize(df['Product_Info_2_num'])[0]

    # interaction feature
    df['BMI_Age'] = df['BMI'] * df['Ins_Age']

    # count how many medical keyword flags are set per applicant
    med_keyword_columns = df.columns[df.columns.str.startswith('Medical_Keyword_')]
    df['Med_Keywords_Count'] = df[med_keyword_columns].sum(axis=1)

    # fill remaining missing values with the sentinel -1
    df = df.fillna(-1)

    return df

df = data_cleaning()
print(df.shape)
df.isna().sum().sum(), df.sum().sum()
(59381, 129)
Out[5]:
(0, 26897356.818315115)
In [6]:
# make sure all values are numeric and there are no NaNs
df.sum().sum(), df.isna().sum().sum()
Out[6]:
(26897356.818315115, 0)
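
A complementary check (a small sketch) confirms that no object-dtype columns survived the cleaning:

# after factorizing and fillna(-1), every column should be numeric
print(df.select_dtypes(include='object').columns.tolist()) # expect []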

Train-test Split with Stratify

In [7]:
from sklearn.model_selection import train_test_split

target = 'Response'

df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = train_test_split(
    df.drop(target,axis=1), df[target],
    test_size=0.2, random_state=SEED, stratify=df[target])

ytrain = ser_ytrain.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()

print(f"df             : {df.shape}")

print(f"\ndf_Xtrain      : {df_Xtrain.shape}")
print(f"ser_ytrain     : {ser_ytrain.shape}")

print(f"\ndf_Xtest       : {df_Xtest.shape}")
print(f"ser_ytest      : {ser_ytest.shape}")

df_Xtrain.head(2)
df             : (59381, 129)

df_Xtrain      : (47504, 128)
ser_ytrain     : (47504,)

df_Xtest       : (11877, 128)
ser_ytest      : (11877,)
Out[7]:
Product_Info_1 Product_Info_2 Product_Info_3 Product_Info_4 Product_Info_5 Product_Info_6 Product_Info_7 Ins_Age Ht Wt ... Medical_Keyword_43 Medical_Keyword_44 Medical_Keyword_45 Medical_Keyword_46 Medical_Keyword_47 Medical_Keyword_48 Product_Info_2_char Product_Info_2_num BMI_Age Med_Keywords_Count
616 1 10 26 0.230769 2 3 1 0.059701 0.727273 0.225941 ... 0 0 0 0 0 0 1 0 0.020413 0
3239 1 0 26 0.230769 2 3 1 0.417910 0.654545 0.209205 ... 0 0 0 0 0 0 0 0 0.157493 2

2 rows × 128 columns
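
Since the split uses stratify=df[target], the class proportions in train and test should match almost exactly; a quick check (sketch):

# compare normalized class frequencies between the two splits
print(ser_ytrain.value_counts(normalize=True).sort_index().round(3))
print(ser_ytest.value_counts(normalize=True).sort_index().round(3))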

Custom Evaluation Metric

In [8]:
def weighted_quadratic_kappa_score(y_true,y_probs):
    # bin raw (possibly continuous) predictions into integer labels 1-8
    y_preds = np.digitize(y_probs,range(1,9))
    score = metrics.cohen_kappa_score(y_true,y_preds,weights='quadratic')
    return score

weighted_quadratic_kappa_scorer = make_scorer(
    weighted_quadratic_kappa_score, greater_is_better=True)
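
A quick sanity check of the scorer on hand-made values (illustrative; np.digitize bins each raw prediction into an integer label):

y_true = np.array([1, 2, 3, 8])
y_raw  = np.array([1.2, 2.9, 3.0, 7.8]) # e.g. raw regression outputs
print(np.digitize(y_raw, range(1, 9)))  # -> [1 2 3 7]
print(weighted_quadratic_kappa_score(y_true, y_raw))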

Cross validation for Logistic Regression

In [9]:
from sklearn.model_selection import validation_curve
from sklearn.linear_model import LogisticRegression
In [10]:
clf = LogisticRegression(C=1.0,n_jobs=-1,random_state=SEED)
clf
Out[10]:
LogisticRegression(n_jobs=-1, random_state=100)
In [11]:
%%time
skf = StratifiedKFold(n_splits=10, random_state=SEED, shuffle=True)
train_scores, valid_scores = validation_curve(
    clf, df_Xtrain, ytrain,
    param_name="C", param_range=np.logspace(-7, 1, 5), # C = 1e-7, 1e-5, ..., 1e1
    cv=skf, n_jobs=-1,
    scoring=weighted_quadratic_kappa_scorer)
CPU times: user 360 ms, sys: 97.3 ms, total: 457 ms
Wall time: 1min 24s
In [12]:
kappa = train_scores.mean() # average over all 5 C values and 10 folds
print(f'mean kappa for training: {kappa}')
mean kappa for training: 0.1407235231062385
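
validation_curve returns one row of fold scores per C value, so averaging per row (a sketch) separates the effect of C:

# mean kappa per C: rows of the score arrays follow param_range order
for C, tr, va in zip(np.logspace(-7, 1, 5),
                     train_scores.mean(axis=1),
                     valid_scores.mean(axis=1)):
    print(f'C={C:g}: train={tr:.4f}, valid={va:.4f}')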

Validation Curve Animation Using lrcurve

In [13]:
# per-fold kappa scores for the last (largest) C value in param_range
train_loss = train_scores[-1]
valid_loss = valid_scores[-1]
train_loss,valid_loss
Out[13]:
(array([0.1918003 , 0.227604  , 0.20403805, 0.23129364, 0.19880397,
        0.20993251, 0.23962707, 0.22444946, 0.22108122, 0.22851729]),
 array([0.17093059, 0.23216766, 0.20568543, 0.24062541, 0.2014586 ,
        0.18975827, 0.22966337, 0.24426233, 0.22984852, 0.2152856 ]))
In [14]:
try:
    from lrcurve import PlotLearningCurve

    with PlotLearningCurve() as plot:
        for i in range(len(train_loss)):
            plot.append(i, {
                'loss': train_loss[i],
                'val_loss': valid_loss[i]
            })
            plot.draw()
            time.sleep(0.5)
except ImportError:
    pass # lrcurve (or a dependency such as tensorflow) is not installed
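
If lrcurve is not installed, the same per-fold comparison can be drawn statically with matplotlib (a sketch using the arrays above):

# static version: train vs validation kappa for the largest C, per fold
plt.plot(train_loss, label='train kappa')
plt.plot(valid_loss, label='validation kappa')
plt.xlabel('CV fold')
plt.ylabel('weighted quadratic kappa')
plt.legend()
plt.show()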

Scikit-learn Classifiers

In [15]:
# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
In [16]:
model_names = []
cv_scores_mean = []
cv_scores_std = []

Logistic Regression

In [17]:
clf_lr = LogisticRegression(solver='lbfgs',
                            max_iter=1000,
                            random_state=SEED,
                            verbose=0,
                            n_jobs=-1) # n_jobs=-1 uses all cores; it has no effect with solver='liblinear'

skf = StratifiedKFold(n_splits=3, random_state=SEED,shuffle=True)

t0 = time.time()
clf = clf_lr
clf.fit(df_Xtrain, ytrain) # fit once on the full training set (cross_val_score refits per fold)
score = cross_val_score(clf, df_Xtrain, ytrain, cv=skf,n_jobs=-1,verbose=0,
                            scoring=weighted_quadratic_kappa_scorer)
score_mean = score.mean()
score_std = score.std()

model_names.append('Logistic Regression, max_iter=1000')
cv_scores_mean.append(score_mean)
cv_scores_std.append(score_std)
print(f'score mean = {score_mean}')
print(f'score std = {score_std}')

t1 = (time.time() - t0)
print('Time taken: {} minutes {:.2f} seconds'.format(*divmod(t1,60)))
score mean = 0.40453747272195306
score std = 0.007344148538625899
Time taken: 1.0 minutes 43.00 seconds

Random Forest

In [18]:
clf_rf = RandomForestClassifier(n_estimators=100,verbose=0,
                                random_state=SEED,n_jobs=-1)

skf = StratifiedKFold(n_splits=3, random_state=SEED,shuffle=True)

t0 = time.time()
clf_rf.fit(df_Xtrain, ytrain)
score = cross_val_score(clf_rf, df_Xtrain, ytrain, cv=skf,n_jobs=-1,verbose=1,
                        scoring=weighted_quadratic_kappa_scorer)
score_mean = score.mean()
score_std = score.std()
model_names.append('Random Forest, n_estimators=100')
cv_scores_mean.append(score_mean)
cv_scores_std.append(score_std)

print(f'score mean = {score_mean}')
print(f'score std = {score_std}')

t1 = (time.time() - t0)
print('Time taken: {} minutes {:.2f} seconds'.format(*divmod(t1,60)))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
score mean = 0.5077497413959793
score std = 0.005567258127862545
Time taken: 0.0 minutes 22.68 seconds
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   15.9s finished
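
The logistic-regression and random-forest cells above repeat the same fit/cross-validate/bookkeeping pattern; a small helper (a sketch using the names defined above) would remove the duplication for any further classifiers:

def evaluate_clf(clf, name):
    """Cross-validate clf with the kappa scorer and record the result."""
    t0 = time.time()
    scores = cross_val_score(clf, df_Xtrain, ytrain, cv=skf, n_jobs=-1,
                             scoring=weighted_quadratic_kappa_scorer)
    model_names.append(name)
    cv_scores_mean.append(scores.mean())
    cv_scores_std.append(scores.std())
    print(f'{name}: kappa = {scores.mean():.4f} +/- {scores.std():.4f} '
          f'({time.time()-t0:.1f} s)')

# hypothetical usage: evaluate_clf(KNeighborsClassifier(), 'KNN')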

Model Evaluation

Kappa

In [19]:
%%time
ypreds = clf_rf.predict(df_Xtest)
print(ypreds[:5])

score = metrics.cohen_kappa_score(ytest,ypreds,weights='quadratic')
print(f'Weighted quadratic kappa = {score}')
[2 2 8 6 7]
Weighted quadratic kappa = 0.5051797076629455
CPU times: user 721 ms, sys: 23.5 ms, total: 745 ms
Wall time: 450 ms

Confusion Matrix

In [20]:
cm = metrics.confusion_matrix(ytest,ypreds)
print(cm)
[[ 237  161   10   22  102  307  107  295]
 [ 144  296    7   17  161  317   95  273]
 [  20   19   30   38   21   58    3   14]
 [  16    2    4  131    0   81    5   47]
 [  53   91    1    0  568  226   48   99]
 [  94   77    0    4   86 1241  239  506]
 [  39    8    0    1   12  411  549  586]
 [  14    6    0    2    7  231  125 3513]]
In [21]:
# rows: true label (1-8); columns: predicted label (1-8)
df_cm = pd.DataFrame(cm, index=range(1,9),columns=range(1,9))
df_cm.style.background_gradient().set_caption('Predicted Label')
Out[21]:
Predicted Label
1 2 3 4 5 6 7 8
1 237 161 10 22 102 307 107 295
2 144 296 7 17 161 317 95 273
3 20 19 30 38 21 58 3 14
4 16 2 4 131 0 81 5 47
5 53 91 1 0 568 226 48 99
6 94 77 0 4 86 1241 239 506
7 39 8 0 1 12 411 549 586
8 14 6 0 2 7 231 125 3513
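
Row-normalizing the same matrix makes per-class recall easier to read (a sketch; df_cm is defined above):

# each row sums to 1; the diagonal entries are per-class recall
df_cm_norm = df_cm.div(df_cm.sum(axis=1), axis=0).round(2)
df_cm_norm.style.background_gradient().set_caption('Row-normalized (recall on diagonal)')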
In [22]:
disp = metrics.plot_confusion_matrix(clf_rf, df_Xtest, ytest,
                display_labels=[f'Class {i+1}' for i in range(8)],
                values_format='d',
                cmap=plt.cm.Blues)
disp.ax_.set_title('Confusion Matrix')
plt.show()

Classification Report

In [23]:
from sklearn.metrics import classification_report

target_names = [f'class {i+1}' for i in range(8)]
print(classification_report(ytest, ypreds, target_names=target_names))
              precision    recall  f1-score   support

     class 1       0.38      0.19      0.26      1241
     class 2       0.45      0.23      0.30      1310
     class 3       0.58      0.15      0.24       203
     class 4       0.61      0.46      0.52       286
     class 5       0.59      0.52      0.56      1086
     class 6       0.43      0.55      0.48      2247
     class 7       0.47      0.34      0.40      1606
     class 8       0.66      0.90      0.76      3898

    accuracy                           0.55     11877
   macro avg       0.52      0.42      0.44     11877
weighted avg       0.53      0.55      0.52     11877
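
The cross-validation results collected in model_names, cv_scores_mean, and cv_scores_std are never displayed above; a short summary table (a sketch) would be:

df_results = pd.DataFrame({'model': model_names,
                           'cv_kappa_mean': cv_scores_mean,
                           'cv_kappa_std': cv_scores_std})
print(df_results.sort_values('cv_kappa_mean', ascending=False))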

Time Taken

In [24]:
time_taken = time.time() - time_start_notebook
h, rem = divmod(time_taken, 60*60) # hours and leftover seconds
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, *divmod(rem, 60)))
Time taken to run whole notebook: 0 hr 3 min 36 secs