In this project we will use multiclass classification to predict one of the 8 possible values of Response.
The data is taken from Kaggle Prudential Life Insurance Project.
Only about 40% of households in the USA have a life insurance policy. Based on the different attributes of an applicant, one of 8 different quotes is granted to the applicant.
Here category 8 has the highest count; I assume it is the quote that is granted.
Records: 60k
Features: 127
Target: Response (has 8 categories, 1-8)
Features:
1 Misc : Age ht wt bmi 4
2 Product Info : Product_Info_1 to 7 7
3 Employment Info : Employment_Info_1 to 6 6
4 Insured Info : InsuredInfo_1 to 7 7
5 Insurance History: Insurance_History_1 to 9 9
6 Family History : Family_Hist_1 to 5 5
7 Medical History : Medical_History_1 to 41 41
8 Medical Keywords : Medical_Keyword_1 to 48 48
Target: Response 1
ID : ID 1
---------------------------------------------------
Total Features: 127
Dependent Variable: 1 (Response)
Method Used:
%%capture
# capture will not print in notebook
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
!pip install watermark
!pip install -U lrcurve
#### print
print('Environment: Google Colaboratory.')
# NOTE: If we update modules in gcolab, we need to restart runtime.
import time
import numpy as np
import pandas as pd
import seaborn as sns
import os
import json
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot')
# random state
SEED=100
time_start_notebook = time.time()
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from mlxtend.feature_selection import ColumnSelector
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
%load_ext watermark
%watermark -a "Bhishan Poudel" -dvm
%watermark -iv
# NOTE: metrics.plot_roc_curve only works for a classifier; it does not work
# for a pipeline object.
def data_cleaning(
        url='https://github.com/bhishanpdl/Datasets/blob/master/Prudential_Insurance/raw/train.csv.zip?raw=true',
        compression='zip'):
    """Load the Prudential training data and return a fully numeric frame.

    Parameters
    ----------
    url : str
        Path or URL of the CSV file. Defaults to the zipped training set
        hosted on GitHub, so calling ``data_cleaning()`` with no arguments
        behaves as before.
    compression : str
        Compression passed to ``pd.read_csv``. It is given explicitly because
        the default URL ends in ``?raw=true``, so it cannot be inferred from
        the file extension.

    Returns
    -------
    pandas.DataFrame
        The input data with:
        * ``Id``, ``Medical_History_10`` and ``Medical_History_24`` dropped,
        * ``Product_Info_2`` split into its first and second character and
          all three columns integer-encoded with ``pd.factorize``,
        * ``BMI_Age`` = ``BMI * Ins_Age``,
        * ``Med_Keywords_Count`` = row-wise sum of ``Medical_Keyword_*``,
        * every remaining NaN replaced by -1.

    Raises
    ------
    KeyError
        If one of the columns to drop is missing from the file.
    """
    df = pd.read_csv(url, compression=compression)

    # NOTE(review): the reason for dropping the two Medical_History columns is
    # not visible here (presumably they are mostly missing) -- confirm.
    columns_to_drop = ['Id', 'Medical_History_10', 'Medical_History_24']
    df = df.drop(columns_to_drop, axis=1)

    # Product_Info_2 is a two-character code; keep each character as its own
    # feature in addition to the full code.
    df['Product_Info_2_char'] = df.Product_Info_2.str[0]
    df['Product_Info_2_num'] = df.Product_Info_2.str[1]

    # factorize categorical variables (codes follow order of first appearance)
    for col in ('Product_Info_2', 'Product_Info_2_char', 'Product_Info_2_num'):
        df[col] = pd.factorize(df[col])[0]

    df['BMI_Age'] = df['BMI'] * df['Ins_Age']

    med_keyword_columns = df.columns[df.columns.str.startswith('Medical_Keyword_')]
    df['Med_Keywords_Count'] = df[med_keyword_columns].sum(axis=1)

    # Fill missing values last so the engineered features above are computed
    # from the raw values rather than from the -1 placeholder.
    df = df.fillna(-1)
    return df
# Load and clean the data, then hold out a stratified 20% test split.
df = data_cleaning()
print(df.shape)
# Sanity check: total NaN count and grand total of all values.
df.isna().sum().sum(), df.sum().sum()
# make sure all values are number and there are no nans
df.sum().sum(), df.isna().sum().sum()
from sklearn.model_selection import train_test_split
target = 'Response'
# stratify on the target so train and test keep the same class proportions;
# random_state=SEED makes the split reproducible.
df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = train_test_split(
df.drop(target,axis=1), df[target],
test_size=0.2, random_state=SEED, stratify=df[target])
# Flat numpy copies of the targets, used when fitting and scoring below.
ytrain = ser_ytrain.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()
print(f"df : {df.shape}")
print(f"\ndf_Xtrain : {df_Xtrain.shape}")
print(f"ser_ytrain : {ser_ytrain.shape}")
print(f"\ndf_Xtest : {df_Xtest.shape}")
print(f"ser_ytest : {ser_ytest.shape}")
df_Xtrain.head(2)
def weighted_quadratic_kappa_score(y_true, y_probs):
    """Return the quadratic-weighted Cohen's kappa between labels and predictions.

    ``y_probs`` is first binned with ``np.digitize`` against the edges 1..8:
    a value ``x`` with ``k <= x < k + 1`` becomes ``k``, anything below 1
    becomes 0 and anything at or above 8 becomes 8. Integer labels in 1..8
    therefore pass through unchanged.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_probs : array-like
        Predicted values to be binned before scoring.

    Returns
    -------
    float
        ``metrics.cohen_kappa_score`` computed with ``weights='quadratic'``.
    """
    bin_edges = range(1, 9)
    binned_predictions = np.digitize(y_probs, bin_edges)
    return metrics.cohen_kappa_score(
        y_true, binned_predictions, weights='quadratic')
# Wrap the kappa function as a scikit-learn scorer so it can be passed as
# `scoring=`; greater_is_better=True because a higher kappa is better.
weighted_quadratic_kappa_scorer = make_scorer(
weighted_quadratic_kappa_score, greater_is_better=True)
from sklearn.model_selection import validation_curve
from sklearn.linear_model import LogisticRegression
# Baseline logistic regression used for the validation curve below.
clf = LogisticRegression(C=1.0,n_jobs=-1,random_state=SEED)
clf
%%time
# 10-fold stratified CV; shuffling with a fixed seed keeps folds reproducible.
skf = StratifiedKFold(n_splits=10, random_state=SEED, shuffle=True)

# Sweep the regularisation strength C of the baseline classifier and score
# every (C, fold) pair with the quadratic-weighted kappa scorer.
# param_name / param_range are passed by keyword: they are keyword-only in
# current scikit-learn releases, and keywords also work on older releases.
train_scores, valid_scores = validation_curve(
    clf, df_Xtrain, ytrain,
    param_name="C",
    param_range=np.logspace(-7, 1, 5),  # C = 1e-7, 1e-5, 1e-3, 1e-1, 1e1
    cv=skf, n_jobs=-1,
    scoring=weighted_quadratic_kappa_scorer)
# Mean training kappa over every entry of train_scores (per the scikit-learn
# docs, validation_curve returns one row per parameter value and one column
# per CV fold, so this averages over all C values and all folds).
kappa = train_scores.mean()
print(f'mean kappa for training: {kappa}')
# Last row = per-fold scores for the last C value in the sweep.
# NOTE(review): despite the *_loss names these are kappa scores (higher is
# better), not losses.
train_loss = train_scores[-1]
valid_loss = valid_scores[-1]
train_loss,valid_loss
# Best-effort live plot of the per-fold train/validation scores with lrcurve.
# The plot is optional: if lrcurve (or one of its dependencies) cannot be
# imported, or plotting fails, report it and carry on instead of silently
# swallowing every exception (a bare `except:` also hides KeyboardInterrupt).
try:
    from lrcurve import PlotLearningCurve
except ImportError as err:
    print(f'Skipping learning-curve plot, lrcurve is not available: {err}')
else:
    try:
        with PlotLearningCurve() as plot:
            for i, (loss, val_loss) in enumerate(zip(train_loss, valid_loss)):
                plot.append(i, {
                    'loss': loss,
                    'val_loss': val_loss
                })
                plot.draw()
                time.sleep(0.5)  # short pause so each update is visible
    except Exception as err:
        print(f'Skipping learning-curve plot, plotting failed: {err}')
# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
# Accumulators for comparing models: one entry per model, appended in step.
model_names = []
cv_scores_mean = []
cv_scores_std = []
# Logistic regression with a higher iteration cap (max_iter=1000).
clf_lr = LogisticRegression(solver='lbfgs',
max_iter=1000,
random_state=SEED,
verbose=0,
n_jobs=-1) # for liblinear n_jobs is +1.
# 3-fold stratified CV with a fixed seed for reproducible folds.
skf = StratifiedKFold(n_splits=3, random_state=SEED,shuffle=True)
t0 = time.time()
# NOTE: this rebinds the global `clf` to the new model.
clf = clf_lr
# NOTE(review): per the scikit-learn docs cross_val_score fits its own clones
# on each fold, so this full-data fit only matters if the fitted `clf` is
# used later -- confirm.
clf.fit(df_Xtrain, ytrain)
score = cross_val_score(clf, df_Xtrain, ytrain, cv=skf,n_jobs=-1,verbose=0,
scoring=weighted_quadratic_kappa_scorer)
score_mean = score.mean()
score_std = score.std()
model_names.append('Logistic Regression, max_iter=1000')
cv_scores_mean.append(score_mean)
cv_scores_std.append(score_std)
print(f'score mean = {score_mean}')
print(f'score std = {score_std}')
# Elapsed wall time covers both the fit and the cross-validation above.
t1 = (time.time() - t0)
print('Time taken: {} minutes {:.2f} seconds'.format(*divmod(t1,60)))
# Random forest with 100 trees, evaluated the same way as the model above.
clf_rf = RandomForestClassifier(n_estimators=100,verbose=0,
random_state=SEED,n_jobs=-1)
# 3-fold stratified CV with a fixed seed for reproducible folds.
skf = StratifiedKFold(n_splits=3, random_state=SEED,shuffle=True)
t0 = time.time()
# This full-training-set fit is the model used for the test-set predictions
# further down (clf_rf.predict).
clf_rf.fit(df_Xtrain, ytrain)
score = cross_val_score(clf_rf, df_Xtrain, ytrain, cv=skf,n_jobs=-1,verbose=1,
scoring=weighted_quadratic_kappa_scorer)
score_mean = score.mean()
score_std = score.std()
model_names.append('Random Forest, n_estimators=100')
cv_scores_mean.append(score_mean)
cv_scores_std.append(score_std)
print(f'score mean = {score_mean}')
print(f'score std = {score_std}')
# Elapsed wall time covers both the fit and the cross-validation above.
t1 = (time.time() - t0)
print('Time taken: {} minutes {:.2f} seconds'.format(*divmod(t1,60)))
%%time
# Predict on the held-out test set with the fitted random forest.
ypreds = clf_rf.predict(df_Xtest)
print(ypreds[:5])
# Quadratic-weighted kappa on the test set, computed directly on the
# predicted labels.
score = metrics.cohen_kappa_score(ytest,ypreds,weights='quadratic')
print(f'Weighted quadratic kappa = {score}')
# Confusion matrix with true labels as rows and predicted labels as columns.
cm = metrics.confusion_matrix(ytest,ypreds)
print(cm)
# Label rows/columns 1..8 and render as a colour-graded table.
df_cm = pd.DataFrame(cm, index=range(1,9),columns=range(1,9))
df_cm.style.background_gradient().set_caption('Predicted Label')
# Plot the confusion matrix of the random-forest test predictions.
# metrics.plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
# in 1.2; ConfusionMatrixDisplay works on both old and current releases.
# `cm` is the matrix computed from (ytest, ypreds) above, i.e. the same
# predictions plot_confusion_matrix would have obtained from clf_rf.
disp = metrics.ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=[f'Class {i+1}' for i in range(8)],
).plot(values_format='d', cmap=plt.cm.Blues)
disp.ax_.set_title('Confusion Matrix')
plt.show()
# Per-class report for the test-set predictions, with the eight classes
# labelled "class 1" .. "class 8".
from sklearn.metrics import classification_report
target_names = [f'class {i+1}' for i in range(8)]
print(classification_report(ytest, ypreds, target_names=target_names))
# Report total wall-clock time since time_start_notebook was recorded.
time_taken = time.time() - time_start_notebook
# h = whole hours, m = seconds left over after removing the whole hours.
h, m = divmod(time_taken, 3600)
print(f'Time taken to run whole notebook: {h:.0f} hr '
      f'{m // 60:.0f} min {m % 60:.0f} secs')