In this project, we predict the probability that an auto insurance policyholder files a claim. This is a binary classification problem.
We have more than half a million records and 59 features (including pre-calculated features).
binary features: _bin
categorical features: _cat
continuous or ordinal features: ind, reg, car, calc
missing values: -1
Full forms:
ind = individual
reg = registration
car = car
calc = calculated
The target column signifies whether or not a claim was filed for that policyholder.
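As a small sketch of these naming and missing-value conventions, here is a tiny hypothetical frame (the column names and values below are made up, not the real data):

import pandas as pd

# tiny hypothetical frame mimicking the ps_* naming scheme (values are made up)
toy = pd.DataFrame({
    'ps_ind_01':      [2, 1, -1],      # individual, ordinal
    'ps_reg_02':      [0.4, -1, 0.9],  # registration, continuous
    'ps_car_03_cat':  [1, -1, 0],      # car, categorical
    'ps_calc_04_bin': [0, 1, 1],       # calculated, binary
    'target':         [0, 1, 0],       # 1 = a claim was filed
})

# -1 marks missing values, so count them per column
print((toy == -1).sum())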
From the Lorenz-curve graph on Wikipedia, the Gini coefficient is G = A / (A + B), where A is the area between the line of equality and the Lorenz curve and B is the area under the Lorenz curve. The Gini index varies between 0 and 1. In that picture there are only two classes, rich and poor (analogous to our binary target).
x-axis = number of people (cumulative sum)
y-axis = total income (cumulative sum)
0 = complete equality of income
1 = complete inequality of income
For this competition:
0 = random guessing
1 = maximum score (note that 2*1 - 1 = 1 when the maximum AUC is 1).
If we calculate the Gini coefficient as gini = 2*auc - 1, it has range (-1, 1).

For AUC:
worst binary classifier: AUC = 0.5
perfect binary classifier: AUC = 1
If the AUC is below 0.5, simply invert the predictions (0 <==> 1) and the ROC AUC score will fall between 0.5 and 1.0.
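As a quick illustration of gini = 2*auc - 1 and of the inversion trick, here is a minimal sketch using sklearn's roc_auc_score on hypothetical labels and scores (not competition data):

import numpy as np
from sklearn.metrics import roc_auc_score

# hypothetical labels and predicted scores, for illustration only
y_true  = np.array([0, 0, 1, 1, 0, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.7])

auc  = roc_auc_score(y_true, y_score)
gini = 2 * auc - 1                        # normalized Gini from AUC
print(f"AUC = {auc:.3f}, Gini = {gini:.3f}")

# a classifier with AUC below 0.5: flipping its scores mirrors the AUC to 1 - auc
print(roc_auc_score(y_true, -y_score))    # equals 1 - auc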
import os
import time
import gc
import numpy as np
import pandas as pd
import scipy
from scipy import stats
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
from pprint import pprint
%matplotlib inline
time_start_notebook = time.time()
SEED=100
print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# Google colab
%%capture
# capture will not print in notebook
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    # extra modules
    !pip install rgf_python   # regularized greedy forest
    !pip install catboost
    print('Environment: Google Colaboratory.')
# NOTE: If we update modules in Google Colab, we need to restart the runtime.
from catboost import CatBoostClassifier
# Regularized Greedy Forest
from rgf.sklearn import RGFClassifier # https://github.com/fukatani/rgf_python
df_eval = pd.DataFrame({'Model': [],
                        'Description': [],
                        'Accuracy': [],
                        'Precision': [],
                        'Recall': [],
                        'F1': [],
                        'AUC': [],
                        'NormalizedGini': []})
df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/'
                 'Porto_seguro_safe_driver_prediction/train.csv.zip?raw=true',
                 compression='zip')
print(df.shape)
# faster runtime
# df = df.sample(frac=0.01,random_state=SEED)
df.head()
"""
Comment about the data:
The data is fairly large: it has 595k records and 59 features.
ps = porto seguro
_bin = binary feature
_cat = categorical feature
continuous or ordinal: ind, reg, car, calc
""";
target = 'target'
# all features except target
cols_all= df.columns.drop(target).to_list()
# categorical features (excluding any later-created count features)
cols_cat = [c for c in cols_all if ('cat' in c and 'count' not in c)]
# exclude the calc features from the numeric features
cols_num = [c for c in cols_all if ('cat' not in c and 'calc' not in c)]
print(cols_num)
# one-hot encode the categorical features, dropping the first level
df = pd.get_dummies(df,columns=cols_cat,drop_first=True)
from sklearn.model_selection import train_test_split
df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = train_test_split(
    df.drop(target, axis=1), df[target],
    test_size=0.2, random_state=SEED, stratify=df[target])
# backup and delete id
cols_drop = ['id']
train_id = df_Xtrain[cols_drop]
test_id = df_Xtest[cols_drop]
df_Xtrain = df_Xtrain.drop(cols_drop,axis=1)
df_Xtest = df_Xtest.drop(cols_drop,axis=1)
Xtrain = df_Xtrain.to_numpy()
ytrain = ser_ytrain.to_numpy().ravel()
Xtest = df_Xtest.to_numpy()
ytest = ser_ytest.to_numpy().ravel()
# make sure no nans and no strings
print(Xtrain.sum().sum())
pd.set_option('display.max_columns',250)
df_Xtrain.head()
# df_Xtrain.columns  # make sure there are no id or index columns
Xtr = Xtrain
Xtx = Xtest
ytr = ytrain
ytx = ytest
print(Xtr.shape, Xtx.shape)
ser_ytest.value_counts(normalize=True)
# Gini scoring functions, from the kernel at:
# https://www.kaggle.com/tezdhar/faster-gini-calculation
def ginic(actual, pred):
    n = len(actual)
    a_s = actual[np.argsort(pred)]
    a_c = a_s.cumsum()
    giniSum = a_c.sum() / a_c[-1] - (n + 1) / 2.0
    return giniSum / n

def gini_normalizedc(a, p):
    return ginic(a, p) / ginic(a, a)
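A quick sanity check of these helpers on hypothetical arrays: a perfect ranking should score 1, and (absent ties) the normalized Gini should match 2*AUC - 1.

from sklearn.metrics import roc_auc_score

y_true_demo = np.array([0, 1, 0, 1, 1, 0])
y_pred_demo = np.array([0.2, 0.9, 0.4, 0.7, 0.3, 0.1])

print(gini_normalizedc(y_true_demo, y_true_demo))       # 1.0 for a perfect ranking
print(gini_normalizedc(y_true_demo, y_pred_demo))       # ~0.778
print(2 * roc_auc_score(y_true_demo, y_pred_demo) - 1)  # same value when there are no ties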
# remove calc features
cols_use = [c for c in df_Xtrain.columns if (not c.startswith('ps_calc_'))]
df_Xtrain = df_Xtrain[cols_use]
df_Xtest = df_Xtest[cols_use]
class Ensemble():
    def __init__(self, n_splits, stacker, base_models, model_names):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models
        self.model_names = model_names

    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)  # test
        skf = StratifiedKFold(n_splits=self.n_splits,
                              shuffle=True, random_state=SEED)
        folds = list(skf.split(X, y))  # we need a list so the folds can be reused

        # stacked outputs (one column per base model)
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        model_names = self.model_names
        time_start = time.time()

        for i, clf in enumerate(self.base_models):
            print('Model: ', model_names[i])
            # init test output for this model
            S_test_i = np.zeros((T.shape[0], self.n_splits))
            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                print(f"  Fold {j+1}")
                clf.fit(X_train, y_train)
                # cross_score = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')
                # print("    cv AUC: %.5f" % (cross_score.mean()))

                # out-of-fold predictions become the stacker's training features
                y_prob = clf.predict_proba(X_holdout)[:, 1]
                S_train[test_idx, i] = y_prob
                S_test_i[:, j] = clf.predict_proba(T)[:, 1]

            # time taken so far
            time_taken = time.time() - time_start
            h, m = divmod(time_taken, 60*60)
            print('  Time taken : {:.0f} hr '
                  '{:.0f} min {:.0f} secs\n'.format(h, *divmod(m, 60)))

            # average the per-fold test predictions for this model
            S_test[:, i] = S_test_i.mean(axis=1)

        results = cross_val_score(self.stacker, S_train, y, cv=3, scoring='roc_auc')
        print("Stacker AUC: %.5f" % (results.mean()))

        self.stacker.fit(S_train, y)
        res = self.stacker.predict_proba(S_test)[:, 1]
        return res
# LightGBM params
lgb_params = {}
lgb_params['learning_rate'] = 0.02
lgb_params['n_estimators'] = 650
lgb_params['max_bin'] = 10
lgb_params['subsample'] = 0.8
lgb_params['subsample_freq'] = 10
lgb_params['colsample_bytree'] = 0.8
lgb_params['min_child_samples'] = 500
lgb_params['seed'] = SEED
lgb_params2 = {}
lgb_params2['n_estimators'] = 1090
lgb_params2['learning_rate'] = 0.02
lgb_params2['colsample_bytree'] = 0.3
lgb_params2['subsample'] = 0.7
lgb_params2['subsample_freq'] = 2
lgb_params2['num_leaves'] = 16
lgb_params2['seed'] = SEED
lgb_params3 = {}
lgb_params3['n_estimators'] = 1100
lgb_params3['max_depth'] = 4
lgb_params3['learning_rate'] = 0.02
lgb_params3['seed'] = SEED
# RandomForest params
#rf_params = {}
#rf_params['n_estimators'] = 200
#rf_params['max_depth'] = 6
#rf_params['min_samples_split'] = 70
#rf_params['min_samples_leaf'] = 30
#rf_params['random_state'] = SEED
# ExtraTrees params
#et_params = {}
#et_params['n_estimators'] = 155
#et_params['max_features'] = 0.3
#et_params['max_depth'] = 6
#et_params['min_samples_split'] = 40
#et_params['min_samples_leaf'] = 18
#et_params['random_state'] = SEED
# XGBoost params
#xgb_params = {}
#xgb_params['objective'] = 'binary:logistic'
#xgb_params['learning_rate'] = 0.04
#xgb_params['n_estimators'] = 490
#xgb_params['max_depth'] = 4
#xgb_params['subsample'] = 0.9
#xgb_params['colsample_bytree'] = 0.9
#xgb_params['min_child_weight'] = 10
#xgb_params['random_state'] = SEED
# CatBoost params
#cat_params = {}
#cat_params['iterations'] = 900
#cat_params['depth'] = 8
#cat_params['rsm'] = 0.95
#cat_params['learning_rate'] = 0.03
#cat_params['l2_leaf_reg'] = 3.5
#cat_params['border_count'] = 8
#cat_params['gradient_iterations'] = 4
#cat_params['random_state'] = SEED
# Regularized Greedy Forest params
#rgf_params = {}
#rgf_params['max_leaf'] = 2000
#rgf_params['learning_rate'] = 0.5
#rgf_params['algorithm'] = "RGF_Sib"
#rgf_params['test_interval'] = 100
#rgf_params['min_samples_leaf'] = 3
#rgf_params['reg_depth'] = 1.0
#rgf_params['l2'] = 0.5
#rgf_params['sl2'] = 0.005
lgb_model = LGBMClassifier(**lgb_params)
lgb_model2 = LGBMClassifier(**lgb_params2)
lgb_model3 = LGBMClassifier(**lgb_params3)
#rf_model = RandomForestClassifier(**rf_params)
#et_model = ExtraTreesClassifier(**et_params)
#xgb_model = XGBClassifier(**xgb_params)
#cat_model = CatBoostClassifier(**cat_params)
#rgf_model = RGFClassifier(**rgf_params)
#gb_model = GradientBoostingClassifier(max_depth=5)
#ada_model = AdaBoostClassifier()
log_model = LogisticRegression()
model_names = ['lgb1','lgb2','lgb3']
base_models = [lgb_model, lgb_model2, lgb_model3]
stack = Ensemble(n_splits=3,
                 stacker=log_model,
                 base_models=base_models,
                 model_names=model_names)
yprobs = stack.fit_predict(df_Xtrain, ser_ytrain, df_Xtest)
score = gini_normalizedc(ser_ytest.to_numpy(), yprobs)
print('normalized gini score ', score)
df_sub = pd.DataFrame({'id': test_id.to_numpy().ravel(),
                       'target': yprobs})
df_sub.head()
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m, 60)))