
The Allstate Corporation is an American insurance company headquartered in the United States. The company also has personal lines insurance operations in Canada.



import time

time_start_notebook = time.time()
import numpy as np
import pandas as pd
import seaborn as sns

from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina''ggplot') 

# random state

[(x.__name__,x.__version__) for x in [np,pd,sns]]
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/statsmodels/tools/ FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
[('numpy', '1.18.1'), ('pandas', '1.0.1'), ('seaborn', '0.9.0')]
from scipy.special import boxcox1p
import scipy
import sklearn

print([(x.__name__,x.__version__) for x in [scipy, sklearn]])
[('scipy', '1.4.1'), ('sklearn', '0.21.3')]
# scale and split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# regressors
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
# pipeline
from sklearn.pipeline import Pipeline
# metrics
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
# cross validation
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
# classifier
import xgboost as xgb
# six and pickle
import six
import pickle
import joblib
%load_ext autoreload
%autoreload 2
# my personal library
from bhishan import bp

Load the data

# Location of the cleaned / one-hot encoded CSV files.
path_pro = '../data/processed'

# The first CSV column is used as the DataFrame index for both splits.
train_csv = f'{path_pro}/train_cleaned_encoded.csv'
test_csv = f'{path_pro}/test_cleaned_encoded.csv'

df_train = pd.read_csv(train_csv, index_col=0)
df_test = pd.read_csv(test_csv, index_col=0)

(188318, 816)
id cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 ... dummy_cat116_KC dummy_cat116_KW dummy_cat116_LB dummy_cat116_LF dummy_cat116_LM dummy_cat116_LN dummy_cat116_LO dummy_cat116_LY dummy_cat116_MD dummy_cat116_Others
0 1 A B A B A A A A B ... 0 0 1 0 0 0 0 0 0 0
1 2 A B A A A A A A B ... 0 0 0 0 0 0 0 0 0 0

2 rows × 816 columns

df_train.shape, df_test.shape
((188318, 816), (125546, 811))
# Columns that exist in the training frame but not in the test frame.
features_only_train = list(df_train.columns.difference(df_test.columns))

['dummy_cat101_N', 'dummy_cat101_U', 'dummy_cat102_H', 'dummy_cat102_J', 'dummy_cat105_R', 'dummy_cat105_S', 'dummy_cat111_D', 'dummy_cat114_X', 'dummy_cat89_I', 'dummy_cat90_G', 'dummy_cat92_F', 'loss', 'loss_log1p']
# Columns that exist in the test frame but not in the training frame.
features_only_test = list(df_test.columns.difference(df_train.columns))

['dummy_cat103_M', 'dummy_cat106_Q', 'dummy_cat111_L', 'dummy_cat89_F', 'dummy_cat92_E', 'dummy_cat92_G', 'dummy_cat96_H', 'dummy_cat99_U']
# Every column that is missing from one of the two frames.
features_only_one = [*features_only_train, *features_only_test]

['dummy_cat101_N', 'dummy_cat101_U', 'dummy_cat102_H', 'dummy_cat102_J', 'dummy_cat105_R', 'dummy_cat105_S', 'dummy_cat111_D', 'dummy_cat114_X', 'dummy_cat89_I', 'dummy_cat90_G', 'dummy_cat92_F', 'loss', 'loss_log1p', 'dummy_cat103_M', 'dummy_cat106_Q', 'dummy_cat111_L', 'dummy_cat89_F', 'dummy_cat92_E', 'dummy_cat92_G', 'dummy_cat96_H', 'dummy_cat99_U']

Data Preparation

continuous features: xxx_boxcox1p
categorical features: dummy_xxx

target: loss_log1p
features_raw = df_train.columns.tolist()
target = 'loss'
target_log = 'loss_log1p'

# orig: untransformed columns -- continuous ones start with 'cont' and do not
# carry the '_boxcox1p' suffix, categorical ones start with 'cat'.
features_orig_cont = [i for i in features_raw
                      if i.startswith('cont')
                      and not i.endswith('_boxcox1p')]
features_orig_cat = [i for i in features_raw
                     if i.startswith('cat')]
features_orig = features_orig_cat + features_orig_cont

# processed: box-cox transformed continuous columns and dummy-encoded
# categorical columns.
features_cont = [i for i in features_raw if i.endswith('_boxcox1p')]

features_cat = [i for i in features_raw if i.startswith('dummy_')]

features = features_cat + features_cont

# Drop columns that exist in only one of train/test so that the same feature
# list can index both frames. A set keeps the membership test O(1).
_features_only_one = set(features_only_one)
features = [i for i in features if i not in _features_only_one]

# print
# print(features_orig_cont)
# print(features_orig_cat)
# print(features_cont)
# print(features_cat)

Train-Validation Split

# The model wrappers take the log of the target themselves, so the raw
# `target` column is split here rather than `target_log`.
# NOTE(review): the original call arguments were lost. test_size=0.2
# reproduces the logged training shape (150654 of 188318 rows, 672 feature
# columns); random_state=SEED is an assumption -- confirm.
df_Xtr, df_Xval, ser_ytr, ser_yval = train_test_split(
    df_train[features], df_train[target],
    test_size=0.2, random_state=SEED)

# Plain numpy views of every split; Xval was previously never created.
Xtr = df_Xtr.to_numpy()
Xval = df_Xval.to_numpy()
ytr = ser_ytr.to_numpy().ravel()
yval = ser_yval.to_numpy().ravel()

print(df_Xtr.shape, ser_ytr.shape,ytr.shape)
(150654, 672) (150654,) (150654,)
dummy_cat1_A dummy_cat1_B dummy_cat2_A dummy_cat2_B dummy_cat3_A dummy_cat3_B dummy_cat4_A dummy_cat4_B dummy_cat5_A dummy_cat5_B ... cont5_boxcox1p cont6_boxcox1p cont7_boxcox1p cont8_boxcox1p cont9_boxcox1p cont10_boxcox1p cont11_boxcox1p cont12_boxcox1p cont13_boxcox1p cont14_boxcox1p
180302 1 0 0 1 0 1 1 0 1 0 ... 0.273358 0.170764 0.278357 0.269171 0.288533 0.197228 0.169389 0.167836 0.238885 0.650623
105000 1 0 0 1 1 0 0 1 1 0 ... 0.361725 0.350364 0.320341 0.499342 0.378155 0.393653 0.310744 0.307905 0.327325 0.257707

2 rows × 672 columns

# Kaggle does not publish labels for the test set; predictions have to be
# submitted for scoring, so the test targets stay undefined.
ser_ytx = ytx = None

# Same feature columns (and column order) as the training matrix.
df_Xtx = df_test.loc[:, features]
Xtx = df_Xtx.to_numpy()

(125546, 672)
dummy_cat1_A dummy_cat1_B dummy_cat2_A dummy_cat2_B dummy_cat3_A dummy_cat3_B dummy_cat4_A dummy_cat4_B dummy_cat5_A dummy_cat5_B ... cont5_boxcox1p cont6_boxcox1p cont7_boxcox1p cont8_boxcox1p cont9_boxcox1p cont10_boxcox1p cont11_boxcox1p cont12_boxcox1p cont13_boxcox1p cont14_boxcox1p
0 1 0 0 1 1 0 1 0 1 0 ... 0.255586 0.401870 0.285609 0.507345 0.306570 0.335532 0.333617 0.327418 0.570153 0.345239
1 1 0 0 1 1 0 0 1 1 0 ... 0.656444 0.413704 0.384638 0.576341 0.440606 0.501547 0.560053 0.551063 0.391990 0.193540

2 rows × 672 columns

Scale the data

scaler = StandardScaler()

# Learn the mean / standard deviation from the training split only and reuse
# them for the validation split. Calling fit_transform on the validation data
# would re-fit the scaler, putting both splits on different scales and leaking
# validation statistics into the preprocessing.
Xtr_scaled = scaler.fit_transform(df_Xtr)
Xval_scaled = scaler.transform(df_Xval)


# Wrap the scaled arrays back into DataFrames; the row index is reset to a
# default RangeIndex, only the column names are carried over.
df_Xtr_scaled = pd.DataFrame(Xtr_scaled,columns=df_Xtr.columns)

df_Xval_scaled = pd.DataFrame(Xval_scaled,columns=df_Xval.columns)

<class 'numpy.ndarray'>
dummy_cat1_A dummy_cat1_B dummy_cat2_A dummy_cat2_B dummy_cat3_A dummy_cat3_B dummy_cat4_A dummy_cat4_B dummy_cat5_A dummy_cat5_B ... cont5_boxcox1p cont6_boxcox1p cont7_boxcox1p cont8_boxcox1p cont9_boxcox1p cont10_boxcox1p cont11_boxcox1p cont12_boxcox1p cont13_boxcox1p cont14_boxcox1p
0 0.574198 -0.574198 -1.141867 1.141867 -4.159984 4.159984 0.682029 -0.682029 0.722126 -0.722126 ... -0.902536 -1.611161 -1.020371 -0.971494 -0.936657 -1.631879 -1.584026 -1.595692 -1.124155 1.446586
1 0.574198 -0.574198 -1.141867 1.141867 0.240386 -0.240386 -1.466213 1.466213 0.722126 -0.722126 ... -0.317716 -0.413701 -0.694650 0.622825 -0.246720 -0.188110 -0.668667 -0.686765 -0.555165 -0.958409

2 rows × 672 columns


Wrapper Functions

class SklearnWrapper(object):
    """Uniform train/predict interface around a scikit-learn style regressor.

    The wrapped estimator is fitted on the natural log of the target and its
    predictions are exponentiated, so callers always work with the target on
    its original scale.

    Parameters
    ----------
    clf : callable
        Estimator class (not an instance); it is called as ``clf(**params)``.
    seed : int, default 100
        Value stored under ``'random_state'`` in the estimator parameters.
    params : dict or None
        Keyword arguments for the estimator. The dict is copied, so the
        caller's object is never modified; ``None`` means no extra arguments.
    """

    def __init__(self, clf, seed=100, params=None):
        # Copy so that adding 'random_state' does not leak into the caller's
        # dict, and so that params=None does not raise a TypeError.
        params = dict(params) if params else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, Xtrain, ytrain):
        """Fit the estimator on ``Xtrain`` against ``log(ytrain)``."""
        self.clf.fit(Xtrain, np.log(ytrain))

    def predict(self, x):
        """Return predictions for ``x`` on the original target scale."""
        return np.exp(self.clf.predict(x))


class XgbWrapper(object):
    """Uniform train/predict interface around the native xgboost booster.

    The booster is trained on the natural log of the target and predictions
    are exponentiated back to the original scale.

    Parameters
    ----------
    seed : int, default 100
        Stored under ``'seed'`` in the booster parameters (overrides any
        ``'seed'`` entry already present in ``params``).
    params : dict or None
        Booster parameters. An optional ``'nrounds'`` entry gives the number
        of boosting rounds (default 250) and is removed before the remaining
        parameters are handed to xgboost. The dict is copied, so the caller's
        object is never modified.
    """

    def __init__(self, seed=100, params=None):
        # Work on a copy: popping 'nrounds' from the caller's dict would make
        # a second wrapper built from the same dict silently fall back to the
        # default number of rounds.
        self.param = dict(params) if params else {}
        self.param['seed'] = seed
        self.nrounds = self.param.pop('nrounds', 250)

    def train(self, Xtrain, ytrain):
        """Train a booster for ``self.nrounds`` rounds on ``log(ytrain)``."""
        dtrain = xgb.DMatrix(Xtrain, label=np.log(ytrain))
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Return predictions for ``x`` on the original target scale."""
        return np.exp(self.gbdt.predict(xgb.DMatrix(x)))


def get_oof(clf, Xtr, ytr,
            Xtx, ytx=None, NFOLDS=5, seed=100):
    """Out of Fold Estimation.

    For every fold the model is trained on the remaining folds, used to
    predict the held-out rows of the training data, and used to predict the
    full test set. The per-fold test predictions are averaged.

    clf: Wrapper class having train and predict methods.
    Xtr: Numpy array of training data
    ytr: Numpy array of training target data
    Xtx: Numpy array of test data
    ytx: Numpy array of test label data (not used by the computation)
    NFOLDS: Number of fold
    seed: Random state of the fold shuffling

    Returns a tuple (oof_train, oof_test) of column vectors with shapes
    (n_train, 1) and (n_test, 1).

    NOTE(review): the original signature and KFold arguments were truncated.
    The parameter order follows the docstring; the defaults (NFOLDS=5,
    seed=100) and shuffle=True are assumptions -- confirm.
    """
    ntrain = Xtr.shape[0]
    ntest = Xtx.shape[0]

    oof_train = np.zeros((ntrain,))
    oof_test_kf = np.empty((NFOLDS, ntest))

    kf = KFold(n_splits=NFOLDS,
               shuffle=True, random_state=seed)

    for i, (idx_fit, idx_holdout) in enumerate(kf.split(Xtr, ytr)):
        clf.train(Xtr[idx_fit], ytr[idx_fit])

        # Each training row is predicted by the one model that never saw it.
        oof_train[idx_holdout] = clf.predict(Xtr[idx_holdout])
        oof_test_kf[i, :] = clf.predict(Xtx)

    # Average the test predictions of the NFOLDS fold models.
    oof_test = oof_test_kf.mean(axis=0)

    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

Modelling: Random Forest

# Random forest hyper-parameters; 'random_state' is added by SklearnWrapper.
rf_params = {
    'n_jobs': -1,
    'n_estimators': 100,
    'max_features': 0.2,
    'max_depth': 8,
    'min_samples_leaf': 2,
}
# NOTE(review): the tails of the next two calls were truncated in the source
# (the `rf = ...` line was also duplicated); the arguments are reconstructed
# from the wrapper / get_oof parameter lists -- confirm.
rf = SklearnWrapper(clf=RandomForestRegressor,
                    seed=SEED, params=rf_params)

rf_oof_train, rf_oof_test = get_oof(rf,Xtr,ytr,
                                    Xtx,ytx)

print("RF-CV: {}".format(mean_absolute_error(ytr, rf_oof_train)))
RF-CV: 1292.8756371081363
CPU times: user 4min, sys: 2.64 s, total: 4min 3s
Wall time: 1min 45s

Modelling: XGBoost

# Booster parameters. 'nrounds' is consumed by XgbWrapper as the number of
# boosting rounds, and the 'seed' entry is overwritten by the wrapper's
# `seed` argument.
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'reg:linear',
    'max_depth': 7,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'mae',
    'nrounds': 350
}
# NOTE(review): the tails of the next two calls were truncated in the source;
# the arguments are reconstructed from the wrapper / get_oof parameter lists
# -- confirm.
xg = XgbWrapper(seed=SEED,
                params=xgb_params)

xg_oof_train, xg_oof_test = get_oof(xg,Xtr,ytr,
                                    Xtx,ytx)

print("XG-CV: {}".format(mean_absolute_error(ytr,xg_oof_train)))
XG-CV: 1155.908488809834
CPU times: user 22min 51s, sys: 9.62 s, total: 23min 1s
Wall time: 23min 57s

Modelling: Extra Trees Regressor

# Extra-trees hyper-parameters; 'random_state' is added by SklearnWrapper.
et_params = {
    'n_jobs': -1,
    'n_estimators': 100,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
}
# NOTE(review): the tails of the next two calls were truncated in the source;
# the arguments are reconstructed from the wrapper / get_oof parameter lists
# -- confirm.
et = SklearnWrapper(clf=ExtraTreesRegressor,
                    seed=SEED, params=et_params)

et_oof_train, et_oof_test = get_oof(et,Xtr,ytr,
                                    Xtx,ytx)

print("ET-CV: {}".format(mean_absolute_error(ytr,et_oof_train)))
ET-CV: 1243.4451960986753
CPU times: user 19min 35s, sys: 10.7 s, total: 19min 46s
Wall time: 9min 19s

Stacking multiple models

def xg_eval_mae(yhat, dtrain):
    """Custom xgboost metric: MAE after exponentiating labels and predictions.

    yhat: predictions produced by the booster
    dtrain: matrix whose ``get_label()`` supplies the matching labels

    Returns the pair ('mae', value) that xgboost expects from a custom
    evaluation function.
    """
    labels = dtrain.get_label()
    score = mean_absolute_error(np.exp(labels), np.exp(yhat))
    return 'mae', score
# Level-2 features: one column of out-of-fold predictions per base model.
# The column order must be identical for the train and test matrices.
Xtr_stacked = np.c_[xg_oof_train,
                    et_oof_train,
                    rf_oof_train]
Xtx_stacked = np.c_[xg_oof_test,
                    et_oof_test,
                    rf_oof_test]

dtrain = xgb.DMatrix(Xtr_stacked, label=np.log(ytr))
# The level-2 model is trained on the stacked predictions, so the test matrix
# has to be built from the stacked test features, not from the raw Xtx.
dtest = xgb.DMatrix(Xtx_stacked)

# No 'eval_metric' entry here on purpose: the custom metric below is also
# called 'mae', and the logged mean +/- std values (e.g. 1522.61+1515.5)
# indicate that the built-in log-scale MAE and the custom original-scale MAE
# were being averaged under the same name.
xgb_params = {
    'seed': SEED,
    'colsample_bytree': 0.8,
    'silent': 1,
    'subsample': 0.6,
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
}

# NOTE(review): the original xgb.cv arguments were lost. num_boost_round=500
# and verbose_eval=10 follow from the logged output; nfold and
# early_stopping_rounds are assumptions -- confirm.
res = xgb.cv(xgb_params, dtrain,
             num_boost_round=500,
             nfold=4,
             seed=SEED,
             stratified=False,
             early_stopping_rounds=25,
             verbose_eval=10,
             show_stdv=True,
             feval=xg_eval_mae,
             maximize=False)

best_nrounds = res.shape[0] - 1
# Report the held-out (test) fold score by column name; the first two columns
# of the result were the training-fold values in the logged run.
cv_mean = res['test-mae-mean'].iloc[-1]
cv_std = res['test-mae-std'].iloc[-1]

print('Stacked-CV: {0}+{1}'.format(cv_mean, cv_std))
[0]	train-mae:1522.61+1515.5	test-mae:1522.61+1515.52
[10]	train-mae:1521.41+1514.97	test-mae:1521.41+1514.99
[20]	train-mae:1519.59+1513.78	test-mae:1519.6+1513.8
[30]	train-mae:1516.86+1511.6	test-mae:1516.86+1511.62
[40]	train-mae:1512.8+1508.04	test-mae:1512.8+1508.06
[50]	train-mae:1506.96+1502.66	test-mae:1506.96+1502.68
[60]	train-mae:1498.84+1494.95	test-mae:1498.84+1494.97
[70]	train-mae:1487.95+1484.43	test-mae:1487.95+1484.45
[80]	train-mae:1473.83+1470.64	test-mae:1473.83+1470.67
[90]	train-mae:1456.1+1453.22	test-mae:1456.11+1453.25
[100]	train-mae:1434.48+1431.88	test-mae:1434.49+1431.91
[110]	train-mae:1408.86+1406.51	test-mae:1408.88+1406.55
[120]	train-mae:1379.28+1377.15	test-mae:1379.3+1377.19
[130]	train-mae:1345.88+1343.95	test-mae:1345.91+1344
[140]	train-mae:1309+1307.26	test-mae:1309.04+1307.32
[150]	train-mae:1269.11+1267.53	test-mae:1269.16+1267.6
[160]	train-mae:1226.66+1225.24	test-mae:1226.74+1225.33
[170]	train-mae:1182.4+1181.1	test-mae:1182.49+1181.21
[180]	train-mae:1136.97+1135.8	test-mae:1137.08+1135.93
[190]	train-mae:1091.14+1090.07	test-mae:1091.29+1090.24
[200]	train-mae:1045.6+1044.62	test-mae:1045.78+1044.83
[210]	train-mae:1000.99+1000.1	test-mae:1001.23+1000.35
[220]	train-mae:957.993+957.17	test-mae:958.292+957.486
[230]	train-mae:917.234+916.474	test-mae:917.596+916.851
[240]	train-mae:878.959+878.253	test-mae:879.373+878.681
[250]	train-mae:843.477+842.817	test-mae:843.981+843.336
[260]	train-mae:810.927+810.306	test-mae:811.519+810.913
[270]	train-mae:781.36+780.773	test-mae:782.046+781.473
[280]	train-mae:754.747+754.188	test-mae:755.558+755.013
[290]	train-mae:731.084+730.549	test-mae:732.007+731.486
[300]	train-mae:710.168+709.653	test-mae:711.196+710.694
[310]	train-mae:691.817+691.32	test-mae:692.933+692.447
[320]	train-mae:675.695+675.211	test-mae:676.925+676.453
[330]	train-mae:661.707+661.235	test-mae:663.038+662.577
[340]	train-mae:649.562+649.1	test-mae:650.984+650.532
[350]	train-mae:639.074+638.62	test-mae:640.593+640.148
[360]	train-mae:630.056+629.609	test-mae:631.674+631.236
[370]	train-mae:622.28+621.838	test-mae:623.984+623.551
[380]	train-mae:615.611+615.174	test-mae:617.394+616.965
[390]	train-mae:609.916+609.482	test-mae:611.766+611.342
[400]	train-mae:605.033+604.602	test-mae:606.967+606.547
[410]	train-mae:600.837+600.41	test-mae:602.838+602.42
[420]	train-mae:597.292+596.867	test-mae:599.362+598.947
[430]	train-mae:594.259+593.835	test-mae:596.402+595.989
[440]	train-mae:591.671+591.249	test-mae:593.878+593.467
[450]	train-mae:589.476+589.055	test-mae:591.748+591.337
[460]	train-mae:587.632+587.212	test-mae:589.964+589.555
[470]	train-mae:586.027+585.608	test-mae:588.421+588.013
[480]	train-mae:584.634+584.215	test-mae:587.083+586.676
[490]	train-mae:583.449+583.031	test-mae:585.947+585.541
[499]	train-mae:582.531+582.113	test-mae:585.066+584.66
Stacked-CV: 582.5307191249999+582.1132548840304

Model Estimation on Test Set

# The level-2 booster is trained on the stacked out-of-fold predictions
# (dtrain), so it must also predict on the stacked test features; a DMatrix
# built from the raw Xtx has a different set of columns.
dtest = xgb.DMatrix(Xtx_stacked)

gbdt = xgb.train(xgb_params, dtrain, best_nrounds)

# The booster was fitted on log(loss); exponentiate to get the loss itself.
output = np.exp(gbdt.predict(dtest))

Time Taken

# Wall-clock duration since the first cell, split into hours / minutes / seconds.
time_taken = time.time() - time_start_notebook
h, m = divmod(time_taken, 3600)   # h: whole hours, m: leftover seconds
mins, secs = divmod(m, 60)
msg = ('Time taken to run whole notebook: '
       '{:.0f} hr {:.0f} min {:.0f} secs')
print(msg.format(h, mins, secs))