Description

The Allstate Corporation is an American insurance company, with additional personal lines insurance operations in Canada. This notebook trains several regression models on the processed Allstate claims data to predict the claim severity (loss) and stacks their out-of-fold predictions for a Kaggle submission.

Imports

In [1]:
import time

time_start_notebook = time.time()
In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot') 

# random state
SEED=100

[(x.__name__,x.__version__) for x in [np,pd,sns]]
Out[2]:
[('numpy', '1.18.1'), ('pandas', '1.0.1'), ('seaborn', '0.9.0')]
In [3]:
from scipy.special import boxcox1p
In [4]:
import scipy
import sklearn

print([(x.__name__,x.__version__) for x in [scipy, sklearn]])
[('scipy', '1.4.1'), ('sklearn', '0.21.3')]
In [5]:
# scale and split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
In [6]:
# regressors
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
In [7]:
# pipeline
from sklearn.pipeline import Pipeline
In [8]:
# metrics
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
In [9]:
# cross validation
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
In [10]:
# gradient boosting (used below as a regressor)
import xgboost as xgb
In [11]:
# serialization helpers
import six
import pickle
import joblib
In [12]:
%load_ext autoreload
In [13]:
%autoreload 2
In [14]:
# my personal library
from bhishan import bp

Load the data

In [15]:
path_pro = '../data/processed'

df_train = pd.read_csv(f'{path_pro}/train_cleaned_encoded.csv',index_col=0)

df_test = pd.read_csv(f'{path_pro}/test_cleaned_encoded.csv',index_col=0)

print(df_train.shape)
df_train.head(2)
(188318, 816)
Out[15]:
id cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 ... dummy_cat116_KC dummy_cat116_KW dummy_cat116_LB dummy_cat116_LF dummy_cat116_LM dummy_cat116_LN dummy_cat116_LO dummy_cat116_LY dummy_cat116_MD dummy_cat116_Others
0 1 A B A B A A A A B ... 0 0 1 0 0 0 0 0 0 0
1 2 A B A A A A A A B ... 0 0 0 0 0 0 0 0 0 0

2 rows × 816 columns

In [16]:
df_train.shape, df_test.shape
Out[16]:
((188318, 816), (125546, 811))
In [17]:
features_only_train = df_train.columns.difference(df_test.columns).tolist()

print(features_only_train)
['dummy_cat101_N', 'dummy_cat101_U', 'dummy_cat102_H', 'dummy_cat102_J', 'dummy_cat105_R', 'dummy_cat105_S', 'dummy_cat111_D', 'dummy_cat114_X', 'dummy_cat89_I', 'dummy_cat90_G', 'dummy_cat92_F', 'loss', 'loss_log1p']
In [18]:
features_only_test = df_test.columns.difference(df_train.columns).tolist()

print(features_only_test)
['dummy_cat103_M', 'dummy_cat106_Q', 'dummy_cat111_L', 'dummy_cat89_F', 'dummy_cat92_E', 'dummy_cat92_G', 'dummy_cat96_H', 'dummy_cat99_U']
In [19]:
# one-hot columns whose category appears in only one of the two files;
# these are dropped below so train and test share identical feature sets
features_only_one = features_only_train + features_only_test

print(features_only_one)
['dummy_cat101_N', 'dummy_cat101_U', 'dummy_cat102_H', 'dummy_cat102_J', 'dummy_cat105_R', 'dummy_cat105_S', 'dummy_cat111_D', 'dummy_cat114_X', 'dummy_cat89_I', 'dummy_cat90_G', 'dummy_cat92_F', 'loss', 'loss_log1p', 'dummy_cat103_M', 'dummy_cat106_Q', 'dummy_cat111_L', 'dummy_cat89_F', 'dummy_cat92_E', 'dummy_cat92_G', 'dummy_cat96_H', 'dummy_cat99_U']

Data Preparation

In [20]:
"""
continuous features: xxx_boxcox1p
categorical features: dummy_xxx

target: loss_log1p
""";
In [21]:
features_raw = df_train.columns.tolist()
target = 'loss'
target_log = 'loss_log1p'

# orig
features_orig_cont = [i for i in features_raw
                      if i.startswith('cont') and not i.endswith('_boxcox1p')]

features_orig_cat = [i for i in features_raw if i.startswith('cat')]
features_orig = features_orig_cat + features_orig_cont

# processed
features_cont = [i for i in features_raw if i.endswith('_boxcox1p') ]

features_cat = [i for i in features_raw if i.startswith('dummy_') ]

features = features_cat + features_cont
features = [i for i in features if i not in features_only_one]


# print
 
# print(features_orig_cont)
# print(features_orig_cat)
# print(features_cont)
# print(features_cat)
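
As a quick sanity check (not part of the original run), the final feature list should exclude every train-only or test-only column and be present in both frames:

assert not set(features) & set(features_only_one)
assert all(f in df_test.columns for f in features)
print(len(features))  # 672, matching the shapes below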

Train-Validation Split

In [22]:
df_Xtr, df_Xval, ser_ytr, ser_yval = train_test_split(
    df_train[features],df_train[target],
    train_size=0.8,random_state=SEED)

Xtr = df_Xtr.to_numpy()
ytr = ser_ytr.to_numpy().ravel()
yval = ser_yval.to_numpy().ravel()

print(df_Xtr.shape, ser_ytr.shape,ytr.shape)
df_Xtr.head(2)
(150654, 672) (150654,) (150654,)
Out[22]:
dummy_cat1_A dummy_cat1_B dummy_cat2_A dummy_cat2_B dummy_cat3_A dummy_cat3_B dummy_cat4_A dummy_cat4_B dummy_cat5_A dummy_cat5_B ... cont5_boxcox1p cont6_boxcox1p cont7_boxcox1p cont8_boxcox1p cont9_boxcox1p cont10_boxcox1p cont11_boxcox1p cont12_boxcox1p cont13_boxcox1p cont14_boxcox1p
180302 1 0 0 1 0 1 1 0 1 0 ... 0.273358 0.170764 0.278357 0.269171 0.288533 0.197228 0.169389 0.167836 0.238885 0.650623
105000 1 0 0 1 1 0 0 1 1 0 ... 0.361725 0.350364 0.320341 0.499342 0.378155 0.393653 0.310744 0.307905 0.327325 0.257707

2 rows × 672 columns

In [23]:
df_Xtx = df_test[features]
Xtx = df_Xtx.to_numpy()

ser_ytx = None
ytx = None

# Kaggle withholds the test labels; predictions must be submitted for scoring.
print(df_Xtx.shape)
df_Xtx.head(2)
(125546, 672)
Out[23]:
dummy_cat1_A dummy_cat1_B dummy_cat2_A dummy_cat2_B dummy_cat3_A dummy_cat3_B dummy_cat4_A dummy_cat4_B dummy_cat5_A dummy_cat5_B ... cont5_boxcox1p cont6_boxcox1p cont7_boxcox1p cont8_boxcox1p cont9_boxcox1p cont10_boxcox1p cont11_boxcox1p cont12_boxcox1p cont13_boxcox1p cont14_boxcox1p
0 1 0 0 1 1 0 1 0 1 0 ... 0.255586 0.401870 0.285609 0.507345 0.306570 0.335532 0.333617 0.327418 0.570153 0.345239
1 1 0 0 1 1 0 0 1 1 0 ... 0.656444 0.413704 0.384638 0.576341 0.440606 0.501547 0.560053 0.551063 0.391990 0.193540

2 rows × 672 columns

Scale the data

In [24]:
scaler = StandardScaler()

Xtr_scaled = scaler.fit_transform(df_Xtr)
# use the statistics learned on the training set; refitting on the
# validation set would leak its distribution into the scaling
Xval_scaled = scaler.transform(df_Xval)

print(type(Xtr_scaled))

df_Xtr_scaled = pd.DataFrame(Xtr_scaled,columns=df_Xtr.columns)

df_Xval_scaled = pd.DataFrame(Xval_scaled,columns=df_Xval.columns)

df_Xtr_scaled.head(2)
<class 'numpy.ndarray'>
Out[24]:
dummy_cat1_A dummy_cat1_B dummy_cat2_A dummy_cat2_B dummy_cat3_A dummy_cat3_B dummy_cat4_A dummy_cat4_B dummy_cat5_A dummy_cat5_B ... cont5_boxcox1p cont6_boxcox1p cont7_boxcox1p cont8_boxcox1p cont9_boxcox1p cont10_boxcox1p cont11_boxcox1p cont12_boxcox1p cont13_boxcox1p cont14_boxcox1p
0 0.574198 -0.574198 -1.141867 1.141867 -4.159984 4.159984 0.682029 -0.682029 0.722126 -0.722126 ... -0.902536 -1.611161 -1.020371 -0.971494 -0.936657 -1.631879 -1.584026 -1.595692 -1.124155 1.446586
1 0.574198 -0.574198 -1.141867 1.141867 0.240386 -0.240386 -1.466213 1.466213 0.722126 -0.722126 ... -0.317716 -0.413701 -0.694650 0.622825 -0.246720 -0.188110 -0.668667 -0.686765 -0.555165 -0.958409

2 rows × 672 columns
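Since Pipeline was imported above but never used, the same scaling could be wrapped so the scaler is only ever fit on training data (a sketch; the LinearRegression stand-in is a hypothetical choice, not the notebook's model):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

pipe = Pipeline([
    ('scaler', StandardScaler()),    # fit on the training data only
    ('model', LinearRegression()),   # hypothetical regressor for illustration
])
pipe.fit(df_Xtr, np.log1p(ser_ytr))
pred = np.expm1(pipe.predict(df_Xval))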

Modelling

Wrapper Functions

In [25]:
class SklearnWrapper(object):
    def __init__(self, clf, seed=100, params=None):
        # copy so the caller's params dict is not mutated
        params = {} if params is None else dict(params)
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, Xtrain, ytrain):
        # fit on log(loss) to tame the heavy right tail of the target
        self.clf.fit(Xtrain, np.log(ytrain))

    def predict(self, x):
        # invert the log transform back to the original loss scale
        return np.exp(self.clf.predict(x))
In [26]:
class XgbWrapper(object):
    def __init__(self, seed=100, params=None):
        # copy so pop() below does not mutate the caller's dict
        self.param = {} if params is None else dict(params)
        self.param['seed'] = seed
        # 'nrounds' is consumed here; it is not a native xgboost parameter
        self.nrounds = self.param.pop('nrounds', 250)

    def train(self, Xtrain, ytrain):
        dtrain = xgb.DMatrix(Xtrain, label=np.log(ytrain))
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return np.exp(self.gbdt.predict(xgb.DMatrix(x)))
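Both wrappers fit on log(loss) and exponentiate at prediction time; for a heavy-tailed target this keeps the largest losses from dominating a squared-error fit. A toy illustration (the numbers are made up):

y = np.array([10., 100., 10000.])
print(np.log(y))            # [2.30  4.61  9.21] -- compressed, near-symmetric scale
print(np.exp(np.log(y)))    # round-trips exactly to the original values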
In [27]:
def get_oof(clf, Xtr, ytr,
            Xtx, ytx,
            NFOLDS):
    """Out-of-fold estimation.

    Parameters
    ----------
    clf: wrapper object exposing train and predict methods
    Xtr: numpy array of training data
    ytr: numpy array of training targets
    Xtx: numpy array of test data
    ytx: numpy array of test targets (None here; Kaggle withholds them)
    NFOLDS: number of folds

    Returns the out-of-fold train predictions and the fold-averaged
    test predictions, each reshaped to a column vector.
    """
    ntrain = Xtr.shape[0]
    ntest = Xtx.shape[0]

    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))

    oof_test_kf = np.empty((NFOLDS, ntest))

    kf = KFold(n_splits=NFOLDS,
                          shuffle=True,
                          random_state=SEED)

    for i, (idx_tr, idx_tx) in enumerate(kf.split(Xtr,ytr)):
        xtr_now = Xtr[idx_tr]
        ytr_now = ytr[idx_tr]
        xtx_now = Xtr[idx_tx]

        clf.train(xtr_now, ytr_now)

        oof_train[idx_tx] = clf.predict(xtx_now)
        oof_test_kf[i, :] = clf.predict(Xtx)

    oof_test[:] = oof_test_kf.mean(axis=0)

    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
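
A quick smoke test of the fold plumbing (DummyWrapper and the toy arrays are hypothetical, not part of the original run):

class DummyWrapper:
    """Stand-in exposing the same train/predict interface."""
    def train(self, X, y):
        self.mean = y.mean()
    def predict(self, X):
        return np.full(X.shape[0], self.mean)

rng = np.random.RandomState(SEED)
X_toy, y_toy = rng.rand(10, 3), rng.rand(10) + 1.0
oof_tr, oof_tx = get_oof(DummyWrapper(), X_toy, y_toy, X_toy, None, 2)
print(oof_tr.shape, oof_tx.shape)  # (10, 1) (10, 1)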

Modelling: Random Forest

In [28]:
rf_params = {
    'n_jobs': -1,
    'n_estimators': 100,
    'max_features': 0.2,
    'max_depth': 8,
    'min_samples_leaf': 2,
}
In [30]:
%%time

NFOLDS = 2

rf = SklearnWrapper(clf=RandomForestRegressor,
                    seed=SEED,
                    params=rf_params)

rf_oof_train, rf_oof_test = get_oof(rf,Xtr,ytr,
                                    Xtx,ytx,NFOLDS)

print("RF-CV: {}".format(mean_absolute_error(ytr, rf_oof_train)))
RF-CV: 1292.8756371081363
CPU times: user 4min, sys: 2.64 s, total: 4min 3s
Wall time: 1min 45s

Modelling: XGBoost

In [31]:
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'reg:linear',
    'max_depth': 7,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'mae',
    'nrounds': 350  # consumed by XgbWrapper, not passed to xgb.train
}
In [32]:
%%time

xg = XgbWrapper(seed=SEED,
                params=xgb_params)

xg_oof_train, xg_oof_test = get_oof(xg,Xtr,ytr,
                                    Xtx,ytx,NFOLDS)

print("XG-CV: {}".format(mean_absolute_error(ytr,xg_oof_train)))
XG-CV: 1155.908488809834
CPU times: user 22min 51s, sys: 9.62 s, total: 23min 1s
Wall time: 23min 57s

Modelling: Extra Trees Regressor

In [33]:
et_params = {
    'n_jobs': -1,
    'n_estimators': 100,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
}
In [34]:
%%time

et = SklearnWrapper(clf=ExtraTreesRegressor,
                    seed=SEED,
                    params=et_params)

et_oof_train, et_oof_test = get_oof(et,Xtr,ytr,
                                    Xtx,ytx,NFOLDS)

print("ET-CV: {}".format(mean_absolute_error(ytr,et_oof_train)))
ET-CV: 1243.4451960986753
CPU times: user 19min 35s, sys: 10.7 s, total: 19min 46s
Wall time: 9min 19s

Stacking multiple models

In [35]:
def xg_eval_mae(yhat, dtrain):
    # labels and predictions are on the log scale; report MAE in original units
    y = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(y), np.exp(yhat))
In [36]:
Xtr_stacked = np.c_[xg_oof_train,
                    et_oof_train,
                    rf_oof_train]
Xtx_stacked = np.c_[xg_oof_test,
                    et_oof_test,
                    rf_oof_test]

dtrain = xgb.DMatrix(Xtr_stacked, label=np.log(ytr))
# the level-2 model sees only the 3 stacked columns, so dtest must be
# built from Xtx_stacked, not the full 672-feature test matrix
dtest = xgb.DMatrix(Xtx_stacked)

xgb_params = {
    'seed': SEED,
    'colsample_bytree': 0.8,
    'silent': 1,
    'subsample': 0.6,
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'mae',
}

res = xgb.cv(xgb_params,
             dtrain,
             num_boost_round=500,
             nfold=4,
             seed=SEED,
             stratified=False,
             early_stopping_rounds=25,
             verbose_eval=10,
             show_stdv=True,
             feval=xg_eval_mae,
             maximize=False)

best_nrounds = res.shape[0] - 1
cv_mean = res.iloc[-1, 0]
cv_std = res.iloc[-1, 1]

print('Stacked-CV: {0}+{1}'.format(cv_mean, cv_std))
[0]	train-mae:1522.61+1515.5	test-mae:1522.61+1515.52
[10]	train-mae:1521.41+1514.97	test-mae:1521.41+1514.99
[20]	train-mae:1519.59+1513.78	test-mae:1519.6+1513.8
[30]	train-mae:1516.86+1511.6	test-mae:1516.86+1511.62
[40]	train-mae:1512.8+1508.04	test-mae:1512.8+1508.06
[50]	train-mae:1506.96+1502.66	test-mae:1506.96+1502.68
[60]	train-mae:1498.84+1494.95	test-mae:1498.84+1494.97
[70]	train-mae:1487.95+1484.43	test-mae:1487.95+1484.45
[80]	train-mae:1473.83+1470.64	test-mae:1473.83+1470.67
[90]	train-mae:1456.1+1453.22	test-mae:1456.11+1453.25
[100]	train-mae:1434.48+1431.88	test-mae:1434.49+1431.91
[110]	train-mae:1408.86+1406.51	test-mae:1408.88+1406.55
[120]	train-mae:1379.28+1377.15	test-mae:1379.3+1377.19
[130]	train-mae:1345.88+1343.95	test-mae:1345.91+1344
[140]	train-mae:1309+1307.26	test-mae:1309.04+1307.32
[150]	train-mae:1269.11+1267.53	test-mae:1269.16+1267.6
[160]	train-mae:1226.66+1225.24	test-mae:1226.74+1225.33
[170]	train-mae:1182.4+1181.1	test-mae:1182.49+1181.21
[180]	train-mae:1136.97+1135.8	test-mae:1137.08+1135.93
[190]	train-mae:1091.14+1090.07	test-mae:1091.29+1090.24
[200]	train-mae:1045.6+1044.62	test-mae:1045.78+1044.83
[210]	train-mae:1000.99+1000.1	test-mae:1001.23+1000.35
[220]	train-mae:957.993+957.17	test-mae:958.292+957.486
[230]	train-mae:917.234+916.474	test-mae:917.596+916.851
[240]	train-mae:878.959+878.253	test-mae:879.373+878.681
[250]	train-mae:843.477+842.817	test-mae:843.981+843.336
[260]	train-mae:810.927+810.306	test-mae:811.519+810.913
[270]	train-mae:781.36+780.773	test-mae:782.046+781.473
[280]	train-mae:754.747+754.188	test-mae:755.558+755.013
[290]	train-mae:731.084+730.549	test-mae:732.007+731.486
[300]	train-mae:710.168+709.653	test-mae:711.196+710.694
[310]	train-mae:691.817+691.32	test-mae:692.933+692.447
[320]	train-mae:675.695+675.211	test-mae:676.925+676.453
[330]	train-mae:661.707+661.235	test-mae:663.038+662.577
[340]	train-mae:649.562+649.1	test-mae:650.984+650.532
[350]	train-mae:639.074+638.62	test-mae:640.593+640.148
[360]	train-mae:630.056+629.609	test-mae:631.674+631.236
[370]	train-mae:622.28+621.838	test-mae:623.984+623.551
[380]	train-mae:615.611+615.174	test-mae:617.394+616.965
[390]	train-mae:609.916+609.482	test-mae:611.766+611.342
[400]	train-mae:605.033+604.602	test-mae:606.967+606.547
[410]	train-mae:600.837+600.41	test-mae:602.838+602.42
[420]	train-mae:597.292+596.867	test-mae:599.362+598.947
[430]	train-mae:594.259+593.835	test-mae:596.402+595.989
[440]	train-mae:591.671+591.249	test-mae:593.878+593.467
[450]	train-mae:589.476+589.055	test-mae:591.748+591.337
[460]	train-mae:587.632+587.212	test-mae:589.964+589.555
[470]	train-mae:586.027+585.608	test-mae:588.421+588.013
[480]	train-mae:584.634+584.215	test-mae:587.083+586.676
[490]	train-mae:583.449+583.031	test-mae:585.947+585.541
[499]	train-mae:582.531+582.113	test-mae:585.066+584.66
Stacked-CV: 582.5307191249999+582.1132548840304
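
Note that res.iloc[-1, 0] selects a column by position; depending on the xgboost version the first column can be the train rather than the test metric (the printed value above matches the final train-mae exactly), so selecting by name is safer. A hedged alternative, assuming the columns carry the usual names for a metric called 'mae':

cv_mean = res['test-mae-mean'].iloc[-1]
cv_std = res['test-mae-std'].iloc[-1]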

Model Estimation on Test Set

In [40]:
# dtrain = xgb.DMatrix(Xtr_stacked, label=np.log(ytr))
# dtest = xgb.DMatrix(Xtx_stacked)

gbdt = xgb.train(xgb_params, dtrain, best_nrounds)

output = np.exp(gbdt.predict(dtest))

output[:5]
As originally run, this cell raised ValueError: feature_names mismatch (['f0', 'f1', 'f2'] vs ['f0', ..., 'f671']): dtest had been built from the full 672-feature test matrix Xtx, while the stacked model was trained on only the 3 out-of-fold columns. Building dtest from Xtx_stacked, as done above, resolves the mismatch.
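With predictions in hand, a submission file could be written along these lines (a sketch; it assumes the competition expects id and loss columns and that the test ids are available as df_test['id']):

sub = pd.DataFrame({'id': df_test['id'], 'loss': output})
sub.to_csv('submission.csv', index=False)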

Time Taken

In [ ]:
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))