The Allstate Corporation is an American insurance company. Besides its United States business, the company also has personal lines insurance operations in Canada.
import time
time_start_notebook = time.time()
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm  # modern import path; tqdm_notebook is deprecated
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot')
# random state
SEED=100
[(x.__name__,x.__version__) for x in [np,pd,sns]]
from scipy.special import boxcox1p
import scipy
import sklearn
print([(x.__name__,x.__version__) for x in [scipy, sklearn]])
# scale and split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# regressors
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
# pipeline
from sklearn.pipeline import Pipeline
# metrics
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
# cross validation
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
# gradient boosting (used as a regressor below)
import xgboost as xgb
# serialization helpers
import six
import pickle
import joblib
%load_ext autoreload
%autoreload 2
# my personal library
from bhishan import bp
path_pro = '../data/processed'
df_train = pd.read_csv(f'{path_pro}/train_cleaned_encoded.csv',index_col=0)
df_test = pd.read_csv(f'{path_pro}/test_cleaned_encoded.csv',index_col=0)
print(df_train.shape)
df_train.head(2)
df_train.shape, df_test.shape
features_only_train = df_train.columns.difference(df_test.columns).tolist()
print(features_only_train)
features_only_test = df_test.columns.difference(df_train.columns).tolist()
print(features_only_test)
features_only_one = features_only_train + features_only_test
print(features_only_one)
"""
continuous features: xxx_boxcox1p
categorical features: dummy_xxx
target: loss_log1p
""";
features_raw = df_train.columns.tolist()
target = 'loss'
target_log = 'loss_log1p'
# orig
features_orig_cont = [i for i in features_raw
if i.startswith('cont')
if not i.endswith('_boxcox1p')
]
features_orig_cat = [i for i in features_raw
if i.startswith('cat') ]
features_orig = features_orig_cat + features_orig_cont
# processed
features_cont = [i for i in features_raw if i.endswith('_boxcox1p') ]
features_cat = [i for i in features_raw if i.startswith('dummy_') ]
features = features_cat + features_cont
features = [i for i in features if i not in features_only_one]
# print
# print(features_orig_cont)
# print(features_orig_cat)
# print(features_cont)
# print(features_cat)
df_Xtr, df_Xval, ser_ytr, ser_yval = train_test_split(
df_train[features],df_train[target],
train_size=0.8,random_state=SEED)
Xtr = df_Xtr.to_numpy()
ytr = ser_ytr.to_numpy().ravel()
yval = ser_yval.to_numpy().ravel()
print(df_Xtr.shape, ser_ytr.shape,ytr.shape)
df_Xtr.head(2)
df_Xtx = df_test[features]
Xtx = df_Xtx.to_numpy()
ser_ytx = None
ytx = None
# Kaggle does not provide test labels; predictions must be submitted for scoring.
print(df_Xtx.shape)
df_Xtx.head(2)
scaler = StandardScaler()
Xtr_scaled = scaler.fit_transform(df_Xtr)
Xval_scaled = scaler.transform(df_Xval)  # transform only: reuse training statistics to avoid leakage
print(type(Xtr_scaled))
df_Xtr_scaled = pd.DataFrame(Xtr_scaled,columns=df_Xtr.columns)
df_Xval_scaled = pd.DataFrame(Xval_scaled,columns=df_Xval.columns)
df_Xtr_scaled.head(2)
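# Note: the tree-based models used below are insensitive to feature scaling;
# these scaled frames would mainly matter for linear models such as the
# Lasso/ElasticNet imported above.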
class SklearnWrapper(object):
    def __init__(self, clf, seed=100, params=None):
        # copy so the caller's dict is not mutated; handle params=None
        params = {} if params is None else dict(params)
        params['random_state'] = seed
        self.clf = clf(**params)
    def train(self, Xtrain, ytrain):
        # the loss target is heavily right-skewed, so fit on log targets
        self.clf.fit(Xtrain, np.log(ytrain))
    def predict(self, x):
        # invert the log transform back to original units
        return np.exp(self.clf.predict(x))
class XgbWrapper(object):
    def __init__(self, seed=100, params=None):
        # copy so that pop() below does not mutate the caller's dict
        self.param = {} if params is None else dict(params)
        self.param['seed'] = seed
        self.nrounds = self.param.pop('nrounds', 250)
    def train(self, Xtrain, ytrain):
        # fit on log targets, matching SklearnWrapper
        dtrain = xgb.DMatrix(Xtrain, label=np.log(ytrain))
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)
    def predict(self, x):
        return np.exp(self.gbdt.predict(xgb.DMatrix(x)))
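# The wrappers give every base model the same train/predict interface. As an
# illustrative extra (not used below), a linear base model also fits this
# pattern; 'alpha' is an assumed value and Lasso would prefer the scaled data:
lasso = SklearnWrapper(clf=Lasso, seed=SEED, params={'alpha': 0.0005})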
def get_oof(clf, Xtr, ytr, Xtx, ytx, NFOLDS):
    """Out-of-fold estimation.

    Parameters:
    ------------
    clf: wrapper object with train and predict methods
    Xtr: numpy array of training data
    ytr: numpy array of training targets
    Xtx: numpy array of test data
    ytx: numpy array of test labels (None here; Kaggle withholds them)
    NFOLDS: number of folds
    """
    ntrain = Xtr.shape[0]
    ntest = Xtx.shape[0]
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_kf = np.empty((NFOLDS, ntest))
    kf = KFold(n_splits=NFOLDS,
               shuffle=True,
               random_state=SEED)
    for i, (idx_tr, idx_tx) in enumerate(kf.split(Xtr, ytr)):
        xtr_now = Xtr[idx_tr]
        ytr_now = ytr[idx_tr]
        xtx_now = Xtr[idx_tx]
        clf.train(xtr_now, ytr_now)
        # held-out-fold predictions become the training meta-feature
        oof_train[idx_tx] = clf.predict(xtx_now)
        # each fold's model predicts the full test set
        oof_test_kf[i, :] = clf.predict(Xtx)
    # average the per-fold test predictions into one column
    oof_test[:] = oof_test_kf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
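# Optional convenience sketch (not in the original): each base model below
# repeats the same get_oof + MAE pattern, which could be wrapped as follows.
def report_oof(name, wrapper, nfolds=2):
    """Run get_oof for one wrapped model and print its out-of-fold MAE."""
    oof_tr, oof_tx = get_oof(wrapper, Xtr, ytr, Xtx, ytx, nfolds)
    print('{}-CV MAE: {:.4f}'.format(name, mean_absolute_error(ytr, oof_tr)))
    return oof_tr, oof_tx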
rf_params = {
'n_jobs': -1,
'n_estimators': 100,
'max_features': 0.2,
'max_depth': 8,
'min_samples_leaf': 2,
}
%%time
NFOLDS = 2
rf = SklearnWrapper(clf=RandomForestRegressor,
seed=SEED,
params=rf_params)
rf_oof_train, rf_oof_test = get_oof(rf,Xtr,ytr,
Xtx,ytx,NFOLDS)
print("RF-CV: {}".format(mean_absolute_error(ytr, rf_oof_train)))
xgb_params = {
    'seed': 0,                  # overwritten with SEED by XgbWrapper
    'colsample_bytree': 0.7,
    'silent': 1,                # replaced by 'verbosity' in newer xgboost
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'reg:linear',  # renamed 'reg:squarederror' in xgboost >= 1.0
    'max_depth': 7,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'mae',
    'nrounds': 350              # consumed by XgbWrapper, not passed to xgb.train
}
%%time
xg = XgbWrapper(seed=SEED,
params=xgb_params)
xg_oof_train, xg_oof_test = get_oof(xg,Xtr,ytr,
Xtx,ytx,NFOLDS)
print("XG-CV: {}".format(mean_absolute_error(ytr,xg_oof_train)))
et_params = {
'n_jobs': -1,
'n_estimators': 100,
'max_features': 0.5,
'max_depth': 12,
'min_samples_leaf': 2,
}
%%time
et = SklearnWrapper(clf=ExtraTreesRegressor,
seed=SEED,
params=et_params)
et_oof_train, et_oof_test = get_oof(et,Xtr,ytr,
Xtx,ytx,NFOLDS)
print("ET-CV: {}".format(mean_absolute_error(ytr,et_oof_train)))
def xg_eval_mae(yhat, dtrain):
    # labels and predictions are in log space; score in original units
    y = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(y), np.exp(yhat))
Xtr_stacked = np.c_[xg_oof_train,
et_oof_train,
rf_oof_train]
Xtx_stacked = np.c_[xg_oof_test,
et_oof_test,
rf_oof_test]
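# sanity check: one meta-feature column per base model,
# so shapes should be (n_train, 3) and (n_test, 3)
print(Xtr_stacked.shape, Xtx_stacked.shape)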
dtrain = xgb.DMatrix(Xtr_stacked, label=np.log(ytr))
# the meta-model is trained on the stacked meta-features,
# so the test DMatrix must also use the stacked matrix, not raw Xtx
dtest = xgb.DMatrix(Xtx_stacked)
xgb_params = {
    'seed': SEED,
    'colsample_bytree': 0.8,
    'silent': 1,                # replaced by 'verbosity' in newer xgboost
    'subsample': 0.6,
    'learning_rate': 0.01,
    'objective': 'reg:linear',  # renamed 'reg:squarederror' in xgboost >= 1.0
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'mae',
}
res = xgb.cv(xgb_params,
dtrain,
num_boost_round=500,
nfold=4,
seed=SEED,
stratified=False,
early_stopping_rounds=25,
verbose_eval=10,
show_stdv=True,
feval=xg_eval_mae,
maximize=False)
best_nrounds = res.shape[0] - 1
# select by column name; the column order differs across xgboost versions
cv_mean = res['test-mae-mean'].iloc[-1]
cv_std = res['test-mae-std'].iloc[-1]
print('Stacked-CV: {0}+{1}'.format(cv_mean, cv_std))
gbdt = xgb.train(xgb_params, dtrain, best_nrounds)
output = np.exp(gbdt.predict(dtest))
output[:5]
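# Sketch of a Kaggle submission file, assuming the Allstate Claims Severity
# format with 'id' and 'loss' columns and df_test indexed by id.
sub = pd.DataFrame({'id': df_test.index, 'loss': output})
sub.to_csv('submission.csv', index=False)
sub.head(2)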
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))