%%capture
# %%capture suppresses this cell's output, so nothing below is printed in the notebook

import os
import sys
# Colab check: True when the `google.colab` module is already present in
# sys.modules for this interpreter.
ENV_COLAB = 'google.colab' in sys.modules

# Only install/upgrade packages when running on Colab.
if ENV_COLAB:
    ## install modules
    # watermark  -> provides the %watermark magic used later in this notebook
    # scikit-plot -> imported later as `scikitplot`
    !pip install watermark
    !pip install scikit-plot
    !pip install -U scikit-learn # tweedie needs > 0.23

    ## print
    # This message is swallowed by the %%capture magic at the top of the cell.
    print('Environment: Google Colaboratory.')

# NOTE: After upgrading modules in Google Colab, restart the runtime so the new versions are picked up.


import scikitplot as skplot


# Choose the XGBoost tree method: 'gpu_hist' when TensorFlow reports a GPU
# device name, otherwise the default 'auto'.
TREE_METHOD = 'auto'
try:
    import tensorflow as tf
    # A non-empty device name is treated as "a GPU is available".
    has_gpu = tf.test.gpu_device_name()
    TREE_METHOD = 'gpu_hist' if has_gpu else 'auto'
except Exception:
    # TensorFlow is missing or the device query failed: stay on the CPU
    # default. `Exception` (instead of a bare `except:`) lets
    # KeyboardInterrupt and SystemExit propagate.
    TREE_METHOD = 'auto'

print(TREE_METHOD)

auto


!pwd
!ls

/content
sample_data


# On Colab, download the raw and processed French Motor Claims files into the
# current working directory.
# Files fetched through the ".../blob/...?raw=true" URLs are saved with the
# "?raw=true" suffix in their names (see the wget log below); a later cell
# renames them while moving them under ../data.
if ENV_COLAB:
    # raw
    !wget -c "https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/raw/freMTPL2freq.csv?raw=true"
    !wget -c "https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/raw/freMTPL2sev.csv?raw=true"

    # processed
    !wget -c "https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/processed/clean_data.csv.zip?raw=true"
    !wget -c "https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/processed/features.json"
    !wget -c "https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/processed/X.csv.zip?raw=true"
    !wget -c "https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/processed/X.npz?raw=true"

--2020-09-27 15:07:11--  https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/raw/freMTPL2freq.csv?raw=true
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/bhishanpdl/Datasets/raw/master/Projects/French_Motor_Claims/raw/freMTPL2freq.csv [following]
--2020-09-27 15:07:11--  https://github.com/bhishanpdl/Datasets/raw/master/Projects/French_Motor_Claims/raw/freMTPL2freq.csv
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/raw/freMTPL2freq.csv [following]
--2020-09-27 15:07:11--  https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/raw/freMTPL2freq.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31528920 (30M) [text/plain]
Saving to: ‘freMTPL2freq.csv?raw=true’

freMTPL2freq.csv?ra 100%[===================>]  30.07M  39.2MB/s    in 0.8s    

2020-09-27 15:07:12 (39.2 MB/s) - ‘freMTPL2freq.csv?raw=true’ saved [31528920/31528920]

--2020-09-27 15:07:12--  https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/raw/freMTPL2sev.csv?raw=true
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/bhishanpdl/Datasets/raw/master/Projects/French_Motor_Claims/raw/freMTPL2sev.csv [following]
--2020-09-27 15:07:12--  https://github.com/bhishanpdl/Datasets/raw/master/Projects/French_Motor_Claims/raw/freMTPL2sev.csv
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/raw/freMTPL2sev.csv [following]
--2020-09-27 15:07:12--  https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/raw/freMTPL2sev.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 376493 (368K) [text/plain]
Saving to: ‘freMTPL2sev.csv?raw=true’

freMTPL2sev.csv?raw 100%[===================>] 367.67K  --.-KB/s    in 0.05s   

2020-09-27 15:07:13 (6.66 MB/s) - ‘freMTPL2sev.csv?raw=true’ saved [376493/376493]

--2020-09-27 15:07:13--  https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/processed/clean_data.csv.zip?raw=true
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/bhishanpdl/Datasets/raw/master/Projects/French_Motor_Claims/processed/clean_data.csv.zip [following]
--2020-09-27 15:07:13--  https://github.com/bhishanpdl/Datasets/raw/master/Projects/French_Motor_Claims/processed/clean_data.csv.zip
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/processed/clean_data.csv.zip [following]
--2020-09-27 15:07:13--  https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/processed/clean_data.csv.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 902582 (881K) [application/zip]
Saving to: ‘clean_data.csv.zip?raw=true’

clean_data.csv.zip? 100%[===================>] 881.43K  --.-KB/s    in 0.1s    

2020-09-27 15:07:14 (7.44 MB/s) - ‘clean_data.csv.zip?raw=true’ saved [902582/902582]

--2020-09-27 15:07:14--  https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/processed/features.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1272 (1.2K) [text/plain]
Saving to: ‘features.json’

features.json       100%[===================>]   1.24K  --.-KB/s    in 0s      

2020-09-27 15:07:14 (72.3 MB/s) - ‘features.json’ saved [1272/1272]

--2020-09-27 15:07:14--  https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/processed/X.csv.zip?raw=true
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/bhishanpdl/Datasets/raw/master/Projects/French_Motor_Claims/processed/X.csv.zip [following]
--2020-09-27 15:07:14--  https://github.com/bhishanpdl/Datasets/raw/master/Projects/French_Motor_Claims/processed/X.csv.zip
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/processed/X.csv.zip [following]
--2020-09-27 15:07:14--  https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/processed/X.csv.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1369068 (1.3M) [application/zip]
Saving to: ‘X.csv.zip?raw=true’

X.csv.zip?raw=true  100%[===================>]   1.30M  --.-KB/s    in 0.1s    

2020-09-27 15:07:14 (11.3 MB/s) - ‘X.csv.zip?raw=true’ saved [1369068/1369068]

--2020-09-27 15:07:15--  https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/processed/X.npz?raw=true
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/bhishanpdl/Datasets/raw/master/Projects/French_Motor_Claims/processed/X.npz [following]
--2020-09-27 15:07:15--  https://github.com/bhishanpdl/Datasets/raw/master/Projects/French_Motor_Claims/processed/X.npz
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/processed/X.npz [following]
--2020-09-27 15:07:15--  https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/processed/X.npz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1030881 (1007K) [application/octet-stream]
Saving to: ‘X.npz?raw=true’

X.npz?raw=true      100%[===================>]   1007K  --.-KB/s    in 0.09s   

2020-09-27 15:07:15 (10.7 MB/s) - ‘X.npz?raw=true’ saved [1030881/1030881]

!ls

'clean_data.csv.zip?raw=true'  'freMTPL2sev.csv?raw=true'  'X.npz?raw=true'
 features.json		        sample_data
'freMTPL2freq.csv?raw=true'    'X.csv.zip?raw=true'


if ENV_COLAB:

    # clean old dir
    !rm -rf ../data

    !mkdir -p ../data/processed
    !mkdir -p ../data/raw
    !mkdir -p ../outputs

    !mv freMTPL2freq.csv?raw=true ../data/raw/freMTPL2freq.csv
    !mv freMTPL2sev.csv?raw=true ../data/raw/freMTPL2sev.csv

    !mv features.json ../data/processed/features.json
    !mv clean_data.csv.zip?raw=true ../data/processed/clean_data.csv.zip
    !mv X.csv.zip?raw=true ../data/processed/X.csv.zip
    !mv X.npz?raw=true ../data/processed/X.npz

!ls

sample_data


!ls ../data/raw

freMTPL2freq.csv  freMTPL2sev.csv


!ls ../data/processed

clean_data.csv.zip  features.json  X.csv.zip  X.npz


import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import sklearn
import scipy
import matplotlib.pyplot as plt
sns.set()

import json
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_tweedie_deviance
from sklearn.metrics import auc

# Global random seed reused for the train/test split and cross-validation.
SEED = 100

# Use the fully-qualified option name: the abbreviated 'max_columns' matches
# more than one option on newer pandas (display.max_columns and
# styler.render.max_columns) and raises OptionError there.
pd.set_option('display.max_columns', 100)
pd.set_option('plotting.backend', 'matplotlib')  # matplotlib, bokeh, altair, plotly
# Load the watermark extension and print the versions of the imported modules
# (the listing that follows this cell).
%load_ext watermark
%watermark -iv

numpy      1.18.5
pandas     1.0.5
scikitplot 0.3.7
seaborn    0.10.1
sklearn    0.23.2
scipy      1.4.1
joblib     0.16.0
json       2.0.9
tensorflow 2.3.0

/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm


%%bash
pwd

/content


# Path of the cleaned, zipped dataset; the commented URL is the remote copy of
# the same file.
ifile = '../data/processed/clean_data.csv.zip'
# ifile = "https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/processed/clean_data.csv.zip?raw=true"


df = pd.read_csv(ifile, compression='zip')
print(df.shape)
# Preview the first and last two rows. DataFrame.append is deprecated (and
# removed in pandas 2.0), so the preview is built with pd.concat instead.
pd.concat([df.head(2), df.tail(2)])

(100000, 15)


# `import scipy` alone does not guarantee that the `sparse` submodule is
# loaded (it only worked when another package had imported it first), so
# import it explicitly before use.
import scipy.sparse

# Pre-computed sparse design matrix stored next to the cleaned data.
X = scipy.sparse.load_npz("../data/processed/X.npz")
X.shape, type(X)

((100000, 71), scipy.sparse.csr.csr_matrix)


# ifile = '../data/processed/X.csv.zip'
# df_X = pd.read_csv(ifile, compression='zip')
# print(df_X.shape)
# df_X.head(2).append(df_X.tail(2))

# Column names of X (the transformed features), kept here for reference
"""
x0_B1	x0_B10	x0_B11	x0_B12	x0_B13	x0_B14	x0_B2	x0_B3	x0_B4	x0_B5
x0_B6	x1_4	x1_5	x1_6	x1_7	x1_8	x1_9	x1_10	x1_11	x1_12
x1_13	x1_14	x1_15	x2_Diesel	x2_Regular	x3_R11	x3_R21	x3_R22	x3_R23
x3_R24	x3_R25	x3_R26	x3_R31	x3_R41	x3_R42	x3_R43	x3_R52	x3_R53	x3_R54
x3_R72	x3_R73	x3_R74	x3_R82	x3_R83	x3_R91	x3_R93	x3_R94	x4_A	x4_B
x4_C	x4_D	x4_E	x4_F	VehAge_0	VehAge_1	VehAge_2	VehAge_3
VehAge_4	VehAge_5	VehAge_6	VehAge_7	DrivAge_0	DrivAge_1	DrivAge_2	
DrivAge_3	DrivAge_4	DrivAge_5	DrivAge_6	DrivAge_7	Density	BonusMalus

X only have transformed version of these columns
cols_ohe_before = ["VehBrand", "VehPower", "VehGas", "Region", "Area"]
cols_kbin_before = ["VehAge", "DrivAge"]
cols_log_scale = ["Density"]
cols_pass =  ["BonusMalus"]
""";


df.head(2)


# Densify only the first row of X and show its last five entries.
X[0].toarray()[0, -5:]

array([ 0.        ,  1.        ,  0.        ,  0.69864446, 50.        ])


# Read the feature-description JSON that ships with the processed data.
with open("../data/processed/features.json") as features_file:
    json_features = json.loads(features_file.read())


json_features.keys()

dict_keys(['cols_ohe_before', 'cols_kbin', 'cols_log_scale', 'cols_pass', 'feature_names_before', 'feature_names_after', 'desc'])


from sklearn.model_selection import train_test_split


# Split the data frame and the sparse matrix in a single call so that both
# receive the same row partition (seeded for reproducibility).
df_train, df_test, X_train, X_test = train_test_split(
    df, X, random_state=SEED)

# Shapes of the four resulting pieces.
tuple(part.shape for part in (df_train, df_test, X_train, X_test))

((75000, 15), (25000, 15), (75000, 71), (25000, 71))


import xgboost as xgb


# Column of df used as the regression label.
target = 'PurePremium'

# Labels as flat 1-D NumPy arrays.
y_train = np.ravel(df_train[target].to_numpy())
y_test = np.ravel(df_test[target].to_numpy())

# XGBoost containers built from dense copies of the sparse matrices.
dtrain = xgb.DMatrix(X_train.todense(), label=y_train)
dtest = xgb.DMatrix(X_test.todense(), label=y_test)


# Exposure offsets
# Ref: https://towardsdatascience.com/offsetting-the-model-logic-to-implementation-7e333bc25798
# Each library has its own way of supplying an offset:
#   statsmodels: exposure
#   xgboost:     set_base_margin with the log of the exposure
#   lightgbm:    set_init_score()
# NOTE(review): if PurePremium is already a per-unit-exposure quantity, an
# exposure offset may count exposure twice -- confirm the target definition.
for dmat, frame in ((dtrain, df_train), (dtest, df_test)):
    dmat.set_base_margin(np.log(frame['Exposure'].to_numpy()))


%%time
params = {
    'objective':'reg:tweedie',
    'colsample_bytree': 1.0,
    'learning_rate': 0.01,
    'gamma':1.5,
    'max_depth': 2,
    'subsample':0.6, 
    'reg_alpha': 0,
    'reg_lambda':1,
    'min_child_weight':5,
    'n_estimators':2000,
    'tweedie_variance_power':1.9}

model_xgb = xgb.train(params=params, dtrain=dtrain, num_boost_round=1000)

CPU times: user 3min 2s, sys: 250 ms, total: 3min 2s
Wall time: 1min 32s


# Predictions for the training and test matrices.
tr_preds, tx_preds = (model_xgb.predict(dm) for dm in (dtrain, dtest))


# Error metrics per split.
tr_mae, tr_mse = (mean_absolute_error(y_train, tr_preds),
                  mean_squared_error(y_train, tr_preds))
tx_mae, tx_mse = (mean_absolute_error(y_test, tx_preds),
                  mean_squared_error(y_test, tx_preds))

# Summary table; the 'D2' row is a placeholder that is left as NaN here.
df_eval_twd = pd.DataFrame(
    {'train': [np.nan, tr_mae, tr_mse],
     'test': [np.nan, tx_mae, tx_mse]},
    index=['D2', 'mean_absolute_error', 'mean_squared_error'])
df_eval_twd


%%time
cv_results = xgb.cv(dtrain=dtrain, params=params, nfold=3,
                    num_boost_round=2000,
                    early_stopping_rounds=10,
                    metrics="rmse",
                    as_pandas=True,
                    seed=SEED)

CPU times: user 3min 13s, sys: 341 ms, total: 3min 13s
Wall time: 1min 38s


cv_results.shape

(448, 4)


cv_results.tail()

	train	test
D2	NaN	NaN
mean_absolute_error	1.760538e+03	1.588351e+03
mean_squared_error	1.481952e+09	1.659363e+08

	train-rmse-mean	train-rmse-std	test-rmse-mean	test-rmse-std
443	34847.445312	16248.746864	28785.776855	25490.221577
444	34847.441732	16248.751926	28785.770670	25490.226078
445	34847.438151	16248.754184	28785.763672	25490.228446
446	34847.434245	16248.756901	28785.758463	25490.232558
447	34847.427409	16248.758325	28785.754232	25490.235459

Table of Contents

Colab

Load the libraries

Load the data

Train Test Split

Modelling: XGBoost Tweedie regressor

	Exposure	Area	VehPower	VehAge	DrivAge	BonusMalus	VehBrand	VehGas	Density	Region
0	0.10	D	5	0	55	50	B12	Regular	1217	R82
1	0.77	D	5	0	55	50	B12	Regular	1217	R82
99998	0.90	C	7	9	44	50	B1	Regular	191	R24
99999	0.90	E	4	12	53	50	B1	Regular	4116	R24