%%capture
# capture will not print in notebook
import os
import sys
# True when the google.colab module is already loaded, i.e. the notebook is
# running inside Google Colaboratory.
ENV_COLAB = 'google.colab' in sys.modules
# On Colab only: install the extra packages used by this notebook.
if ENV_COLAB:
## install modules
!pip install watermark
!pip install scikit-plot
!pip install -U scikit-learn # tweedie needs > 0.23
## print
print('Environment: Google Colaboratory.')
# NOTE: If we update modules in gcolab, we need to restart runtime.
import scikitplot as skplot
# Pick the XGBoost tree construction method: use the GPU histogram algorithm
# only when TensorFlow is importable and reports a GPU device; otherwise keep
# XGBoost's automatic choice.
TREE_METHOD = 'auto'
try:
    import tensorflow as tf

    # gpu_device_name() returns an empty string when no GPU is available.
    has_gpu = tf.test.gpu_device_name()
    TREE_METHOD = 'gpu_hist' if has_gpu else 'auto'
except Exception:
    # Best-effort probe: a missing or broken TensorFlow install must not stop
    # the notebook. `Exception` (rather than a bare `except`) still lets
    # KeyboardInterrupt / SystemExit propagate.
    TREE_METHOD = 'auto'
print(TREE_METHOD)
auto
# Show the current working directory and list its contents.
!pwd
!ls
/content sample_data
# On Colab only: download the raw and processed French Motor Claims files
# into the current directory (`wget -c` resumes a partial download).
# NOTE: URLs ending in "?raw=true" are saved with that suffix in the file
# name; the files are renamed when they are moved into ../data later on.
if ENV_COLAB:
# raw
!wget -c "https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/raw/freMTPL2freq.csv?raw=true"
!wget -c "https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/raw/freMTPL2sev.csv?raw=true"
# processed
!wget -c "https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/processed/clean_data.csv.zip?raw=true"
!wget -c "https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/processed/features.json"
!wget -c "https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/processed/X.csv.zip?raw=true"
!wget -c "https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/processed/X.npz?raw=true"
--2020-09-27 15:07:11-- https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/raw/freMTPL2freq.csv?raw=true Resolving github.com (github.com)... 140.82.112.3 Connecting to github.com (github.com)|140.82.112.3|:443... connected. HTTP request sent, awaiting response... 302 Found Location: https://github.com/bhishanpdl/Datasets/raw/master/Projects/French_Motor_Claims/raw/freMTPL2freq.csv [following] --2020-09-27 15:07:11-- https://github.com/bhishanpdl/Datasets/raw/master/Projects/French_Motor_Claims/raw/freMTPL2freq.csv Reusing existing connection to github.com:443. HTTP request sent, awaiting response... 302 Found Location: https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/raw/freMTPL2freq.csv [following] --2020-09-27 15:07:11-- https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/raw/freMTPL2freq.csv Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ... Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 31528920 (30M) [text/plain] Saving to: ‘freMTPL2freq.csv?raw=true’ freMTPL2freq.csv?ra 100%[===================>] 30.07M 39.2MB/s in 0.8s 2020-09-27 15:07:12 (39.2 MB/s) - ‘freMTPL2freq.csv?raw=true’ saved [31528920/31528920] --2020-09-27 15:07:12-- https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/raw/freMTPL2sev.csv?raw=true Resolving github.com (github.com)... 140.82.113.3 Connecting to github.com (github.com)|140.82.113.3|:443... connected. HTTP request sent, awaiting response... 
302 Found Location: https://github.com/bhishanpdl/Datasets/raw/master/Projects/French_Motor_Claims/raw/freMTPL2sev.csv [following] --2020-09-27 15:07:12-- https://github.com/bhishanpdl/Datasets/raw/master/Projects/French_Motor_Claims/raw/freMTPL2sev.csv Reusing existing connection to github.com:443. HTTP request sent, awaiting response... 302 Found Location: https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/raw/freMTPL2sev.csv [following] --2020-09-27 15:07:12-- https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/raw/freMTPL2sev.csv Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ... Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 376493 (368K) [text/plain] Saving to: ‘freMTPL2sev.csv?raw=true’ freMTPL2sev.csv?raw 100%[===================>] 367.67K --.-KB/s in 0.05s 2020-09-27 15:07:13 (6.66 MB/s) - ‘freMTPL2sev.csv?raw=true’ saved [376493/376493] --2020-09-27 15:07:13-- https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/processed/clean_data.csv.zip?raw=true Resolving github.com (github.com)... 140.82.113.3 Connecting to github.com (github.com)|140.82.113.3|:443... connected. HTTP request sent, awaiting response... 302 Found Location: https://github.com/bhishanpdl/Datasets/raw/master/Projects/French_Motor_Claims/processed/clean_data.csv.zip [following] --2020-09-27 15:07:13-- https://github.com/bhishanpdl/Datasets/raw/master/Projects/French_Motor_Claims/processed/clean_data.csv.zip Reusing existing connection to github.com:443. HTTP request sent, awaiting response... 
302 Found Location: https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/processed/clean_data.csv.zip [following] --2020-09-27 15:07:13-- https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/processed/clean_data.csv.zip Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ... Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 902582 (881K) [application/zip] Saving to: ‘clean_data.csv.zip?raw=true’ clean_data.csv.zip? 100%[===================>] 881.43K --.-KB/s in 0.1s 2020-09-27 15:07:14 (7.44 MB/s) - ‘clean_data.csv.zip?raw=true’ saved [902582/902582] --2020-09-27 15:07:14-- https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/processed/features.json Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ... Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 1272 (1.2K) [text/plain] Saving to: ‘features.json’ features.json 100%[===================>] 1.24K --.-KB/s in 0s 2020-09-27 15:07:14 (72.3 MB/s) - ‘features.json’ saved [1272/1272] --2020-09-27 15:07:14-- https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/processed/X.csv.zip?raw=true Resolving github.com (github.com)... 140.82.112.3 Connecting to github.com (github.com)|140.82.112.3|:443... connected. HTTP request sent, awaiting response... 302 Found Location: https://github.com/bhishanpdl/Datasets/raw/master/Projects/French_Motor_Claims/processed/X.csv.zip [following] --2020-09-27 15:07:14-- https://github.com/bhishanpdl/Datasets/raw/master/Projects/French_Motor_Claims/processed/X.csv.zip Reusing existing connection to github.com:443. 
HTTP request sent, awaiting response... 302 Found Location: https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/processed/X.csv.zip [following] --2020-09-27 15:07:14-- https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/processed/X.csv.zip Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ... Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 1369068 (1.3M) [application/zip] Saving to: ‘X.csv.zip?raw=true’ X.csv.zip?raw=true 100%[===================>] 1.30M --.-KB/s in 0.1s 2020-09-27 15:07:14 (11.3 MB/s) - ‘X.csv.zip?raw=true’ saved [1369068/1369068] --2020-09-27 15:07:15-- https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/processed/X.npz?raw=true Resolving github.com (github.com)... 140.82.114.4 Connecting to github.com (github.com)|140.82.114.4|:443... connected. HTTP request sent, awaiting response... 302 Found Location: https://github.com/bhishanpdl/Datasets/raw/master/Projects/French_Motor_Claims/processed/X.npz [following] --2020-09-27 15:07:15-- https://github.com/bhishanpdl/Datasets/raw/master/Projects/French_Motor_Claims/processed/X.npz Reusing existing connection to github.com:443. HTTP request sent, awaiting response... 302 Found Location: https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/processed/X.npz [following] --2020-09-27 15:07:15-- https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/French_Motor_Claims/processed/X.npz Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ... Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected. HTTP request sent, awaiting response... 
200 OK Length: 1030881 (1007K) [application/octet-stream] Saving to: ‘X.npz?raw=true’ X.npz?raw=true 100%[===================>] 1007K --.-KB/s in 0.09s 2020-09-27 15:07:15 (10.7 MB/s) - ‘X.npz?raw=true’ saved [1030881/1030881]
# List the working directory to check the downloaded file names.
!ls
'clean_data.csv.zip?raw=true' 'freMTPL2sev.csv?raw=true' 'X.npz?raw=true' features.json sample_data 'freMTPL2freq.csv?raw=true' 'X.csv.zip?raw=true'
if ENV_COLAB:
# clean old dir
!rm -rf ../data
!mkdir -p ../data/processed
!mkdir -p ../data/raw
!mkdir -p ../outputs
!mv freMTPL2freq.csv?raw=true ../data/raw/freMTPL2freq.csv
!mv freMTPL2sev.csv?raw=true ../data/raw/freMTPL2sev.csv
!mv features.json ../data/processed/features.json
!mv clean_data.csv.zip?raw=true ../data/processed/clean_data.csv.zip
!mv X.csv.zip?raw=true ../data/processed/X.csv.zip
!mv X.npz?raw=true ../data/processed/X.npz
# List what is left in the working directory.
!ls
sample_data
# List the raw data directory.
!ls ../data/raw
freMTPL2freq.csv freMTPL2sev.csv
# List the processed data directory.
!ls ../data/processed
clean_data.csv.zip features.json X.csv.zip X.npz
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import sklearn
import scipy
import matplotlib.pyplot as plt
sns.set()
import json
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_tweedie_deviance
from sklearn.metrics import auc
# Random seed reused for the train/test split and cross-validation.
SEED = 100

# Use the fully qualified option name: the abbreviated 'max_columns' is
# ambiguous on newer pandas (it also matches 'styler.render.max_columns')
# and raises OptionError there.
pd.set_option('display.max_columns', 100)
pd.set_option('plotting.backend', 'matplotlib')  # matplotlib, bokeh, altair, plotly
# Load the watermark extension and print the versions of the imported modules.
%load_ext watermark
%watermark -iv
numpy 1.18.5 pandas 1.0.5 scikitplot 0.3.7 seaborn 0.10.1 sklearn 0.23.2 scipy 1.4.1 joblib 0.16.0 json 2.0.9 tensorflow 2.3.0
/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead. import pandas.util.testing as tm
%%bash
# Print the working directory from a bash subprocess.
pwd
/content
# Load the cleaned claims table from the zipped CSV and preview its first and
# last two rows.
ifile = '../data/processed/clean_data.csv.zip'
# ifile = "https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/processed/clean_data.csv.zip?raw=true"
df = pd.read_csv(ifile, compression='zip')
print(df.shape)
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat
# yields the same frame with the original index labels preserved.
pd.concat([df.head(2), df.tail(2)])
(100000, 15)
ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
99998 | 0 | 0.90 | C | 7 | 9 | 44 | 50 | B1 | Regular | 191 | R24 | 0.0 | 0.0 | 0.0 | 0.0 |
99999 | 0 | 0.90 | E | 4 | 12 | 53 | 50 | B1 | Regular | 4116 | R24 | 0.0 | 0.0 | 0.0 | 0.0 |
# Load the pre-computed sparse feature matrix and show its shape and type.
npz_path = "../data/processed/X.npz"
X = scipy.sparse.load_npz(npz_path)
(X.shape, type(X))
((100000, 71), scipy.sparse.csr.csr_matrix)
# ifile = '../data/processed/X.csv.zip'
# df_X = pd.read_csv(ifile, compression='zip')
# print(df_X.shape)
# df_X.head(2).append(df_X.tail(2))
# columns
"""
x0_B1 x0_B10 x0_B11 x0_B12 x0_B13 x0_B14 x0_B2 x0_B3 x0_B4 x0_B5
x0_B6 x1_4 x1_5 x1_6 x1_7 x1_8 x1_9 x1_10 x1_11 x1_12
x1_13 x1_14 x1_15 x2_Diesel x2_Regular x3_R11 x3_R21 x3_R22 x3_R23
x3_R24 x3_R25 x3_R26 x3_R31 x3_R41 x3_R42 x3_R43 x3_R52 x3_R53 x3_R54
x3_R72 x3_R73 x3_R74 x3_R82 x3_R83 x3_R91 x3_R93 x3_R94 x4_A x4_B
x4_C x4_D x4_E x4_F VehAge_0 VehAge_1 VehAge_2 VehAge_3
VehAge_4 VehAge_5 VehAge_6 VehAge_7 DrivAge_0 DrivAge_1 DrivAge_2
DrivAge_3 DrivAge_4 DrivAge_5 DrivAge_6 DrivAge_7 Density BonusMalus
X only has the transformed versions of these columns:
cols_ohe_before = ["VehBrand", "VehPower", "VehGas", "Region", "Area"]
cols_kbin_before = ["VehAge", "DrivAge"]
cols_log_scale = ["Density"]
cols_pass = ["BonusMalus"]
""";
# Show the first two rows of df for comparison with the feature matrix.
df.head(2)
ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
# Last five entries of the first row of X, as a flat ndarray.
X[0].toarray()[0][-5:]
array([ 0. , 1. , 0. , 0.69864446, 50. ])
# Read the feature metadata saved next to the processed data and show which
# keys it provides.
features_path = "../data/processed/features.json"
with open(features_path) as fh:
    json_features = json.load(fh)
json_features.keys()
dict_keys(['cols_ohe_before', 'cols_kbin', 'cols_log_scale', 'cols_pass', 'feature_names_before', 'feature_names_after', 'desc'])
from sklearn.model_selection import train_test_split

# Split the data frame and the feature matrix with the same seeded shuffle so
# their rows stay aligned, then report the four resulting shapes.
df_train, df_test, X_train, X_test = train_test_split(
    df,
    X,
    random_state=SEED,
)
tuple(part.shape for part in (df_train, df_test, X_train, X_test))
((75000, 15), (25000, 15), (75000, 71), (25000, 71))
import xgboost as xgb

# Regression target and its train/test label vectors.
target = 'PurePremium'
y_train = df_train[target].to_numpy().ravel()
y_test = df_test[target].to_numpy().ravel()

# Densify with toarray(), which returns a plain ndarray; todense() returns
# the discouraged np.matrix type. The values handed to xgboost are identical.
# NOTE(review): the CSR matrices could be passed directly to save memory, but
# xgboost then presumably treats unstored zeros as missing values - confirm
# against the xgboost docs before switching.
dtrain = xgb.DMatrix(data=X_train.toarray(), label=y_train)
dtest = xgb.DMatrix(data=X_test.toarray(), label=y_test)
# Apply exposure offsets.
# Ref: https://towardsdatascience.com/offsetting-the-model-logic-to-implementation-7e333bc25798
# Each library has its own way of supplying an offset:
#   statsmodels: exposure
#   xgboost:     set_base_margin with the log of the exposure
#   lightgbm:    set_init_score()
train_offset = np.log(df_train['Exposure'].to_numpy())
test_offset = np.log(df_test['Exposure'].to_numpy())
dtrain.set_base_margin(train_offset)
dtest.set_base_margin(test_offset)
%%time
params = {
'objective':'reg:tweedie',
'colsample_bytree': 1.0,
'learning_rate': 0.01,
'gamma':1.5,
'max_depth': 2,
'subsample':0.6,
'reg_alpha': 0,
'reg_lambda':1,
'min_child_weight':5,
'n_estimators':2000,
'tweedie_variance_power':1.9}
model_xgb = xgb.train(params=params, dtrain=dtrain, num_boost_round=1000)
CPU times: user 3min 2s, sys: 250 ms, total: 3min 2s Wall time: 1min 32s
# Predict on both splits and tabulate MAE / MSE; the D2 row is left as NaN.
tr_preds = model_xgb.predict(dtrain)
tx_preds = model_xgb.predict(dtest)

tr_mae, tr_mse = (
    metric(y_train, tr_preds)
    for metric in (mean_absolute_error, mean_squared_error)
)
tx_mae, tx_mse = (
    metric(y_test, tx_preds)
    for metric in (mean_absolute_error, mean_squared_error)
)

df_eval_twd = pd.DataFrame(
    {
        'train': [np.nan, tr_mae, tr_mse],
        'test': [np.nan, tx_mae, tx_mse],
    },
    index=['D2', 'mean_absolute_error', 'mean_squared_error'],
)
df_eval_twd
train | test | |
---|---|---|
D2 | NaN | NaN |
mean_absolute_error | 1.760538e+03 | 1.588351e+03 |
mean_squared_error | 1.481952e+09 | 1.659363e+08 |
%%time
cv_results = xgb.cv(dtrain=dtrain, params=params, nfold=3,
num_boost_round=2000,
early_stopping_rounds=10,
metrics="rmse",
as_pandas=True,
seed=SEED)
CPU times: user 3min 13s, sys: 341 ms, total: 3min 13s Wall time: 1min 38s
# Shape of the cross-validation results table.
cv_results.shape
(448, 4)
# Show the last rows of the cross-validation results.
cv_results.tail()
train-rmse-mean | train-rmse-std | test-rmse-mean | test-rmse-std | |
---|---|---|---|---|
443 | 34847.445312 | 16248.746864 | 28785.776855 | 25490.221577 |
444 | 34847.441732 | 16248.751926 | 28785.770670 | 25490.226078 |
445 | 34847.438151 | 16248.754184 | 28785.763672 | 25490.228446 |
446 | 34847.434245 | 16248.756901 | 28785.758463 | 25490.232558 |
447 | 34847.427409 | 16248.758325 | 28785.754232 | 25490.235459 |