import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import sklearn
import scipy
import matplotlib.pyplot as plt
sns.set()
import json
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor, GammaRegressor, TweedieRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_tweedie_deviance
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%load_ext watermark
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark json 2.0.9 autopep8 1.5.2 numpy 1.18.4 sklearn 0.23.1 seaborn 0.11.0 joblib 0.16.0 scipy 1.4.1 pandas 1.1.0
df = pd.read_csv('../data/processed/clean_data.csv.zip', compression='zip')
print(df.shape)
df.head(2).append(df.tail(2))
(100000, 15)
ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
99998 | 0 | 0.90 | C | 7 | 9 | 44 | 50 | B1 | Regular | 191 | R24 | 0.0 | 0.0 | 0.0 | 0.0 |
99999 | 0 | 0.90 | E | 4 | 12 | 53 | 50 | B1 | Regular | 4116 | R24 | 0.0 | 0.0 | 0.0 | 0.0 |
X = scipy.sparse.load_npz("../data/processed/X.npz")
df.head(2)
ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
np.array(X[0].todense())[0][-5:] # last elements of first row
array([ 0. , 1. , 0. , 0.69864446, 50. ])
with open("../data/processed/features.json") as fi:
json_features = json.load(fi)
json_features.keys()
dict_keys(['cols_ohe_before', 'cols_kbin', 'cols_log_scale', 'cols_pass', 'feature_names_before', 'feature_names_after', 'desc'])
from sklearn.model_selection import train_test_split
df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=SEED)
target = ['Frequency']
y_train = df_train[target].to_numpy().ravel()
y_test = df_test[target].to_numpy().ravel()
df_train.shape, df_test.shape, X_train.shape, X_test.shape
((75000, 15), (25000, 15), (75000, 71), (25000, 71))
from sklearn.linear_model import PoissonRegressor, GammaRegressor, TweedieRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_tweedie_deviance
# TweedieRegressor?
glm_twd = TweedieRegressor(power=1.9, alpha=.1, max_iter=10_000)
glm_twd.fit(X_train, df_train["PurePremium"],
sample_weight=df_train["Exposure"])
TweedieRegressor(alpha=0.1, max_iter=10000, power=1.9)
tr_D2 = glm_twd.score(X_train,
df_train['PurePremium'],
sample_weight=df_train['Exposure'])
tx_D2 = glm_twd.score(X_test,
df_test['PurePremium'],
sample_weight=df_test['Exposure'])
tr_preds = glm_twd.predict(X_train)
tx_preds = glm_twd.predict(X_test)
tr_mae = mean_absolute_error(y_train,tr_preds)
tx_mae = mean_absolute_error(y_test,tx_preds)
tr_mse = mean_squared_error(y_train, tr_preds)
tx_mse = mean_squared_error(y_test,tx_preds)
df_eval_twd = pd.DataFrame(
{'train': [tr_D2, tr_mae, tr_mse],
'test': [tx_D2, tx_mae, tx_mse]})
df_eval_twd.index = ['D2','mean_absolute_error','mean_squared_error']
df_eval_twd
train | test | |
---|---|---|
D2 | 0.020186 | 0.013533 |
mean_absolute_error | 182.982035 | 179.520763 |
mean_squared_error | 142995.523138 | 69103.013696 |
# freq model: possion
glm_freq = PoissonRegressor(alpha=1e-3, max_iter=400)
glm_freq.fit(X_train, df_train["Frequency"],
sample_weight=df_train["Exposure"])
tr_preds_freq = glm_freq.predict(X_train)
tx_preds_freq = glm_freq.predict(X_test)
# severity model: gamma
mask_train = (df_train["ClaimAmount"] > 0).to_numpy().ravel()
mask_test = (df_test["ClaimAmount"] > 0).to_numpy().ravel()
glm_sev = GammaRegressor(alpha=10., max_iter=10_000)
glm_sev.fit(
X_train[mask_train],
df_train.loc[mask_train, "AvgClaimAmount"],
sample_weight=df_train.loc[mask_train, "ClaimNb"],
)
tr_preds_sev = glm_sev.predict(X_train)
tx_preds_sev = glm_sev.predict(X_test)
# product of prediction of freq and severity
tr_preds = tr_preds_freq * tr_preds_sev
tx_preds = tx_preds_freq * tx_preds_sev
tr_mae = mean_absolute_error(y_train,tr_preds)
tx_mae = mean_absolute_error(y_test,tx_preds)
tr_mse = mean_squared_error(y_train, tr_preds)
tx_mse = mean_squared_error(y_test,tx_preds)
df_eval_product = pd.DataFrame(
{'train': [np.nan, tr_mae, tr_mse],
'test': [np.nan, tx_mae, tx_mse]})
df_eval_product.index = ['D2','mean_absolute_error','mean_squared_error']
df_eval_product
train | test | |
---|---|---|
D2 | NaN | NaN |
mean_absolute_error | 179.254282 | 177.081152 |
mean_squared_error | 66743.978500 | 48660.865824 |
df_eval_twd
train | test | |
---|---|---|
D2 | 0.020186 | 0.013533 |
mean_absolute_error | 182.982035 | 179.520763 |
mean_squared_error | 142995.523138 | 69103.013696 |