In this project we use the OpenML French Motor Vehicle Insurance Claims dataset.
Data Source
The frequency dataset has 12 columns and 678,013 rows (one row per policy).
The severity dataset has 2 columns and 26,639 rows (one row per claim).
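The two raw tables can be fetched directly from OpenML. A minimal sketch, assuming the freMTPL2freq (data_id=41214) and freMTPL2sev (data_id=41215) datasets, whose shapes match the counts above:

from sklearn.datasets import fetch_openml
# one row per policy: exposure, claim count, and risk features
df_freq = fetch_openml(data_id=41214, as_frame=True).frame
# one row per claim: policy id and claim amount
df_sev = fetch_openml(data_id=41215, as_frame=True).frame
print(df_freq.shape, df_sev.shape)  # expected: (678013, 12) (26639, 2)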
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import sklearn
import scipy.sparse  # load_npz used below lives in the scipy.sparse submodule
import matplotlib.pyplot as plt
sns.set()
import json
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor, GammaRegressor, TweedieRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_tweedie_deviance
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%load_ext watermark
%watermark -iv
json 2.0.9 pandas 1.1.0 seaborn 0.11.0 joblib 0.16.0 autopep8 1.5.2 scipy 1.4.1 numpy 1.18.4 sklearn 0.23.1
df = pd.read_csv('../data/processed/clean_data.csv.zip', compression='zip')
print(df.shape)
df.head(2).append(df.tail(2))
(100000, 15)
| | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
| 99998 | 0 | 0.90 | C | 7 | 9 | 44 | 50 | B1 | Regular | 191 | R24 | 0.0 | 0.0 | 0.0 | 0.0 |
| 99999 | 0 | 0.90 | E | 4 | 12 | 53 | 50 | B1 | Regular | 4116 | R24 | 0.0 | 0.0 | 0.0 | 0.0 |
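The last three columns are derived rather than raw. A sketch of the usual actuarial definitions (assumed here; the exact preprocessing script may differ):

# Frequency: claims per unit of exposure (policy-years)
df["Frequency"] = df["ClaimNb"] / df["Exposure"]
# AvgClaimAmount: severity per claim; np.fmax avoids division by zero
df["AvgClaimAmount"] = df["ClaimAmount"] / np.fmax(df["ClaimNb"], 1)
# PurePremium: total claim cost per unit of exposure
df["PurePremium"] = df["ClaimAmount"] / df["Exposure"]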
X = scipy.sparse.load_npz("../data/processed/X.npz")
df.head(2)
| | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
np.array(X[0].todense())[0][-5:] # last elements of first row
array([ 0. , 1. , 0. , 0.69864446, 50. ])
with open("../data/processed/features.json") as fi:
    json_features = json.load(fi)
json_features.keys()
dict_keys(['cols_ohe_before', 'cols_kbin', 'cols_log_scale', 'cols_pass', 'feature_names_before', 'feature_names_after', 'desc'])
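The metadata records how the 71 engineered columns of X were built (one-hot encoding, binning, log scaling, and passthrough columns). Assuming feature_names_after holds one name per column of X, it can label the sparse row inspected above:

names = json_features["feature_names_after"]
print(len(names))  # expected: 71, matching X.shape[1]
# pair the last five engineered values of the first row with their names
list(zip(names[-5:], np.array(X[0].todense())[0][-5:]))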
from sklearn.model_selection import train_test_split
df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=SEED)
df_train.shape, df_test.shape, X_train.shape, X_test.shape
((75000, 15), (25000, 15), (75000, 71), (25000, 71))
Since ClaimNb is a count, we can model the claim frequency with a Poisson regressor, using y = ClaimNb / Exposure as the target and sample_weight = Exposure. The key parameters of sklearn.linear_model.PoissonRegressor:
Parameters
----------
alpha : float, default=1
Constant that multiplies the penalty term and thus determines the
regularization strength. ``alpha = 0`` is equivalent to unpenalized
GLMs. In this case, the design matrix `X` must have full column rank
(no collinearities).
fit_intercept : bool, default=True
Specifies if a constant (a.k.a. bias or intercept) should be
added to the linear predictor (X @ coef + intercept).
max_iter : int, default=100
The maximal number of iterations for the solver.
tol : float, default=1e-4
Stopping criterion. For the lbfgs solver,
the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
where ``g_j`` is the j-th component of the gradient (derivative) of
the objective function.
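For intuition, the deviance this GLM minimizes (plus the penalty) is 2*(y*log(y/mu) - y + mu) averaged over samples, which is what mean_tweedie_deviance computes at power=1. A quick hand check on toy values (my own illustration, not from the original notebook):

from scipy.special import xlogy  # xlogy(0, 0) == 0, so zero counts are handled cleanly
y_toy = np.array([0.0, 1.0, 2.0])
mu_toy = np.array([0.5, 1.0, 1.5])
dev_manual = np.mean(2 * (xlogy(y_toy, y_toy / mu_toy) - y_toy + mu_toy))
dev_sklearn = mean_tweedie_deviance(y_toy, mu_toy, power=1)
assert np.isclose(dev_manual, dev_sklearn)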
After fitting the Poisson regressor, we can evaluate it with the model's score method:
Signature: glm_freq.score(X, y, sample_weight=None)
Docstring:
Compute D^2, the percentage of deviance explained.
D^2 is a generalization of the coefficient of determination R^2.
R^2 uses squared error and D^2 deviance. Note that those two are equal
for ``family='normal'``.
Returns
-------
score : float
D^2 of self.predict(X) w.r.t. y.
from sklearn.linear_model import PoissonRegressor, GammaRegressor, TweedieRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_tweedie_deviance
glm_freq = PoissonRegressor(alpha=1e-3, max_iter=400)
glm_freq.fit(X_train, df_train["Frequency"],
sample_weight=df_train["Exposure"])
PoissonRegressor(alpha=0.001, max_iter=400)
joblib.dump(glm_freq, "../outputs/glm_freq.joblib")
['../outputs/glm_freq.joblib']
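Persisting the fitted model lets later notebooks score without refitting. A quick round-trip check (my addition):

glm_freq_loaded = joblib.load("../outputs/glm_freq.joblib")
# the reloaded model should reproduce the in-memory predictions exactly
assert np.allclose(glm_freq_loaded.predict(X_test), glm_freq.predict(X_test))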
target = 'Frequency'
y_train = df_train[target].to_numpy().ravel()
y_test = df_test[target].to_numpy().ravel()
tr_D2 = glm_freq.score(X_train, df_train['Frequency'],
sample_weight=df_train['Exposure'])
tx_D2 = glm_freq.score(X_test, df_test['Frequency'],
sample_weight=df_test['Exposure'])
tr_preds = glm_freq.predict(X_train)
tx_preds = glm_freq.predict(X_test)
tr_mae = mean_absolute_error(y_train, tr_preds)
tx_mae = mean_absolute_error(y_test, tx_preds)
tr_mse = mean_squared_error(y_train, tr_preds)
tx_mse = mean_squared_error(y_test, tx_preds)
df_eval_freq = pd.DataFrame(
{'train': [tr_D2, tr_mae, tr_mse],
'test': [tx_D2, tx_mae, tx_mse]})
df_eval_freq.index = ['D2','mean_absolute_error','mean_squared_error']
df_eval_freq
| | train | test |
|---|---|---|
| D2 | 0.051384 | 0.048138 |
| mean_absolute_error | 0.232085 | 0.224547 |
| mean_squared_error | 4.738399 | 2.407906 |
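The D2 values above can be reproduced from the Poisson deviance directly: D2 = 1 - dev(model) / dev(intercept-only model), both exposure-weighted. A sketch, assuming score follows the definition in its docstring:

w_tr = df_train["Exposure"].to_numpy()
dev_model = mean_tweedie_deviance(y_train, tr_preds, sample_weight=w_tr, power=1)
# null model: predict the exposure-weighted mean frequency everywhere
y_bar = np.average(y_train, weights=w_tr)
dev_null = mean_tweedie_deviance(y_train, np.full_like(y_train, y_bar),
                                 sample_weight=w_tr, power=1)
print(1 - dev_model / dev_null)  # should be close to the train D2 of 0.051384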
feature = 'DrivAge'
df_ = df_train
preds = tr_preds
observed = 'Frequency'
weight = 'Exposure'
dfx = df_.loc[:, [feature, weight]].copy()
dfx["observed"] = df_[observed] * df_[weight]
dfx["predicted"] = preds * df_[weight]
dfx = (
dfx.groupby([feature])[[weight, "observed", "predicted"]]
.sum()
.assign(observed=lambda x: x["observed"] / x[weight])
.assign(predicted=lambda x: x["predicted"] / x[weight])
)
dfx.head()
| DrivAge | Exposure | observed | predicted |
|---|---|---|---|
| 18 | 33.190929 | 0.361545 | 0.149977 |
| 19 | 120.625464 | 0.397926 | 0.170593 |
| 20 | 213.809210 | 0.229176 | 0.174087 |
| 21 | 288.625519 | 0.162841 | 0.160247 |
| 22 | 322.515673 | 0.192239 | 0.162154 |
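As a consistency check (my addition), re-weighting the per-age averages by their exposure should recover the portfolio-level frequencies:

portfolio_obs = (dfx["observed"] * dfx["Exposure"]).sum() / dfx["Exposure"].sum()
portfolio_pred = (dfx["predicted"] * dfx["Exposure"]).sum() / dfx["Exposure"].sum()
print(f"observed: {portfolio_obs:.4f}  predicted: {portfolio_pred:.4f}")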
fig,ax = plt.subplots(figsize=(8,6))
ax = dfx.loc[:, ["observed", "predicted"]].plot(style=".", ax=ax)
plt.ylabel('Claim Frequency')
# fill feature distribution
y_max = dfx.loc[:, ["observed", "predicted"]].values.max()
#print(f"y_max = {y_max:.4f}")
p2 = ax.fill_between(
dfx.index,
0,
y_max * dfx[weight] / dfx[weight].values.max(), # fill between 0 to this.
color="g",
alpha=0.1,
)
plt.title(f"Train: predictions for {feature}");
sns.displot(dfx[weight], kind='kde', fill=True)
# NOTE: this differs from the fill_between above:
# there, we shade from 0 up to the scaled exposure weight per age;
# here, we fit a KDE to the exposure distribution itself.
<seaborn.axisgrid.FacetGrid at 0x7fb1a4fa1f10>
import plotly_express as px
fig = px.scatter(dfx, y=['observed','predicted'],
hover_data = {'Exposures': (':.2f', dfx['Exposure']),
'Difference': (':.3f', dfx['predicted']-dfx['observed']),
},
title=f'Training predictions for {feature}'
)
fig['layout']['title']['x'] = 0.5
fig['layout']['yaxis']['title'] = 'Claim Frequency'
fig['layout']['xaxis']['dtick'] = 10
fig
# px.scatter?