import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import sklearn
import scipy
import json
import matplotlib.pyplot as plt
sns.set()
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor, GammaRegressor, TweedieRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_tweedie_deviance
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%load_ext watermark
%watermark -iv
pandas   1.1.0
scipy    1.4.1
joblib   0.16.0
seaborn  0.11.0
autopep8 1.5.2
sklearn  0.23.1
numpy    1.18.4
json     2.0.9
df = pd.read_csv('../data/processed/clean_data.csv.zip', compression='zip')
print(df.shape)
df.head(2).append(df.tail(2))
(100000, 15)
|       | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount |
|-------|---------|----------|------|----------|--------|---------|------------|----------|--------|---------|--------|-------------|-------------|-----------|----------------|
| 0     | 0       | 0.10     | D    | 5        | 0      | 55      | 50         | B12      | Regular | 1217   | R82    | 0.0         | 0.0         | 0.0       | 0.0            |
| 1     | 0       | 0.77     | D    | 5        | 0      | 55      | 50         | B12      | Regular | 1217   | R82    | 0.0         | 0.0         | 0.0       | 0.0            |
| 99998 | 0       | 0.90     | C    | 7        | 9      | 44      | 50         | B1       | Regular | 191    | R24    | 0.0         | 0.0         | 0.0       | 0.0            |
| 99999 | 0       | 0.90     | E    | 4        | 12     | 53      | 50         | B1       | Regular | 4116   | R24    | 0.0         | 0.0         | 0.0       | 0.0            |
X = scipy.sparse.load_npz("../data/processed/X.npz")
df.head(2)
|   | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount |
|---|---------|----------|------|----------|--------|---------|------------|----------|--------|---------|--------|-------------|-------------|-----------|----------------|
| 0 | 0       | 0.10     | D    | 5        | 0      | 55      | 50         | B12      | Regular | 1217   | R82    | 0.0         | 0.0         | 0.0       | 0.0            |
| 1 | 0       | 0.77     | D    | 5        | 0      | 55      | 50         | B12      | Regular | 1217   | R82    | 0.0         | 0.0         | 0.0       | 0.0            |
np.array(X[0].todense())[0][-5:] # last elements of first row
array([ 0. , 1. , 0. , 0.69864446, 50. ])
with open("../data/processed/features.json") as fi:
json_features = json.load(fi)
json_features.keys()
dict_keys(['cols_ohe_before', 'cols_kbin', 'cols_log_scale', 'cols_pass', 'feature_names_before', 'feature_names_after', 'desc'])
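As a quick sanity check (assuming `feature_names_after` lists the columns of X in order, which the key names suggest but the file does not guarantee), the tail of the feature names should label the last five values of the dense row printed above:
print(json_features["feature_names_after"][-5:])  # assumes alignment with X's columns
print(len(json_features["feature_names_after"]), X.shape[1])  # both should be 71 if aligned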
from sklearn.model_selection import train_test_split
df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=SEED)
df_train.shape, df_test.shape, X_train.shape, X_test.shape
((75000, 15), (25000, 15), (75000, 71), (25000, 71))
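`train_test_split` applies the same shuffled index to every array it receives, so the DataFrame rows and the sparse-matrix rows stay aligned. A minimal check:
assert df_train.shape[0] == X_train.shape[0]  # rows split with the same permutation
assert df_test.shape[0] == X_test.shape[0]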
We now model AvgClaimAmount (claim severity), which approximately follows a Gamma distribution. We keep only rows with ClaimAmount > 0, since the Gamma distribution has support on (0, ∞), not [0, ∞). We use sample_weight = ClaimNb to account for policies that contain more than one claim.
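A minimal sketch on synthetic toy data (not the dataset above) of why `sample_weight=ClaimNb` is the right weighting: weighting each policy's average claim by its claim count is equivalent to fitting on one row per individual claim.
X_toy = np.array([[0.0], [1.0]])      # one feature, two policies
avg_claim = np.array([100.0, 300.0])  # average claim amount per policy
n_claims = np.array([1, 3])           # second policy has three claims

m_weighted = GammaRegressor().fit(X_toy, avg_claim, sample_weight=n_claims)

# same fit with the second policy's row repeated once per claim
m_repeated = GammaRegressor().fit(np.repeat(X_toy, n_claims, axis=0),
                                  np.repeat(avg_claim, n_claims))
print(m_weighted.coef_, m_repeated.coef_)  # coefficients agree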
from sklearn.linear_model import PoissonRegressor, GammaRegressor, TweedieRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_tweedie_deviance
df.head(2)
|   | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount |
|---|---------|----------|------|----------|--------|---------|------------|----------|--------|---------|--------|-------------|-------------|-----------|----------------|
| 0 | 0       | 0.10     | D    | 5        | 0      | 55      | 50         | B12      | Regular | 1217   | R82    | 0.0         | 0.0         | 0.0       | 0.0            |
| 1 | 0       | 0.77     | D    | 5        | 0      | 55      | 50         | B12      | Regular | 1217   | R82    | 0.0         | 0.0         | 0.0       | 0.0            |
mask_train = (df_train["ClaimAmount"] > 0).to_numpy().ravel()
mask_test = (df_test["ClaimAmount"] > 0).to_numpy().ravel()
glm_sev = GammaRegressor(alpha=10., max_iter=10_000)
glm_sev.fit(
X_train[mask_train],
df_train.loc[mask_train, "AvgClaimAmount"],
sample_weight=df_train.loc[mask_train, "ClaimNb"],
)
GammaRegressor(alpha=10.0, max_iter=10000)
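GammaRegressor uses a log link, so each coefficient acts multiplicatively on the predicted mean (a unit increase in a feature scales the prediction by exp(coef)). Assuming `feature_names_after` aligns with the columns of X, we can inspect the largest effects:
coefs = pd.Series(glm_sev.coef_, index=json_features["feature_names_after"])  # assumes alignment
print(coefs.abs().sort_values(ascending=False).head())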
joblib.dump(glm_sev, "../outputs/glm_sev.joblib")
['../outputs/glm_sev.joblib']
np.savez_compressed('../outputs/mask_train_test.npz',
mask_train=mask_train,mask_test=mask_test)
with open("../outputs/mask_train_test.json","w") as fo:
json.dump({"mask_train": "(df_train['ClaimAmount'] > 0).to_numpy().ravel()",
"mask_test": "(df_test['ClaimAmount'] > 0).to_numpy().ravel()"
},fo)
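A round-trip sketch to confirm the persisted artifacts can be restored in a later session:
glm_sev_loaded = joblib.load("../outputs/glm_sev.joblib")
masks = np.load("../outputs/mask_train_test.npz")
assert (masks["mask_train"] == mask_train).all()
assert (masks["mask_test"] == mask_test).all()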
# glm_sev.score?
target = 'AvgClaimAmount'
y_train = df_train[target].to_numpy().ravel()
y_test = df_test[target].to_numpy().ravel()
tr_D2 = glm_sev.score(X_train[mask_train],
df_train.loc[mask_train, 'AvgClaimAmount'],
sample_weight=df_train.loc[mask_train, 'ClaimNb'])
tx_D2 = glm_sev.score(X_test[mask_test],
df_test.loc[mask_test, 'AvgClaimAmount'],
sample_weight=df_test.loc[mask_test, 'ClaimNb'])
tr_preds = glm_sev.predict(X_train)
tx_preds = glm_sev.predict(X_test)
tr_mae = mean_absolute_error(y_train,tr_preds)
tx_mae = mean_absolute_error(y_test,tx_preds)
tr_mse = mean_squared_error(y_train, tr_preds)
tx_mse = mean_squared_error(y_test,tx_preds)
df_eval_sev = pd.DataFrame(
{'train': [tr_D2, tr_mae, tr_mse],
'test': [tx_D2, tx_mae, tx_mse]})
df_eval_sev.index = ['D2','mean_absolute_error','mean_squared_error']
df_eval_sev
|                     | train        | test          |
|---------------------|--------------|---------------|
| D2                  | 3.638157e-03 | -4.747382e-04 |
| mean_absolute_error | 1.859814e+03 | 1.856312e+03  |
| mean_squared_error  | 4.959565e+06 | 4.827662e+06  |
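`mean_tweedie_deviance` is imported above but not used yet; with `power=2` it is the Gamma deviance, matching the distributional assumption of `GammaRegressor`. A sketch on the masked test rows:
gamma_dev = mean_tweedie_deviance(
    df_test.loc[mask_test, "AvgClaimAmount"],
    glm_sev.predict(X_test[mask_test]),
    sample_weight=df_test.loc[mask_test, "ClaimNb"],
    power=2,  # power=2 selects the Gamma deviance
)
print(f"Test Gamma deviance: {gamma_dev:.4f}")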
# NOTE
note = """
The D-squared value on the test set is poor. The model was trained only on
rows with ClaimAmount > 0, so its predictions describe the average claim
amount per claim, while the MAE and MSE above are computed over all policies,
most of which have AvgClaimAmount = 0. This model cannot be used directly to
predict the average claim amount per policy.
"""
print("Mean AvgClaim Amount per policy: %.2f "
% df_train["AvgClaimAmount"].mean())
print("Mean AvgClaim Amount | NbClaim > 0: %.2f"
% df_train["AvgClaimAmount"][df_train["AvgClaimAmount"] > 0].mean())
print("Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f"
% glm_sev.predict(X_train).mean())
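On the per-claim view the model is still useful in aggregate. Assuming `ClaimAmount` is the policy-level total (so that `AvgClaimAmount * ClaimNb` recovers it), predicted and observed totals can be compared on the positive-claim subset:
obs_total = df_train.loc[mask_train, "ClaimAmount"].sum()
pred_total = (glm_sev.predict(X_train[mask_train])
              * df_train.loc[mask_train, "ClaimNb"]).sum()  # predicted per-claim mean times claim count
print(f"Observed total: {obs_total:,.0f}  Predicted total: {pred_total:,.0f}")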
feature = 'DrivAge'
df_ = df_train
preds = tr_preds
observed = 'AvgClaimAmount' # use 'Frequency' here for frequency modelling
weight = 'ClaimNb'
dfx = df_.loc[:, [feature, weight]].copy()
dfx["observed"] = df_[observed] * df_[weight]
dfx["predicted"] = preds * df_[weight]
dfx = (
dfx.groupby([feature])[[weight, "observed", "predicted"]]
.sum()
.assign(observed=lambda x: x["observed"] / x[weight])
.assign(predicted=lambda x: x["predicted"] / x[weight])
.dropna(how='any')
)
dfx.head()
fig,ax = plt.subplots(figsize=(12,6))
ax = dfx.loc[:, ["observed", "predicted"]].plot(style=".", ax=ax)
plt.ylabel('Average Claim Amount (severity)')
# fill feature distribution
y_max = dfx.loc[:, ["observed", "predicted"]].values.max()
print(f"y_max = {y_max:.4f}")
p2 = ax.fill_between(
dfx.index,
0,
    y_max * dfx[weight] / dfx[weight].values.max(),  # scaled feature distribution, filled from 0
color="g",
alpha=0.1,
)
plt.xticks(range(10,110,10))
plt.title(f"Train: predictions for {feature}");
_ = """
Here the predicted line is almost flat: driver age (DrivAge) has little
impact on the average claim amount (severity).
"""