In this project we use the OpenML French Motor Vehicle Insurance Claims dataset.
Data Source
The frequency dataset has 12 columns and 678,013 rows (one row per policy).
The severity dataset has 2 columns and 26,639 rows (one row per claim).
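The two raw tables can be fetched directly from OpenML. A minimal sketch, assuming the freMTPL2freq (data_id=41214) and freMTPL2sev (data_id=41215) datasets, whose shapes match the counts above:

from sklearn.datasets import fetch_openml
# one row per policy: exposure, claim count, and risk features
df_freq = fetch_openml(data_id=41214, as_frame=True).frame
# one row per claim: policy id and claim amount
df_sev = fetch_openml(data_id=41215, as_frame=True).frame
print(df_freq.shape, df_sev.shape)  # expected: (678013, 12) (26639, 2)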
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import sklearn
import scipy.sparse  # load_npz used below lives in the scipy.sparse submodule
import matplotlib.pyplot as plt
sns.set()
import json
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor, GammaRegressor, TweedieRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_tweedie_deviance
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%load_ext watermark
%watermark -iv
json 2.0.9 pandas 1.1.0 seaborn 0.11.0 joblib 0.16.0 autopep8 1.5.2 scipy 1.4.1 numpy 1.18.4 sklearn 0.23.1
df = pd.read_csv('../data/processed/clean_data.csv.zip', compression='zip')
print(df.shape)
df.head(2).append(df.tail(2))
(100000, 15)
| | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
| 99998 | 0 | 0.90 | C | 7 | 9 | 44 | 50 | B1 | Regular | 191 | R24 | 0.0 | 0.0 | 0.0 | 0.0 |
| 99999 | 0 | 0.90 | E | 4 | 12 | 53 | 50 | B1 | Regular | 4116 | R24 | 0.0 | 0.0 | 0.0 | 0.0 |
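The last three columns are derived rather than raw. A sketch of the usual actuarial definitions (assumed here; the exact preprocessing script may differ):

# Frequency: claims per unit of exposure (policy-years)
df["Frequency"] = df["ClaimNb"] / df["Exposure"]
# AvgClaimAmount: severity per claim; np.fmax avoids division by zero
df["AvgClaimAmount"] = df["ClaimAmount"] / np.fmax(df["ClaimNb"], 1)
# PurePremium: total claim cost per unit of exposure
df["PurePremium"] = df["ClaimAmount"] / df["Exposure"]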
X = scipy.sparse.load_npz("../data/processed/X.npz")
df.head(2)
| | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
np.array(X[0].todense())[0][-5:] # last elements of first row
array([ 0. , 1. , 0. , 0.69864446, 50. ])
with open("../data/processed/features.json") as fi:
    json_features = json.load(fi)
json_features.keys()
dict_keys(['cols_ohe_before', 'cols_kbin', 'cols_log_scale', 'cols_pass', 'feature_names_before', 'feature_names_after', 'desc'])
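The metadata records how the 71 engineered columns of X were built (one-hot encoding, binning, log scaling, and passthrough columns). Assuming feature_names_after holds one name per column of X, it can label the sparse row inspected above:

names = json_features["feature_names_after"]
print(len(names))  # expected: 71, matching X.shape[1]
# pair the last five engineered values of the first row with their names
list(zip(names[-5:], np.array(X[0].todense())[0][-5:]))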
from sklearn.model_selection import train_test_split
df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=SEED)
df_train.shape, df_test.shape, X_train.shape, X_test.shape
((75000, 15), (25000, 15), (75000, 71), (25000, 71))
Since ClaimNb is a count, we can model the claim frequency with a Poisson regressor, using y = ClaimNb / Exposure as the target and sample_weight = Exposure. The key parameters of sklearn.linear_model.PoissonRegressor:
Parameters
----------
alpha : float, default=1
Constant that multiplies the penalty term and thus determines the
regularization strength. ``alpha = 0`` is equivalent to unpenalized
GLMs. In this case, the design matrix `X` must have full column rank
(no collinearities).
fit_intercept : bool, default=True
Specifies if a constant (a.k.a. bias or intercept) should be
added to the linear predictor (X @ coef + intercept).
max_iter : int, default=100
The maximal number of iterations for the solver.
tol : float, default=1e-4
Stopping criterion. For the lbfgs solver,
the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
where ``g_j`` is the j-th component of the gradient (derivative) of
the objective function.
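For intuition, the deviance this GLM minimizes (plus the penalty) is 2*(y*log(y/mu) - y + mu) averaged over samples, which is what mean_tweedie_deviance computes at power=1. A quick hand check on toy values (my own illustration, not from the original notebook):

from scipy.special import xlogy  # xlogy(0, 0) == 0, so zero counts are handled cleanly
y_toy = np.array([0.0, 1.0, 2.0])
mu_toy = np.array([0.5, 1.0, 1.5])
dev_manual = np.mean(2 * (xlogy(y_toy, y_toy / mu_toy) - y_toy + mu_toy))
dev_sklearn = mean_tweedie_deviance(y_toy, mu_toy, power=1)
assert np.isclose(dev_manual, dev_sklearn)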
After fitting the Poisson regressor, we can evaluate it with the model's score method:
Signature: glm_freq.score(X, y, sample_weight=None)
Docstring:
Compute D^2, the percentage of deviance explained.
D^2 is a generalization of the coefficient of determination R^2.
R^2 uses squared error and D^2 deviance. Note that those two are equal
for ``family='normal'``.
Returns
-------
score : float
D^2 of self.predict(X) w.r.t. y.
from sklearn.linear_model import PoissonRegressor, GammaRegressor, TweedieRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_tweedie_deviance
glm_freq = PoissonRegressor(alpha=1e-3, max_iter=400)
glm_freq.fit(X_train, df_train["Frequency"],
sample_weight=df_train["Exposure"])
PoissonRegressor(alpha=0.001, max_iter=400)
joblib.dump(glm_freq, "../outputs/glm_freq.joblib")
['../outputs/glm_freq.joblib']
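Persisting the fitted model lets later notebooks score without refitting. A quick round-trip check (my addition):

glm_freq_loaded = joblib.load("../outputs/glm_freq.joblib")
# the reloaded model should reproduce the in-memory predictions exactly
assert np.allclose(glm_freq_loaded.predict(X_test), glm_freq.predict(X_test))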
target = 'Frequency'
y_train = df_train[target].to_numpy().ravel()
y_test = df_test[target].to_numpy().ravel()
tr_D2 = glm_freq.score(X_train, df_train['Frequency'],
sample_weight=df_train['Exposure'])
tx_D2 = glm_freq.score(X_test, df_test['Frequency'],
sample_weight=df_test['Exposure'])
tr_preds = glm_freq.predict(X_train)
tx_preds = glm_freq.predict(X_test)
tr_mae = mean_absolute_error(y_train, tr_preds)
tx_mae = mean_absolute_error(y_test, tx_preds)
tr_mse = mean_squared_error(y_train, tr_preds)
tx_mse = mean_squared_error(y_test, tx_preds)
df_eval_freq = pd.DataFrame(
{'train': [tr_D2, tr_mae, tr_mse],
'test': [tx_D2, tx_mae, tx_mse]})
df_eval_freq.index = ['D2','mean_absolute_error','mean_squared_error']
df_eval_freq
| | train | test |
|---|---|---|
| D2 | 0.051384 | 0.048138 |
| mean_absolute_error | 0.232085 | 0.224547 |
| mean_squared_error | 4.738399 | 2.407906 |
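The D2 values above can be reproduced from the Poisson deviance directly: D2 = 1 - dev(model) / dev(intercept-only model), both exposure-weighted. A sketch, assuming score follows the definition in its docstring:

w_tr = df_train["Exposure"].to_numpy()
dev_model = mean_tweedie_deviance(y_train, tr_preds, sample_weight=w_tr, power=1)
# null model: predict the exposure-weighted mean frequency everywhere
y_bar = np.average(y_train, weights=w_tr)
dev_null = mean_tweedie_deviance(y_train, np.full_like(y_train, y_bar),
                                 sample_weight=w_tr, power=1)
print(1 - dev_model / dev_null)  # should be close to the train D2 of 0.051384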
feature = 'DrivAge'
df_ = df_train
preds = tr_preds
observed = 'Frequency'
weight = 'Exposure'
dfx = df_.loc[:, [feature, weight]].copy()
dfx["observed"] = df_[observed] * df_[weight]
dfx["predicted"] = preds * df_[weight]
dfx = (
dfx.groupby([feature])[[weight, "observed", "predicted"]]
.sum()
.assign(observed=lambda x: x["observed"] / x[weight])
.assign(predicted=lambda x: x["predicted"] / x[weight])
)
dfx.head()
| DrivAge | Exposure | observed | predicted |
|---|---|---|---|
| 18 | 33.190929 | 0.361545 | 0.149977 |
| 19 | 120.625464 | 0.397926 | 0.170593 |
| 20 | 213.809210 | 0.229176 | 0.174087 |
| 21 | 288.625519 | 0.162841 | 0.160247 |
| 22 | 322.515673 | 0.192239 | 0.162154 |
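As a consistency check (my addition), re-weighting the per-age averages by their exposure should recover the portfolio-level frequencies:

portfolio_obs = (dfx["observed"] * dfx["Exposure"]).sum() / dfx["Exposure"].sum()
portfolio_pred = (dfx["predicted"] * dfx["Exposure"]).sum() / dfx["Exposure"].sum()
print(f"observed: {portfolio_obs:.4f}  predicted: {portfolio_pred:.4f}")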
fig,ax = plt.subplots(figsize=(8,6))
ax = dfx.loc[:, ["observed", "predicted"]].plot(style=".", ax=ax)
plt.ylabel('Claim Frequency')
# fill feature distribution
y_max = dfx.loc[:, ["observed", "predicted"]].values.max()
#print(f"y_max = {y_max:.4f}")
p2 = ax.fill_between(
dfx.index,
0,
y_max * dfx[weight] / dfx[weight].values.max(), # fill between 0 to this.
color="g",
alpha=0.1,
)
plt.title(f"Train: predictions for {feature}");
sns.displot(dfx[weight], kind='kde', fill=True)
# NOTE: this differs from the fill_between above:
# there, we shade from 0 up to the scaled exposure weight per age;
# here, we fit a KDE to the exposure distribution itself.
<seaborn.axisgrid.FacetGrid at 0x7fb1a4fa1f10>
import plotly_express as px
fig = px.scatter(dfx, y=['observed','predicted'],
hover_data = {'Exposures': (':.2f', dfx['Exposure']),
'Difference': (':.3f', dfx['predicted']-dfx['observed']),
},
title=f'Training predictions for {feature}'
)
fig['layout']['title']['x'] = 0.5
fig['layout']['yaxis']['title'] = 'Claim Frequency'
fig['layout']['xaxis']['dtick'] = 10
fig
# px.scatter?