import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import sklearn
import scipy
import matplotlib.pyplot as plt
sns.set()
import json
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor, GammaRegressor, TweedieRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_tweedie_deviance
from sklearn.metrics import auc
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%load_ext watermark
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark
pandas   1.1.0
autopep8 1.5.2
json     2.0.9
seaborn  0.11.0
numpy    1.18.4
joblib   0.16.0
sklearn  0.23.1
scipy    1.4.1
df = pd.read_csv('../data/processed/clean_data.csv.zip', compression='zip')
print(df.shape)
df.head(2).append(df.tail(2))
(100000, 15)
| | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
| 99998 | 0 | 0.90 | C | 7 | 9 | 44 | 50 | B1 | Regular | 191 | R24 | 0.0 | 0.0 | 0.0 | 0.0 |
| 99999 | 0 | 0.90 | E | 4 | 12 | 53 | 50 | B1 | Regular | 4116 | R24 | 0.0 | 0.0 | 0.0 | 0.0 |
X = scipy.sparse.load_npz("../data/processed/X.npz")
df.head(2)
| | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
np.array(X[0].todense())[0][-5:] # last elements of first row
array([ 0. , 1. , 0. , 0.69864446, 50. ])
with open("../data/processed/features.json") as fi:
json_features = json.load(fi)
json_features.keys()
dict_keys(['cols_ohe_before', 'cols_kbin', 'cols_log_scale', 'cols_pass', 'feature_names_before', 'feature_names_after', 'desc'])
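Assuming `feature_names_after` lists the columns of the transformed design matrix in order (an interpretation of the file, not confirmed by it), a quick sketch to label the raw values inspected above:
feature_names = json_features["feature_names_after"]
assert len(feature_names) == X.shape[1]  # one name per column of the design matrix
# pair the last five column names with the last five values of the first row shown earlier
list(zip(feature_names[-5:], np.array(X[0].todense())[0][-5:]))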
from sklearn.model_selection import train_test_split
df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=SEED)
target = ['Frequency']
y_train = df_train[target].to_numpy().ravel()
y_test = df_test[target].to_numpy().ravel()
df_train.shape, df_test.shape, X_train.shape, X_test.shape
((75000, 15), (25000, 15), (75000, 71), (25000, 71))
from sklearn.linear_model import PoissonRegressor, GammaRegressor, TweedieRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_tweedie_deviance
# TweedieRegressor?
glm_twd = TweedieRegressor(power=1.9, alpha=.1, max_iter=10_000)
glm_twd.fit(X_train, df_train["PurePremium"],
sample_weight=df_train["Exposure"])
TweedieRegressor(alpha=0.1, max_iter=10000, power=1.9)
tr_preds = glm_twd.predict(X_train)
tx_preds = glm_twd.predict(X_test)
| | train | test |
|---|---|---|
| D2 | 0.020186 | 0.013533 |
| mean_absolute_error | 182.982035 | 179.520763 |
| mean_squared_error | 142995.523138 | 69103.013696 |
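The scoring cell that produced the table above is not shown; the following is a minimal sketch of how such a comparison could be built (using `Exposure` as sample weight and these particular metrics is an assumption):
def score_estimator(estimator, X_tr, X_tx, df_tr, df_tx,
                    target="PurePremium", weights="Exposure"):
    """Summarize train/test performance of a fitted GLM in a small DataFrame."""
    scores = {}
    for name, X_, df_ in [("train", X_tr, df_tr), ("test", X_tx, df_tx)]:
        y, w = df_[target], df_[weights]
        y_pred = estimator.predict(X_)
        scores[name] = {
            "D2": estimator.score(X_, y, sample_weight=w),  # fraction of deviance explained
            "mean_absolute_error": mean_absolute_error(y, y_pred, sample_weight=w),
            "mean_squared_error": mean_squared_error(y, y_pred, sample_weight=w),
        }
    return pd.DataFrame(scores)

score_estimator(glm_twd, X_train, X_test, df_train, df_test)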
# frequency model: Poisson
glm_freq = PoissonRegressor(alpha=1e-3, max_iter=400)
glm_freq.fit(X_train, df_train["Frequency"],
sample_weight=df_train["Exposure"])
tr_preds_freq = glm_freq.predict(X_train)
tx_preds_freq = glm_freq.predict(X_test)
# severity model: gamma
mask_train = (df_train["ClaimAmount"] > 0).to_numpy().ravel()
mask_test = (df_test["ClaimAmount"] > 0).to_numpy().ravel()
glm_sev = GammaRegressor(alpha=10., max_iter=10_000)
glm_sev.fit(
X_train[mask_train],
df_train.loc[mask_train, "AvgClaimAmount"],
sample_weight=df_train.loc[mask_train, "ClaimNb"],
)
tr_preds_sev = glm_sev.predict(X_train)
tx_preds_sev = glm_sev.predict(X_test)
# pure premium = frequency prediction * severity prediction
tr_preds = tr_preds_freq * tr_preds_sev
tx_preds = tx_preds_freq * tx_preds_sev
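A sketch of how the two approaches could be compared on the test set via the mean Tweedie deviance of the predicted pure premium (reusing `power=1.9` from the Tweedie fit above is an assumption for this comparison):
for name, preds in [("Frequency * Severity", tx_preds_freq * tx_preds_sev),
                    ("Compound Poisson Gamma", glm_twd.predict(X_test))]:
    dev = mean_tweedie_deviance(df_test["PurePremium"], preds,
                                sample_weight=df_test["Exposure"], power=1.9)
    print(f"{name}: mean Tweedie deviance = {dev:,.2f}")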
To compare the models, we can plot the Lorenz curve.
For each model, the policyholders are ranked from safest to riskiest and the fraction of observed total cumulated claims is plotted on the y-axis.
The Gini coefficient (based on the area under the curve) can be used as a model selection metric.
The Gini coefficient does not reflect the ability of the models to make accurate predictions of the absolute value of total claim amounts, only of relative amounts, so it acts purely as a ranking metric.
Both models are able to rank policyholders by riskiness significantly better than chance, although they are also both far from perfect, given the natural difficulty of predicting from so few features.
Note that the Gini index only characterizes the ranking performance of the model, not its calibration: any monotonic transformation of the predictions leaves the Gini index unchanged.
Finally, one should highlight that the Compound Poisson Gamma model fit directly on the pure premium is operationally simpler to develop and maintain, as it consists of a single scikit-learn estimator instead of a pair of models, each with its own set of hyperparameters.
from sklearn.metrics import mean_absolute_error, mean_squared_error, auc
def lorenz_curve(y_true, y_pred, exposure):
y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
exposure = np.asarray(exposure)
# order samples by increasing predicted risk:
ranking = np.argsort(y_pred)
ranked_exposure = exposure[ranking]
ranked_pure_premium = y_true[ranking]
cumulated_claim_amount = np.cumsum(ranked_pure_premium * ranked_exposure)
cumulated_claim_amount /= cumulated_claim_amount[-1]
cumulated_samples = np.linspace(0, 1, len(cumulated_claim_amount))
return cumulated_samples, cumulated_claim_amount
y_pred_product = glm_freq.predict(X_test) * glm_sev.predict(X_test)
y_pred_total = glm_twd.predict(X_test)
fig, ax = plt.subplots(figsize=(8, 8))
for label, y_pred in [("Frequency * Severity model", y_pred_product),
("Compound Poisson Gamma", y_pred_total)]:
ordered_samples, cum_claims = lorenz_curve(
df_test["PurePremium"], y_pred, df_test["Exposure"])
gini = 1 - 2 * auc(ordered_samples, cum_claims)
label += " (Gini index: {:.3f})".format(gini)
ax.plot(ordered_samples, cum_claims, linestyle="-", label=label)
# Oracle model: y_pred == y_test
ordered_samples, cum_claims = lorenz_curve(
df_test["PurePremium"], df_test["PurePremium"], df_test["Exposure"])
gini = 1 - 2 * auc(ordered_samples, cum_claims)
label = "Oracle (Gini index: {:.3f})".format(gini)
ax.plot(ordered_samples, cum_claims, linestyle="-.", color="gray",
label=label)
# Random baseline
ax.plot([0, 1], [0, 1], linestyle="--", color="black",
label="Random baseline")
ax.set(
title="Lorenz Curves",
xlabel=('Fraction of policyholders\n'
'(ordered by model from safest to riskiest)'),
ylabel='Fraction of total claim amount'
)
ax.legend(loc="upper left")
plt.show()
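joblib is imported at the top but not used in the cells shown; a sketch of how the fitted estimators could be persisted (the file paths are assumptions):
# persist the three fitted GLMs for later reuse (hypothetical output paths)
joblib.dump(glm_twd, "../models/glm_tweedie_pure_premium.joblib")
joblib.dump(glm_freq, "../models/glm_poisson_frequency.joblib")
joblib.dump(glm_sev, "../models/glm_gamma_severity.joblib")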