In this project we use the OpenML French Motor Third-Party Liability (freMTPL2) insurance claims data.
Data Source
The frequency dataset (freMTPL2freq) has 12 columns and 678,013 rows: one row per policy, with its risk features and the number of claims filed.
The severity dataset (freMTPL2sev) has 2 columns and 26,639 rows: one row per individual claim, with the claim amount and the policy ID.
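The notebook reads these tables from local csv files in `../data/raw/`. If they are missing, the same tables can be pulled from OpenML; a minimal sketch, assuming dataset ids 41214 (freMTPL2freq) and 41215 (freMTPL2sev), which should be verified on openml.org:

# sketch: fetch the raw tables from OpenML instead of reading local csv files
# (data ids 41214/41215 are assumptions; check them on openml.org)
from sklearn.datasets import fetch_openml

df_freq_oml = fetch_openml(data_id=41214, as_frame=True).frame  # policies with claim counts
df_sev_oml = fetch_openml(data_id=41215, as_frame=True).frame   # individual claim amounts
print(df_freq_oml.shape, df_sev_oml.shape)  # expect (678013, 12) and (26639, 2)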
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import sklearn
import matplotlib.pyplot as plt
sns.set()
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%load_ext watermark
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark
pandas           1.1.0
numpy            1.18.4
pandas_profiling 2.9.0
scipy            1.4.1
autopep8         1.5.2
json             2.0.9
sklearn          0.23.1
seaborn          0.11.0
# load only the first 100k rows (of 678,013) to keep this notebook light
df_freq = pd.read_csv('../data/raw/freMTPL2freq.csv', nrows=100_000)
print(df_freq.shape)
df_freq.head(2).append(df_freq.tail(2))
(100000, 12)
|  | IDpol | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 1 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 |
| 1 | 3.0 | 1 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 |
| 99998 | 1019554.0 | 0 | 1.00 | C | 7 | 9 | 44 | 50 | B1 | Regular | 191 | R24 |
| 99999 | 1019556.0 | 0 | 1.00 | E | 4 | 12 | 53 | 50 | B1 | Regular | 4116 | R24 |
# IDpol is read as float; cast it to integer and use it as the index
df_freq['IDpol'] = df_freq['IDpol'].astype(int)  # np.int is deprecated in newer numpy
df_freq.set_index('IDpol', inplace=True)
df_freq.head(2)
| IDpol | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 |
| 3 | 1 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 |
# now read the severity data (one row per individual claim)
df_sev = pd.read_csv('../data/raw/freMTPL2sev.csv')
print(df_sev.shape)
df_sev.head(2).append(df_sev.tail(2))
(26639, 2)
|  | IDpol | ClaimAmount |
|---|---|---|
| 0 | 1552 | 995.20 |
| 1 | 1010996 | 1128.12 |
| 26637 | 2222064 | 767.55 |
| 26638 | 2254065 | 1500.00 |
# sum the claim amounts per policy ID (a policy can have more than one claim)
df_sev = df_sev.groupby('IDpol').sum()
df_sev.shape
(24950, 1)
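The drop from 26,639 rows to 24,950 means some policies filed more than one claim. A quick sanity check on the raw file (a sketch using the same path as above):

# sketch: confirm the row reduction comes from policies with multiple claims
raw_sev = pd.read_csv('../data/raw/freMTPL2sev.csv')
print(raw_sev['IDpol'].duplicated().sum())  # expect 26639 - 24950 = 1689 duplicated ids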
df_sev.head(2)
| IDpol | ClaimAmount |
|---|---|
| 139 | 303.00 |
| 190 | 1981.84 |
# left-join severity onto frequency; policies without any claim get ClaimAmount = NaN
df = df_freq.join(df_sev, how='left')
df.head(2)
| IDpol | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | NaN |
| 3 | 1 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | NaN |
df['ClaimAmount'] = df['ClaimAmount'].fillna(0)
df.head(2).append(df.tail(2)).append(df.dtypes,ignore_index=True)
|  | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.1 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0 |
| 1 | 1 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0 |
| 2 | 0 | 1 | C | 7 | 9 | 44 | 50 | B1 | Regular | 191 | R24 | 0 |
| 3 | 0 | 1 | E | 4 | 12 | 53 | 50 | B1 | Regular | 4116 | R24 | 0 |
| 4 | int64 | float64 | object | int64 | int64 | int64 | int64 | object | object | int64 | object | float64 |
# the severity model needs ClaimAmount > 0 (strictly positive),
# so zero out ClaimNb wherever the claim amount is 0
df.loc[(df["ClaimAmount"] == 0) & (df["ClaimNb"] >= 1), "ClaimNb"] = 0
# inspect the claim counts, then clip likely outliers and wrong data entries
df["ClaimNb"].value_counts().sort_index()
0    95262
1     4544
2      183
3        9
4        1
5        1
Name: ClaimNb, dtype: int64
df['ClaimNb'] = df['ClaimNb'].clip(upper=4)
sns.kdeplot(df.Exposure)
plt.xticks(np.arange(-0.2,1.3,0.1));
df['Exposure'].plot.hist(bins=10)
df.Exposure.describe()
# despite what the KDE plot suggests, Exposure has NO negative values;
# the KDE simply smooths the density past zero.
count    100000.000000
mean          0.583756
std           0.367475
min           0.002732
25%           0.220000
50%           0.590000
75%           1.000000
max           1.000000
Name: Exposure, dtype: float64
# clip Exposure at an upper bound of 0.9
df['Exposure'] = df['Exposure'].clip(upper=0.9)
df["ClaimAmount"].describe()
count    1.000000e+05
mean     1.098624e+02
std      4.793431e+03
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.404186e+06
Name: ClaimAmount, dtype: float64
sns.kdeplot(df['ClaimAmount'])
# 100k is a reasonable maximum amount; clip there
sns.kdeplot(df['ClaimAmount'].clip(upper=100_000))  # 0.1e6
df['ClaimAmount'] = df['ClaimAmount'].clip(upper=100_000)
# derive the modelling targets: pure premium, claim frequency, and average claim amount (severity)
df["PurePremium"] = df["ClaimAmount"] / df["Exposure"]
df["Frequency"] = df["ClaimNb"] / df["Exposure"]
df["AvgClaimAmount"] = df["ClaimAmount"] / np.fmax(df["ClaimNb"], 1)
df[df.ClaimAmount > 0].head()
| IDpol | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 139 | 1 | 0.75 | F | 7 | 1 | 61 | 50 | B12 | Regular | 27000 | R11 | 303.00 | 404.000000 | 1.333333 | 303.00 |
| 190 | 1 | 0.14 | B | 12 | 5 | 50 | 60 | B12 | Diesel | 56 | R25 | 1981.84 | 14156.000000 | 7.142857 | 1981.84 |
| 414 | 1 | 0.14 | E | 4 | 0 | 36 | 85 | B12 | Regular | 4792 | R11 | 1456.55 | 10403.928571 | 7.142857 | 1456.55 |
| 424 | 2 | 0.62 | F | 10 | 0 | 51 | 100 | B12 | Regular | 27000 | R11 | 10834.00 | 17474.193548 | 3.225806 | 5417.00 |
| 463 | 1 | 0.31 | A | 5 | 0 | 45 | 50 | B12 | Regular | 12 | R73 | 3986.67 | 12860.225806 | 3.225806 | 3986.67 |
import pandas_profiling
# the profiling report already exists; set report_exists = False to regenerate it
report_exists = True
if not report_exists:
    profile = pandas_profiling.ProfileReport(df)
    profile.to_file(output_file="../reports/html/pandas_profiling_report.html")
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn import set_config
set_config(display='diagram')
log_scale_transformer = make_pipeline(
FunctionTransformer(func=np.log),
StandardScaler()
)
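Density spans several orders of magnitude (values from 12 to 27,000 appear in the tables above), which is why it gets a log transform before standardization. A quick demo of what this pipeline does to a toy column:

# toy demo: log + standardize tames the huge dynamic range of Density
demo = np.array([[12.0], [1217.0], [27000.0]])
print(log_scale_transformer.fit_transform(demo).ravel())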
cols_ohe_before = ["VehBrand", "VehPower", "VehGas", "Region", "Area"]
cols_kbin_before = ["VehAge", "DrivAge"]
cols_log_scale = ["Density"]
cols_pass = ["BonusMalus"]
# NOTE: keep the column lists in the same order as the pipeline steps.
# the one-hot column names can only be recovered after fitting the column transformer;
# the final order is cols_ohe_after + cols_kbin_after + cols_log_scale + cols_pass.
ct = ColumnTransformer(
    [
        # ohe
        # (note: always make ohe the first item so feature names can be recovered later)
        ("ohe", OneHotEncoder(handle_unknown='ignore'), cols_ohe_before),
        # binning
        ("kbin", KBinsDiscretizer(n_bins=8, encode='onehot', strategy='quantile'), cols_kbin_before),
        # log and scale
        ("log_scale", log_scale_transformer, cols_log_scale),
        # keep these
        ("pass", "passthrough", cols_pass),
    ],
    remainder="drop",
)
ct
ColumnTransformer(transformers=[('ohe', OneHotEncoder(handle_unknown='ignore'),
                                 ['VehBrand', 'VehPower', 'VehGas', 'Region', 'Area']),
                                ('kbin', KBinsDiscretizer(n_bins=8),
                                 ['VehAge', 'DrivAge']),
                                ('log_scale',
                                 Pipeline(steps=[('functiontransformer',
                                                  FunctionTransformer(func=<ufunc 'log'>)),
                                                 ('standardscaler', StandardScaler())]),
                                 ['Density']),
                                ('pass', 'passthrough', ['BonusMalus'])])
X = ct.fit_transform(df)
df.shape, X.shape
((100000, 15), (100000, 71))
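The 71 output columns break down into one-hot levels, bin indicators, and the two numeric columns; a sanity-check sketch on the fitted transformer:

# sanity check: 71 = one-hot levels + kbin indicator columns + 1 log-scaled + 1 passthrough
n_ohe = sum(len(cats) for cats in ct.named_transformers_['ohe'].categories_)
n_kbin = int(ct.named_transformers_['kbin'].n_bins_.sum())
print(n_ohe + n_kbin + len(cols_log_scale) + len(cols_pass))  # expect 53 + 16 + 1 + 1 = 71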
# ohe is the first tuple in the transformer list, so the first index is 0;
# within each tuple, index 0 is the name and index 1 is the fitted transformer object.
# from the fitted OneHotEncoder we can then recover the generated feature names.
cols_ohe_after = ct.transformers_[0][1].get_feature_names().tolist()
kbin_n_bins = ct.transformers_[1][1].n_bins_[0]
cols_kbin_after = [f'{i}_{j}' for i in cols_kbin_before for j in range(kbin_n_bins)]
feature_names_after = cols_ohe_after + cols_kbin_after + cols_log_scale + cols_pass
print(f"shape df : {df.shape}")
print(f"shape X : {X.shape}")
print(f"len feature_names_after: {len(feature_names_after)}")
print(feature_names_after)
shape df : (100000, 15)
shape X : (100000, 71)
len feature_names_after: 71
['x0_B1', 'x0_B10', 'x0_B11', 'x0_B12', 'x0_B13', 'x0_B14', 'x0_B2', 'x0_B3', 'x0_B4', 'x0_B5', 'x0_B6', 'x1_4', 'x1_5', 'x1_6', 'x1_7', 'x1_8', 'x1_9', 'x1_10', 'x1_11', 'x1_12', 'x1_13', 'x1_14', 'x1_15', 'x2_Diesel', 'x2_Regular', 'x3_R11', 'x3_R21', 'x3_R22', 'x3_R23', 'x3_R24', 'x3_R25', 'x3_R26', 'x3_R31', 'x3_R41', 'x3_R42', 'x3_R43', 'x3_R52', 'x3_R53', 'x3_R54', 'x3_R72', 'x3_R73', 'x3_R74', 'x3_R82', 'x3_R83', 'x3_R91', 'x3_R93', 'x3_R94', 'x4_A', 'x4_B', 'x4_C', 'x4_D', 'x4_E', 'x4_F', 'VehAge_0', 'VehAge_1', 'VehAge_2', 'VehAge_3', 'VehAge_4', 'VehAge_5', 'VehAge_6', 'VehAge_7', 'DrivAge_0', 'DrivAge_1', 'DrivAge_2', 'DrivAge_3', 'DrivAge_4', 'DrivAge_5', 'DrivAge_6', 'DrivAge_7', 'Density', 'BonusMalus']
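The `x0_`, `x1_` prefixes come from the sklearn 0.23 `get_feature_names()` pinned above. On sklearn 1.1+ the fitted ColumnTransformer can emit readable names directly; a sketch, assuming the log pipeline is built with `FunctionTransformer(np.log, feature_names_out='one-to-one')` so that names survive it:

# sketch, sklearn >= 1.1 only: output names come straight from the ColumnTransformer,
# prefixed by step name (e.g. 'ohe__VehBrand_B12', 'pass__BonusMalus')
names = ct.get_feature_names_out().tolist()
print(len(names))  # expect 71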
np.array(X[0].todense())[0][-5:] # last five entries of the first row: three bin indicators, scaled Density, BonusMalus
array([ 0. , 1. , 0. , 0.69864446, 50. ])
feature_names_before = cols_ohe_before + cols_kbin_before + cols_log_scale + cols_pass
df[feature_names_before].head(2)
| IDpol | VehBrand | VehPower | VehGas | Region | Area | VehAge | DrivAge | Density | BonusMalus |
|---|---|---|---|---|---|---|---|---|---|
| 1 | B12 | 5 | Regular | R82 | D | 0 | 55 | 1217 | 50 |
| 3 | B12 | 5 | Regular | R82 | D | 0 | 55 | 1217 | 50 |
import scipy
import json
df.shape
(100000, 15)
df.head(2)
| IDpol | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 0 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
# df.to_csv('../data/processed/clean_data.csv.zip', compression='zip', index=False)
# writing with compression='zip' did not work in Google Colab, so write a plain csv and zip it manually.
df.to_csv('../data/processed/clean_data.csv', index=False)
# !zip ../data/processed/clean_data.csv.zip ../data/processed/clean_data.csv
# do not zip with the full path as above: the archive would contain the whole folder hierarchy.
# instead, cd into the target directory and zip there (next cell).
%%bash
cd ../data/processed
zip clean_data.csv.zip clean_data.csv
rm clean_data.csv
cd -
updating: clean_data.csv (deflated 84%)
/Users/poudel/github/Project_French_Motor_Claims/notebooks
type(X)
scipy.sparse.csr.csr_matrix
# scipy.sparse.save_npz?
scipy.sparse.save_npz('../data/processed/X.npz',X)
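The matching loader restores the matrix later; a quick round-trip check:

# round-trip check: reload the sparse matrix and confirm it is unchanged
X_loaded = scipy.sparse.load_npz('../data/processed/X.npz')
print(X_loaded.shape)            # (100000, 71)
print((X != X_loaded).nnz == 0)  # True -> identical entries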
X_dense = X.todense()
df_X = pd.DataFrame(data=X_dense, columns=feature_names_after)
df_X.to_csv('../data/processed/X.csv', index=False)
df_X.shape
(100000, 71)
%%bash
FILE="X.csv"
cd ../data/processed/
zip "$FILE".zip "$FILE"
du -sh "$FILE"
du -sh "$FILE".zip
rm "$FILE"
cd -
updating: X.csv (deflated 95%)
 29M    X.csv
1.3M    X.csv.zip
/Users/poudel/github/Project_French_Motor_Claims/notebooks
# save the feature lists (and preprocessing notes) as json
data_json = dict(
    cols_ohe_before=cols_ohe_before,
    cols_kbin=cols_kbin_before,
    cols_log_scale=cols_log_scale,
    cols_pass=cols_pass,
    feature_names_before=feature_names_before,
    feature_names_after=feature_names_after,
    desc="""
    df['ClaimNb'] = df['ClaimNb'].clip(upper=4)
    df['Exposure'] = df['Exposure'].clip(upper=0.9)
    df['ClaimAmount'] = df['ClaimAmount'].clip(upper=100_000)
    KBinsDiscretizer : n_bins=8
    """
)
with open("../data/processed/features.json","w") as fo:
json.dump(data_json, fo)
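A downstream notebook can restore the same column lists from this file; a minimal sketch:

# sketch: reload the saved feature metadata in a downstream notebook
with open("../data/processed/features.json") as fi:
    features = json.load(fi)
print(len(features["feature_names_after"]))  # expect 71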