In this project we use the OpenML French Motor Third-Party Liability (freMTPL2) insurance claims data.
Data Source
The frequency dataset (freMTPL2freq) has 12 columns and 678,013 rows: one row per policy, with its risk features and the number of claims filed.
The severity dataset (freMTPL2sev) has 2 columns and 26,639 rows: one row per individual claim, with the claim amount and the policy ID.
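The notebook reads these tables from local csv files in `../data/raw/`. If they are missing, the same tables can be pulled from OpenML; a minimal sketch, assuming dataset ids 41214 (freMTPL2freq) and 41215 (freMTPL2sev), which should be verified on openml.org:

# sketch: fetch the raw tables from OpenML instead of reading local csv files
# (data ids 41214/41215 are assumptions; check them on openml.org)
from sklearn.datasets import fetch_openml

df_freq_oml = fetch_openml(data_id=41214, as_frame=True).frame  # policies with claim counts
df_sev_oml = fetch_openml(data_id=41215, as_frame=True).frame   # individual claim amounts
print(df_freq_oml.shape, df_sev_oml.shape)  # expect (678013, 12) and (26639, 2)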
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import sklearn
import matplotlib.pyplot as plt
sns.set()
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%load_ext watermark
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark
pandas           1.1.0
numpy            1.18.4
pandas_profiling 2.9.0
scipy            1.4.1
autopep8         1.5.2
json             2.0.9
sklearn          0.23.1
seaborn          0.11.0
# load only the first 100k rows (of 678,013) to keep this notebook light
df_freq = pd.read_csv('../data/raw/freMTPL2freq.csv', nrows=100_000)
print(df_freq.shape)
df_freq.head(2).append(df_freq.tail(2))
(100000, 12)
|  | IDpol | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 1 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 |
| 1 | 3.0 | 1 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 |
| 99998 | 1019554.0 | 0 | 1.00 | C | 7 | 9 | 44 | 50 | B1 | Regular | 191 | R24 |
| 99999 | 1019556.0 | 0 | 1.00 | E | 4 | 12 | 53 | 50 | B1 | Regular | 4116 | R24 |
# IDpol is read as float; cast it to integer and use it as the index
df_freq['IDpol'] = df_freq['IDpol'].astype(int)  # np.int is deprecated in newer numpy
df_freq.set_index('IDpol', inplace=True)
df_freq.head(2)
| IDpol | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 |
| 3 | 1 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 |
# now read the severity data (one row per individual claim)
df_sev = pd.read_csv('../data/raw/freMTPL2sev.csv')
print(df_sev.shape)
df_sev.head(2).append(df_sev.tail(2))
(26639, 2)
|  | IDpol | ClaimAmount |
|---|---|---|
| 0 | 1552 | 995.20 |
| 1 | 1010996 | 1128.12 |
| 26637 | 2222064 | 767.55 |
| 26638 | 2254065 | 1500.00 |
# sum the claim amounts per policy ID (a policy can have more than one claim)
df_sev = df_sev.groupby('IDpol').sum()
df_sev.shape
(24950, 1)
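The drop from 26,639 rows to 24,950 means some policies filed more than one claim. A quick sanity check on the raw file (a sketch using the same path as above):

# sketch: confirm the row reduction comes from policies with multiple claims
raw_sev = pd.read_csv('../data/raw/freMTPL2sev.csv')
print(raw_sev['IDpol'].duplicated().sum())  # expect 26639 - 24950 = 1689 duplicated ids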
df_sev.head(2)
| IDpol | ClaimAmount |
|---|---|
| 139 | 303.00 |
| 190 | 1981.84 |
# left-join severity onto frequency; policies without any claim get ClaimAmount = NaN
df = df_freq.join(df_sev, how='left')
df.head(2)
| IDpol | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | NaN |
| 3 | 1 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | NaN |
df['ClaimAmount'] = df['ClaimAmount'].fillna(0)
df.head(2).append(df.tail(2)).append(df.dtypes,ignore_index=True)
|  | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.1 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0 |
| 1 | 1 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0 |
| 2 | 0 | 1 | C | 7 | 9 | 44 | 50 | B1 | Regular | 191 | R24 | 0 |
| 3 | 0 | 1 | E | 4 | 12 | 53 | 50 | B1 | Regular | 4116 | R24 | 0 |
| 4 | int64 | float64 | object | int64 | int64 | int64 | int64 | object | object | int64 | object | float64 |
# the severity model needs ClaimAmount > 0 (strictly positive),
# so zero out ClaimNb wherever the claim amount is 0
df.loc[(df["ClaimAmount"] == 0) & (df["ClaimNb"] >= 1), "ClaimNb"] = 0
# inspect the claim counts, then clip likely outliers and wrong data entries
df["ClaimNb"].value_counts().sort_index()
0    95262
1     4544
2      183
3        9
4        1
5        1
Name: ClaimNb, dtype: int64
df['ClaimNb'] = df['ClaimNb'].clip(upper=4)
sns.kdeplot(df.Exposure)
plt.xticks(np.arange(-0.2,1.3,0.1));
df['Exposure'].plot.hist(bins=10)
df.Exposure.describe()
# despite what the KDE plot suggests, Exposure has NO negative values;
# the KDE simply smooths the density past zero.
count    100000.000000
mean          0.583756
std           0.367475
min           0.002732
25%           0.220000
50%           0.590000
75%           1.000000
max           1.000000
Name: Exposure, dtype: float64
# clip Exposure at an upper bound of 0.9
df['Exposure'] = df['Exposure'].clip(upper=0.9)
df["ClaimAmount"].describe()
count    1.000000e+05
mean     1.098624e+02
std      4.793431e+03
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.404186e+06
Name: ClaimAmount, dtype: float64
sns.kdeplot(df['ClaimAmount'])
# 100k is a reasonable maximum amount; clip there
sns.kdeplot(df['ClaimAmount'].clip(upper=100_000))  # 0.1e6
df['ClaimAmount'] = df['ClaimAmount'].clip(upper=100_000)
# derive the modelling targets: pure premium, claim frequency, and average claim amount (severity)
df["PurePremium"] = df["ClaimAmount"] / df["Exposure"]
df["Frequency"] = df["ClaimNb"] / df["Exposure"]
df["AvgClaimAmount"] = df["ClaimAmount"] / np.fmax(df["ClaimNb"], 1)
df[df.ClaimAmount > 0].head()
| IDpol | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 139 | 1 | 0.75 | F | 7 | 1 | 61 | 50 | B12 | Regular | 27000 | R11 | 303.00 | 404.000000 | 1.333333 | 303.00 |
| 190 | 1 | 0.14 | B | 12 | 5 | 50 | 60 | B12 | Diesel | 56 | R25 | 1981.84 | 14156.000000 | 7.142857 | 1981.84 |
| 414 | 1 | 0.14 | E | 4 | 0 | 36 | 85 | B12 | Regular | 4792 | R11 | 1456.55 | 10403.928571 | 7.142857 | 1456.55 |
| 424 | 2 | 0.62 | F | 10 | 0 | 51 | 100 | B12 | Regular | 27000 | R11 | 10834.00 | 17474.193548 | 3.225806 | 5417.00 |
| 463 | 1 | 0.31 | A | 5 | 0 | 45 | 50 | B12 | Regular | 12 | R73 | 3986.67 | 12860.225806 | 3.225806 | 3986.67 |
import pandas_profiling
# the profiling report already exists; set report_exists = False to regenerate it
report_exists = True
if not report_exists:
    profile = pandas_profiling.ProfileReport(df)
    profile.to_file(output_file="../reports/html/pandas_profiling_report.html")
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn import set_config
set_config(display='diagram')
log_scale_transformer = make_pipeline(
FunctionTransformer(func=np.log),
StandardScaler()
)
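Density spans several orders of magnitude (values from 12 to 27,000 appear in the tables above), which is why it gets a log transform before standardization. A quick demo of what this pipeline does to a toy column:

# toy demo: log + standardize tames the huge dynamic range of Density
demo = np.array([[12.0], [1217.0], [27000.0]])
print(log_scale_transformer.fit_transform(demo).ravel())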
cols_ohe_before = ["VehBrand", "VehPower", "VehGas", "Region", "Area"]
cols_kbin_before = ["VehAge", "DrivAge"]
cols_log_scale = ["Density"]
cols_pass = ["BonusMalus"]
# NOTE: keep the column lists in the same order as the pipeline steps.
# the one-hot column names can only be recovered after fitting the column transformer;
# the final order is cols_ohe_after + cols_kbin_after + cols_log_scale + cols_pass.
ct = ColumnTransformer(
    [
        # ohe
        # (note: always make ohe the first item so feature names can be recovered later)
        ("ohe", OneHotEncoder(handle_unknown='ignore'), cols_ohe_before),
        # binning
        ("kbin", KBinsDiscretizer(n_bins=8, encode='onehot', strategy='quantile'), cols_kbin_before),
        # log and scale
        ("log_scale", log_scale_transformer, cols_log_scale),
        # keep these
        ("pass", "passthrough", cols_pass),
    ],
    remainder="drop",
)
ct
ColumnTransformer(transformers=[('ohe', OneHotEncoder(handle_unknown='ignore'),
                                 ['VehBrand', 'VehPower', 'VehGas', 'Region', 'Area']),
                                ('kbin', KBinsDiscretizer(n_bins=8),
                                 ['VehAge', 'DrivAge']),
                                ('log_scale',
                                 Pipeline(steps=[('functiontransformer',
                                                  FunctionTransformer(func=<ufunc 'log'>)),
                                                 ('standardscaler', StandardScaler())]),
                                 ['Density']),
                                ('pass', 'passthrough', ['BonusMalus'])])
X = ct.fit_transform(df)
df.shape, X.shape
((100000, 15), (100000, 71))
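The 71 output columns break down into one-hot levels, bin indicators, and the two numeric columns; a sanity-check sketch on the fitted transformer:

# sanity check: 71 = one-hot levels + kbin indicator columns + 1 log-scaled + 1 passthrough
n_ohe = sum(len(cats) for cats in ct.named_transformers_['ohe'].categories_)
n_kbin = int(ct.named_transformers_['kbin'].n_bins_.sum())
print(n_ohe + n_kbin + len(cols_log_scale) + len(cols_pass))  # expect 53 + 16 + 1 + 1 = 71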
# ohe is the first tuple in the transformer list, so the first index is 0;
# within each tuple, index 0 is the name and index 1 is the fitted transformer object.
# from the fitted OneHotEncoder we can then recover the generated feature names.
cols_ohe_after = ct.transformers_[0][1].get_feature_names().tolist()
kbin_n_bins = ct.transformers_[1][1].n_bins_[0]
cols_kbin_after = [f'{i}_{j}' for i in cols_kbin_before for j in range(kbin_n_bins)]
feature_names_after = cols_ohe_after + cols_kbin_after + cols_log_scale + cols_pass
print(f"shape df : {df.shape}")
print(f"shape X : {X.shape}")
print(f"len feature_names_after: {len(feature_names_after)}")
print(feature_names_after)
shape df : (100000, 15)
shape X : (100000, 71)
len feature_names_after: 71
['x0_B1', 'x0_B10', 'x0_B11', 'x0_B12', 'x0_B13', 'x0_B14', 'x0_B2', 'x0_B3', 'x0_B4', 'x0_B5', 'x0_B6', 'x1_4', 'x1_5', 'x1_6', 'x1_7', 'x1_8', 'x1_9', 'x1_10', 'x1_11', 'x1_12', 'x1_13', 'x1_14', 'x1_15', 'x2_Diesel', 'x2_Regular', 'x3_R11', 'x3_R21', 'x3_R22', 'x3_R23', 'x3_R24', 'x3_R25', 'x3_R26', 'x3_R31', 'x3_R41', 'x3_R42', 'x3_R43', 'x3_R52', 'x3_R53', 'x3_R54', 'x3_R72', 'x3_R73', 'x3_R74', 'x3_R82', 'x3_R83', 'x3_R91', 'x3_R93', 'x3_R94', 'x4_A', 'x4_B', 'x4_C', 'x4_D', 'x4_E', 'x4_F', 'VehAge_0', 'VehAge_1', 'VehAge_2', 'VehAge_3', 'VehAge_4', 'VehAge_5', 'VehAge_6', 'VehAge_7', 'DrivAge_0', 'DrivAge_1', 'DrivAge_2', 'DrivAge_3', 'DrivAge_4', 'DrivAge_5', 'DrivAge_6', 'DrivAge_7', 'Density', 'BonusMalus']
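The `x0_`, `x1_` prefixes come from the sklearn 0.23 `get_feature_names()` pinned above. On sklearn 1.1+ the fitted ColumnTransformer can emit readable names directly; a sketch, assuming the log pipeline is built with `FunctionTransformer(np.log, feature_names_out='one-to-one')` so that names survive it:

# sketch, sklearn >= 1.1 only: output names come straight from the ColumnTransformer,
# prefixed by step name (e.g. 'ohe__VehBrand_B12', 'pass__BonusMalus')
names = ct.get_feature_names_out().tolist()
print(len(names))  # expect 71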
np.array(X[0].todense())[0][-5:] # last five entries of the first row: three bin indicators, scaled Density, BonusMalus
array([ 0. , 1. , 0. , 0.69864446, 50. ])
feature_names_before = cols_ohe_before + cols_kbin_before + cols_log_scale + cols_pass
df[feature_names_before].head(2)
| IDpol | VehBrand | VehPower | VehGas | Region | Area | VehAge | DrivAge | Density | BonusMalus |
|---|---|---|---|---|---|---|---|---|---|
| 1 | B12 | 5 | Regular | R82 | D | 0 | 55 | 1217 | 50 |
| 3 | B12 | 5 | Regular | R82 | D | 0 | 55 | 1217 | 50 |
import scipy
import json
df.shape
(100000, 15)
df.head(2)
| IDpol | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 0 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
# df.to_csv('../data/processed/clean_data.csv.zip', compression='zip', index=False)
# writing with compression='zip' did not work in Google Colab, so write a plain csv and zip it manually.
df.to_csv('../data/processed/clean_data.csv', index=False)
# !zip ../data/processed/clean_data.csv.zip ../data/processed/clean_data.csv
# do not zip with the full path as above: the archive would contain the whole folder hierarchy.
# instead, cd into the target directory and zip there (next cell).
%%bash
cd ../data/processed
zip clean_data.csv.zip clean_data.csv
rm clean_data.csv
cd -
updating: clean_data.csv (deflated 84%)
/Users/poudel/github/Project_French_Motor_Claims/notebooks
type(X)
scipy.sparse.csr.csr_matrix
# scipy.sparse.save_npz?
scipy.sparse.save_npz('../data/processed/X.npz',X)
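The matching loader restores the matrix later; a quick round-trip check:

# round-trip check: reload the sparse matrix and confirm it is unchanged
X_loaded = scipy.sparse.load_npz('../data/processed/X.npz')
print(X_loaded.shape)            # (100000, 71)
print((X != X_loaded).nnz == 0)  # True -> identical entries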
X_dense = X.todense()
df_X = pd.DataFrame(data=X_dense, columns=feature_names_after)
df_X.to_csv('../data/processed/X.csv', index=False)
df_X.shape
(100000, 71)
%%bash
FILE="X.csv"
cd ../data/processed/
zip "$FILE".zip "$FILE"
du -sh "$FILE"
du -sh "$FILE".zip
rm "$FILE"
cd -
updating: X.csv (deflated 95%)
 29M    X.csv
1.3M    X.csv.zip
/Users/poudel/github/Project_French_Motor_Claims/notebooks
# save the feature lists (and preprocessing notes) as json
data_json = dict(
    cols_ohe_before=cols_ohe_before,
    cols_kbin=cols_kbin_before,
    cols_log_scale=cols_log_scale,
    cols_pass=cols_pass,
    feature_names_before=feature_names_before,
    feature_names_after=feature_names_after,
    desc="""
    df['ClaimNb'] = df['ClaimNb'].clip(upper=4)
    df['Exposure'] = df['Exposure'].clip(upper=0.9)
    df['ClaimAmount'] = df['ClaimAmount'].clip(upper=100_000)
    KBinsDiscretizer : n_bins=8
    """
)
with open("../data/processed/features.json","w") as fo:
json.dump(data_json, fo)
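A downstream notebook can restore the same column lists from this file; a minimal sketch:

# sketch: reload the saved feature metadata in a downstream notebook
with open("../data/processed/features.json") as fi:
    features = json.load(fi)
print(len(features["feature_names_after"]))  # expect 71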