import numpy as np
import pandas as pd
import seaborn as sns
import os, sys, time
import sklearn
import scipy
import matplotlib.pyplot as plt
sns.set()
import json
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_tweedie_deviance
from sklearn.metrics import auc
SEED = 100
pd.set_option('display.max_columns', 100)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%load_ext watermark
%watermark -iv
sklearn    0.22.1
scikitplot 0.3.7
numpy      1.19.1
pandas     1.1.1
json       2.0.9
seaborn    0.11.0
scipy      1.4.1
joblib     0.16.0
%%bash
pwd
/Users/poudel/github/Data_Science/a01_Modules/pyGAM/example
# ifile = '../data/processed/clean_data.csv.zip'
ifile = "https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/processed/clean_data.csv.zip?raw=true"
df = pd.read_csv(ifile, compression='zip')
print(df.shape)
df.head(2).append(df.tail(2))
(100000, 15)
|  | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
| 99998 | 0 | 0.90 | C | 7 | 9 | 44 | 50 | B1 | Regular | 191 | R24 | 0.0 | 0.0 | 0.0 | 0.0 |
| 99999 | 0 | 0.90 | E | 4 | 12 | 53 | 50 | B1 | Regular | 4116 | R24 | 0.0 | 0.0 | 0.0 | 0.0 |
# ifile = "../data/processed/X.npz"
ifile = os.path.expanduser("~/github/Project_French_Motor_Claims/data/processed/X.npz")
X = scipy.sparse.load_npz(ifile)
X.shape, type(X)
((100000, 71), scipy.sparse.csr.csr_matrix)
# ifile = '../data/processed/X.csv.zip'
# df_X = pd.read_csv(ifile, compression='zip')
# print(df_X.shape)
# df_X.head(2).append(df_X.tail(2))
# columns
"""
x0_B1 x0_B10 x0_B11 x0_B12 x0_B13 x0_B14 x0_B2 x0_B3 x0_B4 x0_B5
x0_B6 x1_4 x1_5 x1_6 x1_7 x1_8 x1_9 x1_10 x1_11 x1_12
x1_13 x1_14 x1_15 x2_Diesel x2_Regular x3_R11 x3_R21 x3_R22 x3_R23
x3_R24 x3_R25 x3_R26 x3_R31 x3_R41 x3_R42 x3_R43 x3_R52 x3_R53 x3_R54
x3_R72 x3_R73 x3_R74 x3_R82 x3_R83 x3_R91 x3_R93 x3_R94 x4_A x4_B
x4_C x4_D x4_E x4_F VehAge_0 VehAge_1 VehAge_2 VehAge_3
VehAge_4 VehAge_5 VehAge_6 VehAge_7 DrivAge_0 DrivAge_1 DrivAge_2
DrivAge_3 DrivAge_4 DrivAge_5 DrivAge_6 DrivAge_7 Density BonusMalus
X only has the transformed versions of these raw columns:
cols_ohe_before = ["VehBrand", "VehPower", "VehGas", "Region", "Area"]
cols_kbin_before = ["VehAge", "DrivAge"]
cols_log_scale = ["Density"]
cols_pass = ["BonusMalus"]
""";
df.head(2)
|  | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
np.array(X[0].todense())[0][-5:] # last five elements of the first row (the final column is the passthrough BonusMalus = 50)
array([ 0. , 1. , 0. , 0.69864446, 50. ])
from sklearn.model_selection import train_test_split
df_train, df_test, X_train, X_test = train_test_split(df, X.todense(), random_state=SEED)
df_train.shape, df_test.shape, X_train.shape, X_test.shape
((75000, 15), (25000, 15), (75000, 71), (25000, 71))
Ref: https://pygam.readthedocs.io/en/latest/notebooks/tour_of_pygam.html
| Method | Link function | Distribution |
|---|---|---|
| LinearGAM | identity | normal |
| LogisticGAM | logit | binomial |
| PoissonGAM | log | Poisson |
| GammaGAM | log | gamma |
| InvGaussGAM | log | inverse Gaussian |
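Each row in this table is its own pyGAM class; a minimal sketch of constructing them, including the equivalent generic `GAM` form (class and argument names per the pyGAM docs):
from pygam import LinearGAM, PoissonGAM, GAM
gam_normal  = LinearGAM()                              # identity link + normal distribution
gam_poisson = PoissonGAM()                             # log link + Poisson distribution
gam_generic = GAM(distribution='poisson', link='log')  # same model via the generic class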
LinearGAM: $\mathbb{E}[y \mid X]=\beta_{0}+f_{1}\left(X_{1}\right)+f_{2}\left(X_{2}, X_{3}\right)+\cdots+f_{M}\left(X_{N}\right)$
Parameters

Terms (see the sketch after this list):

- `l()` linear terms
- `s()` spline terms
- `f()` factor terms
- `te()` tensor products
- intercept
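A minimal sketch of composing these terms (the feature indices here are arbitrary placeholders, not tied to the 71 columns of `X`):
from pygam import LinearGAM, l, s, f, te
# l(): linear in feature 0; s(): spline on feature 1;
# f(): categorical factor at feature 2; te(): tensor product of features 3 and 4
gam_spec = LinearGAM(l(0) + s(1) + f(2) + te(3, 4))  # intercept term is added by default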
Callbacks

Callbacks are performed during each optimization iteration. It's also easy to write your own.

- `deviance` - model deviance
- `diffs` - differences of coefficient norm
- `accuracy` - model accuracy, for LogisticGAM only
- `coef` - coefficient logging
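Once a model has been fitted (as done below), the recorded callback values live in its `logs_` dictionary; a short sketch of inspecting the default callbacks:
# run after gam has been fitted (see the gridsearch cell below)
plt.plot(gam.logs_['deviance'])  # model deviance per optimization iteration
plt.plot(gam.logs_['diffs'])     # coefficient-norm differences per iteration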
import pygam
df_train.head(2)
|  | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 17853 | 0 | 0.35 | D | 6 | 7 | 25 | 147 | B1 | Diesel | 745 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
| 55890 | 0 | 0.61 | E | 5 | 1 | 24 | 72 | B3 | Diesel | 3673 | R25 | 0.0 | 0.0 | 0.0 | 0.0 |
y_train = df_train["AvgClaimAmount"].values
y_test = df_test["AvgClaimAmount"].values
%%time
gam = pygam.LinearGAM(n_splines=10).gridsearch(X_train, y_train)
gam.summary()
100% (11 of 11) |########################| Elapsed Time: 0:03:51 Time: 0:03:51
LinearGAM
=============================================== ==========================================================
Distribution:         NormalDist      Effective DoF:        67.4214
Link Function:        IdentityLink    Log Likelihood:       -1148621.477
Number of Samples:    75000           AIC:                  2297379.7967
                                      AICc:                 2297379.9234
                                      GCV:                  1789834.1192
                                      Scale:                1786938.1978
                                      Pseudo R-Squared:     0.0019
==========================================================================================================
Feature Function    Lambda     Rank    EDoF    P > x       Sig. Code
=================== ========== ======= ======= =========== =========
s(0)                [1000.]    10      2.0     3.20e-01
s(1)                [1000.]    10      1.0     5.96e-01
s(2)                [1000.]    10      1.0     8.65e-01
s(3)                [1000.]    10      1.0     7.54e-03    **
s(4)                [1000.]    10      1.0     6.50e-01
s(5)                [1000.]    10      1.0     7.21e-01
s(6)                [1000.]    10      1.0     5.34e-01
s(7)                [1000.]    10      1.0     2.19e-01
s(8)                [1000.]    10      1.0     8.93e-01
s(9)                [1000.]    10      1.0     9.36e-01
s(10)               [1000.]    10      0.0     9.48e-01
s(11)               [1000.]    10      1.0     1.45e-01
s(12)               [1000.]    10      1.0     6.30e-01
s(13)               [1000.]    10      1.0     9.99e-01
s(14)               [1000.]    10      1.0     8.30e-01
s(15)               [1000.]    10      1.0     9.99e-01
s(16)               [1000.]    10      1.0     8.99e-01
s(17)               [1000.]    10      1.0     9.97e-01
s(18)               [1000.]    10      1.0     2.42e-01
s(19)               [1000.]    10      1.0     9.52e-01
s(20)               [1000.]    10      1.0     9.58e-01
s(21)               [1000.]    10      1.0     9.26e-01
s(22)               [1000.]    10      0.0     9.85e-01
s(23)               [1000.]    10      1.0     2.02e-01
s(24)               [1000.]    10      0.0     2.01e-01
s(25)               [1000.]    10      1.0     8.89e-01
s(26)               [1000.]    10      1.0     9.74e-01
s(27)               [1000.]    10      1.0     9.70e-01
s(28)               [1000.]    10      1.0     7.62e-01
s(29)               [1000.]    10      1.0     9.63e-01
s(30)               [1000.]    10      1.0     4.49e-01
s(31)               [1000.]    10      1.0     5.19e-01
s(32)               [1000.]    10      1.0     3.74e-01
s(33)               [1000.]    10      1.0     8.68e-01
s(34)               [1000.]    10      1.0     9.94e-01
s(35)               [1000.]    10      1.0     4.80e-01
s(36)               [1000.]    10      1.0     7.30e-01
s(37)               [1000.]    10      1.0     5.25e-01
s(38)               [1000.]    10      1.0     4.69e-01
s(39)               [1000.]    10      1.0     9.86e-01
s(40)               [1000.]    10      1.0     8.00e-01
s(41)               [1000.]    10      1.0     9.06e-01
s(42)               [1000.]    10      1.0     9.80e-01
s(43)               [1000.]    10      1.0     5.60e-01
s(44)               [1000.]    10      1.0     6.78e-01
s(45)               [1000.]    10      1.0     5.06e-01
s(46)               [1000.]    10      0.0     9.73e-01
s(47)               [1000.]    10      1.0     1.54e-01
s(48)               [1000.]    10      1.0     6.93e-01
s(49)               [1000.]    10      1.0     9.96e-01
s(50)               [1000.]    10      1.0     3.37e-01
s(51)               [1000.]    10      1.0     5.18e-01
s(52)               [1000.]    10      0.0     5.36e-01
s(53)               [1000.]    10      1.0     5.31e-01
s(54)               [1000.]    10      1.0     5.95e-01
s(55)               [1000.]    10      1.0     9.89e-01
s(56)               [1000.]    10      1.0     6.76e-01
s(57)               [1000.]    10      1.0     1.96e-01
s(58)               [1000.]    10      1.0     9.40e-01
s(59)               [1000.]    10      1.0     9.99e-01
s(60)               [1000.]    10      0.0     1.10e-02    *
s(61)               [1000.]    10      1.0     3.82e-01
s(62)               [1000.]    10      1.0     4.75e-03    **
s(63)               [1000.]    10      1.0     4.91e-02    *
s(64)               [1000.]    10      1.0     9.63e-01
s(65)               [1000.]    10      1.0     8.52e-01
s(66)               [1000.]    10      1.0     8.71e-02    .
s(67)               [1000.]    10      1.0     8.52e-01
s(68)               [1000.]    10      0.0     1.27e-02    *
s(69)               [1000.]    10      2.5     5.89e-01
s(70)               [1000.]    10      1.9     1.11e-16    ***
intercept                      1       0.0     4.13e-07    ***
==========================================================================================================
Significance codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

WARNING: Fitting splines and a linear function to a feature introduces a model identifiability problem
which can cause p-values to appear significant when they are not.

WARNING: p-values calculated in this manner behave correctly for un-penalized models or models with known
smoothing parameters, but when smoothing parameters have been estimated, the p-values are typically lower
than they should be, meaning that the tests reject the null too readily.

CPU times: user 9min 2s, sys: 29.9 s, total: 9min 32s
Wall time: 3min 51s
/Users/poudel/opt/miniconda3/envs/gam/lib/python3.7/site-packages/ipykernel_launcher.py:2: UserWarning: KNOWN BUG: p-values computed in this summary are likely much smaller than they should be. Please do not make inferences based on these values! Collaborate on a solution, and stay up to date at: github.com/dswah/pyGAM/issues/163
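The 11 gridsearch steps correspond to pyGAM's default lambda grid, `np.logspace(-3, 3, 11)`; an equivalent explicit call (assuming this pyGAM version's defaults) would be:
lams = np.logspace(-3, 3, 11)  # 0.001 ... 1000, the default gridsearch grid
# equivalent to the fit above:
# gam = pygam.LinearGAM(n_splines=10).gridsearch(X_train, y_train, lam=lams)
Note that every term in the summary was assigned lambda = [1000.], the top of this grid, so a grid extending beyond 1000 might select even heavier smoothing.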
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_tweedie_deviance
from sklearn.metrics import auc
tr_preds = gam.predict(X_train)  # tr_* = train
tx_preds = gam.predict(X_test)   # tx_* = test
tr_mae = mean_absolute_error(y_train,tr_preds)
tx_mae = mean_absolute_error(y_test,tx_preds)
tr_mse = mean_squared_error(y_train, tr_preds)
tx_mse = mean_squared_error(y_test,tx_preds)
df_eval_gam = pd.DataFrame(
    {'train': [tr_mae, tr_mse],
     'test':  [tx_mae, tx_mse]},
    index=['mean_absolute_error', 'mean_squared_error'])
df_eval_gam
|  | train | test |
|---|---|---|
| mean_absolute_error | 1.686438e+02 | 1.655408e+02 |
| mean_squared_error | 1.785332e+06 | 1.647533e+06 |
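`mean_tweedie_deviance` was imported but never used; a sketch of how it could be added to this comparison (a hypothetical extension: the Tweedie power is a placeholder, and predictions are clipped because the metric requires strictly positive y_pred for 1 < power < 2):
# hypothetical extension, not part of the original evaluation
power = 1.9  # placeholder Tweedie power; should be chosen/tuned for claim-amount data
tr_tweedie = mean_tweedie_deviance(y_train, np.clip(tr_preds, 1e-6, None), power=power)
tx_tweedie = mean_tweedie_deviance(y_test,  np.clip(tx_preds, 1e-6, None), power=power)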