import numpy as np
import pandas as pd
import seaborn as sns
import os, sys, time
import sklearn
import scipy
import matplotlib.pyplot as plt
sns.set()
import json
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_tweedie_deviance
from sklearn.metrics import auc
SEED = 100
pd.set_option('display.max_columns', 100)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%load_ext watermark
%watermark -iv
sklearn    0.22.1
scikitplot 0.3.7
numpy      1.19.1
pandas     1.1.1
json       2.0.9
seaborn    0.11.0
scipy      1.4.1
joblib     0.16.0
%%bash
pwd
/Users/poudel/github/Data_Science/a01_Modules/pyGAM/example
# ifile = '../data/processed/clean_data.csv.zip'
ifile = "https://github.com/bhishanpdl/Datasets/blob/master/Projects/French_Motor_Claims/processed/clean_data.csv.zip?raw=true"
df = pd.read_csv(ifile, compression='zip')
print(df.shape)
df.head(2).append(df.tail(2))
(100000, 15)
|  | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
| 99998 | 0 | 0.90 | C | 7 | 9 | 44 | 50 | B1 | Regular | 191 | R24 | 0.0 | 0.0 | 0.0 | 0.0 |
| 99999 | 0 | 0.90 | E | 4 | 12 | 53 | 50 | B1 | Regular | 4116 | R24 | 0.0 | 0.0 | 0.0 | 0.0 |
# ifile = "../data/processed/X.npz"
ifile = os.path.expanduser("~/github/Project_French_Motor_Claims/data/processed/X.npz")
X = scipy.sparse.load_npz(ifile)
X.shape, type(X)
((100000, 71), scipy.sparse.csr.csr_matrix)
# ifile = '../data/processed/X.csv.zip'
# df_X = pd.read_csv(ifile, compression='zip')
# print(df_X.shape)
# df_X.head(2).append(df_X.tail(2))
# columns
"""
x0_B1 x0_B10 x0_B11 x0_B12 x0_B13 x0_B14 x0_B2 x0_B3 x0_B4 x0_B5
x0_B6 x1_4 x1_5 x1_6 x1_7 x1_8 x1_9 x1_10 x1_11 x1_12
x1_13 x1_14 x1_15 x2_Diesel x2_Regular x3_R11 x3_R21 x3_R22 x3_R23
x3_R24 x3_R25 x3_R26 x3_R31 x3_R41 x3_R42 x3_R43 x3_R52 x3_R53 x3_R54
x3_R72 x3_R73 x3_R74 x3_R82 x3_R83 x3_R91 x3_R93 x3_R94 x4_A x4_B
x4_C x4_D x4_E x4_F VehAge_0 VehAge_1 VehAge_2 VehAge_3
VehAge_4 VehAge_5 VehAge_6 VehAge_7 DrivAge_0 DrivAge_1 DrivAge_2
DrivAge_3 DrivAge_4 DrivAge_5 DrivAge_6 DrivAge_7 Density BonusMalus
X only has the transformed versions of these raw columns:
cols_ohe_before = ["VehBrand", "VehPower", "VehGas", "Region", "Area"]
cols_kbin_before = ["VehAge", "DrivAge"]
cols_log_scale = ["Density"]
cols_pass = ["BonusMalus"]
""";
df.head(2)
|  | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
np.array(X[0].todense())[0][-5:] # last five elements of the first row (the final column is the passthrough BonusMalus = 50)
array([ 0. , 1. , 0. , 0.69864446, 50. ])
from sklearn.model_selection import train_test_split
df_train, df_test, X_train, X_test = train_test_split(df, X.todense(), random_state=SEED)
df_train.shape, df_test.shape, X_train.shape, X_test.shape
((75000, 15), (25000, 15), (75000, 71), (25000, 71))
Ref: https://pygam.readthedocs.io/en/latest/notebooks/tour_of_pygam.html
| Method | Link function | Distribution |
|---|---|---|
| LinearGAM | identity | normal |
| LogisticGAM | logit | binomial |
| PoissonGAM | log | Poisson |
| GammaGAM | log | gamma |
| InvGaussGAM | log | inverse Gaussian |
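Each row in this table is its own pyGAM class; a minimal sketch of constructing them, including the equivalent generic `GAM` form (class and argument names per the pyGAM docs):
from pygam import LinearGAM, PoissonGAM, GAM
gam_normal  = LinearGAM()                              # identity link + normal distribution
gam_poisson = PoissonGAM()                             # log link + Poisson distribution
gam_generic = GAM(distribution='poisson', link='log')  # same model via the generic class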
LinearGAM: $\mathbb{E}[y \mid X]=\beta_{0}+f_{1}\left(X_{1}\right)+f_{2}\left(X_{2}, X_{3}\right)+\cdots+f_{M}\left(X_{N}\right)$
Parameters

Terms (see the sketch after this list):

- `l()` linear terms
- `s()` spline terms
- `f()` factor terms
- `te()` tensor products
- intercept
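A minimal sketch of composing these terms (the feature indices here are arbitrary placeholders, not tied to the 71 columns of `X`):
from pygam import LinearGAM, l, s, f, te
# l(): linear in feature 0; s(): spline on feature 1;
# f(): categorical factor at feature 2; te(): tensor product of features 3 and 4
gam_spec = LinearGAM(l(0) + s(1) + f(2) + te(3, 4))  # intercept term is added by default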
Callbacks

Callbacks are performed during each optimization iteration. It's also easy to write your own.

- `deviance` - model deviance
- `diffs` - differences of coefficient norm
- `accuracy` - model accuracy, for LogisticGAM only
- `coef` - coefficient logging
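Once a model has been fitted (as done below), the recorded callback values live in its `logs_` dictionary; a short sketch of inspecting the default callbacks:
# run after gam has been fitted (see the gridsearch cell below)
plt.plot(gam.logs_['deviance'])  # model deviance per optimization iteration
plt.plot(gam.logs_['diffs'])     # coefficient-norm differences per iteration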
import pygam
df_train.head(2)
|  | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | ClaimAmount | PurePremium | Frequency | AvgClaimAmount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 17853 | 0 | 0.35 | D | 6 | 7 | 25 | 147 | B1 | Diesel | 745 | R82 | 0.0 | 0.0 | 0.0 | 0.0 |
| 55890 | 0 | 0.61 | E | 5 | 1 | 24 | 72 | B3 | Diesel | 3673 | R25 | 0.0 | 0.0 | 0.0 | 0.0 |
y_train = df_train["AvgClaimAmount"].values
y_test = df_test["AvgClaimAmount"].values
%%time
gam = pygam.LinearGAM(n_splines=10).gridsearch(X_train, y_train)
gam.summary()
100% (11 of 11) |########################| Elapsed Time: 0:03:51 Time: 0:03:51
LinearGAM
=============================================== ==========================================================
Distribution:         NormalDist      Effective DoF:        67.4214
Link Function:        IdentityLink    Log Likelihood:       -1148621.477
Number of Samples:    75000           AIC:                  2297379.7967
                                      AICc:                 2297379.9234
                                      GCV:                  1789834.1192
                                      Scale:                1786938.1978
                                      Pseudo R-Squared:     0.0019
==========================================================================================================
Feature Function    Lambda     Rank    EDoF    P > x       Sig. Code
=================== ========== ======= ======= =========== =========
s(0)                [1000.]    10      2.0     3.20e-01
s(1)                [1000.]    10      1.0     5.96e-01
s(2)                [1000.]    10      1.0     8.65e-01
s(3)                [1000.]    10      1.0     7.54e-03    **
s(4)                [1000.]    10      1.0     6.50e-01
s(5)                [1000.]    10      1.0     7.21e-01
s(6)                [1000.]    10      1.0     5.34e-01
s(7)                [1000.]    10      1.0     2.19e-01
s(8)                [1000.]    10      1.0     8.93e-01
s(9)                [1000.]    10      1.0     9.36e-01
s(10)               [1000.]    10      0.0     9.48e-01
s(11)               [1000.]    10      1.0     1.45e-01
s(12)               [1000.]    10      1.0     6.30e-01
s(13)               [1000.]    10      1.0     9.99e-01
s(14)               [1000.]    10      1.0     8.30e-01
s(15)               [1000.]    10      1.0     9.99e-01
s(16)               [1000.]    10      1.0     8.99e-01
s(17)               [1000.]    10      1.0     9.97e-01
s(18)               [1000.]    10      1.0     2.42e-01
s(19)               [1000.]    10      1.0     9.52e-01
s(20)               [1000.]    10      1.0     9.58e-01
s(21)               [1000.]    10      1.0     9.26e-01
s(22)               [1000.]    10      0.0     9.85e-01
s(23)               [1000.]    10      1.0     2.02e-01
s(24)               [1000.]    10      0.0     2.01e-01
s(25)               [1000.]    10      1.0     8.89e-01
s(26)               [1000.]    10      1.0     9.74e-01
s(27)               [1000.]    10      1.0     9.70e-01
s(28)               [1000.]    10      1.0     7.62e-01
s(29)               [1000.]    10      1.0     9.63e-01
s(30)               [1000.]    10      1.0     4.49e-01
s(31)               [1000.]    10      1.0     5.19e-01
s(32)               [1000.]    10      1.0     3.74e-01
s(33)               [1000.]    10      1.0     8.68e-01
s(34)               [1000.]    10      1.0     9.94e-01
s(35)               [1000.]    10      1.0     4.80e-01
s(36)               [1000.]    10      1.0     7.30e-01
s(37)               [1000.]    10      1.0     5.25e-01
s(38)               [1000.]    10      1.0     4.69e-01
s(39)               [1000.]    10      1.0     9.86e-01
s(40)               [1000.]    10      1.0     8.00e-01
s(41)               [1000.]    10      1.0     9.06e-01
s(42)               [1000.]    10      1.0     9.80e-01
s(43)               [1000.]    10      1.0     5.60e-01
s(44)               [1000.]    10      1.0     6.78e-01
s(45)               [1000.]    10      1.0     5.06e-01
s(46)               [1000.]    10      0.0     9.73e-01
s(47)               [1000.]    10      1.0     1.54e-01
s(48)               [1000.]    10      1.0     6.93e-01
s(49)               [1000.]    10      1.0     9.96e-01
s(50)               [1000.]    10      1.0     3.37e-01
s(51)               [1000.]    10      1.0     5.18e-01
s(52)               [1000.]    10      0.0     5.36e-01
s(53)               [1000.]    10      1.0     5.31e-01
s(54)               [1000.]    10      1.0     5.95e-01
s(55)               [1000.]    10      1.0     9.89e-01
s(56)               [1000.]    10      1.0     6.76e-01
s(57)               [1000.]    10      1.0     1.96e-01
s(58)               [1000.]    10      1.0     9.40e-01
s(59)               [1000.]    10      1.0     9.99e-01
s(60)               [1000.]    10      0.0     1.10e-02    *
s(61)               [1000.]    10      1.0     3.82e-01
s(62)               [1000.]    10      1.0     4.75e-03    **
s(63)               [1000.]    10      1.0     4.91e-02    *
s(64)               [1000.]    10      1.0     9.63e-01
s(65)               [1000.]    10      1.0     8.52e-01
s(66)               [1000.]    10      1.0     8.71e-02    .
s(67)               [1000.]    10      1.0     8.52e-01
s(68)               [1000.]    10      0.0     1.27e-02    *
s(69)               [1000.]    10      2.5     5.89e-01
s(70)               [1000.]    10      1.9     1.11e-16    ***
intercept                      1       0.0     4.13e-07    ***
==========================================================================================================
Significance codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

WARNING: Fitting splines and a linear function to a feature introduces a model identifiability problem
which can cause p-values to appear significant when they are not.

WARNING: p-values calculated in this manner behave correctly for un-penalized models or models with known
smoothing parameters, but when smoothing parameters have been estimated, the p-values are typically lower
than they should be, meaning that the tests reject the null too readily.

CPU times: user 9min 2s, sys: 29.9 s, total: 9min 32s
Wall time: 3min 51s
/Users/poudel/opt/miniconda3/envs/gam/lib/python3.7/site-packages/ipykernel_launcher.py:2: UserWarning: KNOWN BUG: p-values computed in this summary are likely much smaller than they should be. Please do not make inferences based on these values! Collaborate on a solution, and stay up to date at: github.com/dswah/pyGAM/issues/163
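The 11 gridsearch steps correspond to pyGAM's default lambda grid, `np.logspace(-3, 3, 11)`; an equivalent explicit call (assuming this pyGAM version's defaults) would be:
lams = np.logspace(-3, 3, 11)  # 0.001 ... 1000, the default gridsearch grid
# equivalent to the fit above:
# gam = pygam.LinearGAM(n_splines=10).gridsearch(X_train, y_train, lam=lams)
Note that every term in the summary was assigned lambda = [1000.], the top of this grid, so a grid extending beyond 1000 might select even heavier smoothing.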
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_tweedie_deviance
from sklearn.metrics import auc
tr_preds = gam.predict(X_train)  # tr_* = train
tx_preds = gam.predict(X_test)   # tx_* = test
tr_mae = mean_absolute_error(y_train,tr_preds)
tx_mae = mean_absolute_error(y_test,tx_preds)
tr_mse = mean_squared_error(y_train, tr_preds)
tx_mse = mean_squared_error(y_test,tx_preds)
df_eval_gam = pd.DataFrame(
    {'train': [tr_mae, tr_mse],
     'test':  [tx_mae, tx_mse]},
    index=['mean_absolute_error', 'mean_squared_error'])
df_eval_gam
|  | train | test |
|---|---|---|
| mean_absolute_error | 1.686438e+02 | 1.655408e+02 |
| mean_squared_error | 1.785332e+06 | 1.647533e+06 |
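`mean_tweedie_deviance` was imported but never used; a sketch of how it could be added to this comparison (a hypothetical extension: the Tweedie power is a placeholder, and predictions are clipped because the metric requires strictly positive y_pred for 1 < power < 2):
# hypothetical extension, not part of the original evaluation
power = 1.9  # placeholder Tweedie power; should be chosen/tuned for claim-amount data
tr_tweedie = mean_tweedie_deviance(y_train, np.clip(tr_preds, 1e-6, None), power=power)
tx_tweedie = mean_tweedie_deviance(y_test,  np.clip(tx_preds, 1e-6, None), power=power)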