Description

The Allstate Corporation is an American insurance company that is in the United States. The company also has personal lines insurance operations in Canada.

Data Source Kaggle: https://www.kaggle.com/c/allstate-claims-severity/data

Imports

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
pd.plotting.register_matplotlib_converters()

from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot') 

# random state
SEED=100
np.random.seed(SEED)

[(x.__name__,x.__version__) for x in [np,pd,sns]]
Out[4]:
[('numpy', '1.18.1'), ('pandas', '1.0.1'), ('seaborn', '0.9.0')]
In [65]:
%load_ext autoreload
%autoreload 2
In [66]:
from bhishan import bp

Load the data

In [5]:
df_train_raw = pd.read_csv('../data/raw/train.csv')
df_test_raw = pd.read_csv('../data/raw/test.csv')


print(df_train_raw.shape)
df_train_raw.head(2)
(188318, 132)
Out[5]:
id cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 ... cont6 cont7 cont8 cont9 cont10 cont11 cont12 cont13 cont14 loss
0 1 A B A B A A A A B ... 0.718367 0.335060 0.30260 0.67135 0.83510 0.569745 0.594646 0.822493 0.714843 2213.18
1 2 A B A A A A A A B ... 0.438917 0.436585 0.60087 0.35127 0.43919 0.338312 0.366307 0.611431 0.304496 1283.60

2 rows × 132 columns

Count plots

In [10]:
cols_cat = [i for i in df_train_raw.columns
            if i.startswith('cat')]
len(cols_cat)
Out[10]:
116
In [14]:
n_cols = 4
n_rows = len(cols_cat)//4
In [16]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
In [17]:
for i in range(n_rows):
    fig,ax = plt.subplots(nrows=1,ncols=n_cols,
                         sharey=True,
                         figsize=(12,8))
    for j in range(n_cols):
        sns.countplot(x=cols_cat[i*n_cols+j],
                     data=df_train_raw,
                     ax=ax[j])
/Users/poudel/miniconda3/envs/tf2/lib/python3.7/site-packages/ipykernel_launcher.py:4: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  after removing the cwd from sys.path.

Correlation for Continuous Variables

In [18]:
df_corr = df_train_raw.corr()

df_corr.head()
Out[18]:
id cont1 cont2 cont3 cont4 cont5 cont6 cont7 cont8 cont9 cont10 cont11 cont12 cont13 cont14 loss
id 1.000000 0.002130 0.000783 0.000816 0.002578 0.000775 0.001426 0.002209 0.005534 0.001870 0.002582 0.001075 0.000889 0.000442 -0.004924 -0.001122
cont1 0.002130 1.000000 -0.085180 -0.445431 0.367549 -0.025230 0.758315 0.367384 0.361163 0.929912 0.808551 0.596090 0.614225 0.534850 0.056688 -0.010237
cont2 0.000783 -0.085180 1.000000 0.455861 0.038693 0.191427 0.015864 0.048187 0.137468 -0.032729 0.063526 0.116824 0.106250 0.023335 -0.045584 0.141528
cont3 0.000816 -0.445431 0.455861 1.000000 -0.341633 0.089417 -0.349278 0.097516 -0.185432 -0.417054 -0.325562 0.025271 0.006111 -0.418203 -0.039592 0.111053
cont4 0.002578 0.367549 0.038693 -0.341633 1.000000 0.163748 0.220932 -0.115064 0.528740 0.328961 0.283294 0.120927 0.130453 0.179342 0.017445 -0.035831
In [49]:
def get_most_correlated_features(df,threshold=0.5):
    """Get the most correlated features above given threshold.
    
    Note: 
    1. Only numerical features have correlation.
    2. Here we only get absolute correlation.
    
    """

    df1 = (df.corr()
    .abs()
    .unstack()
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={'level_0':'feature1',
                    'level_1':'feature2',
                    0:'corr'})
    .query('feature1 != feature2')
    .assign(
    tmp = lambda dfx: dfx[['feature1', 'feature2']]\
                .apply(lambda x: '_'.join(sorted(tuple(x))),
                       axis=1)
        )
    .drop_duplicates('tmp')
    .drop('tmp',axis=1)
    .query('corr > @threshold')
    )

    return df1
In [51]:
df_corr_most = get_most_correlated_features(df_train_raw,threshold=0.5)

df_corr_most.style.background_gradient(subset=['corr'])
Out[51]:
feature1 feature2 corr
16 cont12 cont11 0.994384
18 cont9 cont1 0.929912
20 cont6 cont10 0.883351
22 cont13 cont6 0.815091
24 cont1 cont10 0.808551
26 cont9 cont6 0.797544
28 cont10 cont9 0.785697
30 cont12 cont6 0.785144
32 cont11 cont6 0.773745
34 cont6 cont1 0.758315
36 cont7 cont11 0.747108
38 cont7 cont12 0.742712
40 cont10 cont12 0.713812
42 cont13 cont10 0.707876
44 cont11 cont10 0.702896
46 cont7 cont6 0.658918
48 cont13 cont9 0.642028
50 cont9 cont12 0.626656
52 cont12 cont1 0.614225
54 cont9 cont11 0.608000
56 cont11 cont1 0.596090
58 cont1 cont13 0.534850
60 cont8 cont4 0.528740
In [56]:
for c1,c2 in zip(df_corr_most['feature1'],
                 df_corr_most['feature2']):
    sns.pairplot(df_train_raw, size=6,
                 x_vars=c1,y_vars=c2 )
    
    #sns.regplot(data=df_train_raw,x=c1,y=c2 )
    plt.show()
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/seaborn/axisgrid.py:2065: UserWarning: The `size` parameter has been renamed to `height`; pleaes update your code.
  warnings.warn(msg, UserWarning)

Continuous Variables

In [58]:
df_train_raw.skew().sort_values()
Out[58]:
cont2    -0.310941
cont3    -0.010002
id       -0.002155
cont14    0.248674
cont11    0.280821
cont12    0.291992
cont10    0.355001
cont13    0.380742
cont4     0.416096
cont6     0.461214
cont1     0.516424
cont8     0.676634
cont5     0.681622
cont7     0.826053
cont9     1.072429
loss      3.794958
dtype: float64
In [62]:
cols_cont = [i for i in df_train_raw.columns
            if i.startswith('cont')]

print(len(cols_cont), cols_cont)
14 ['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'cont14']
In [63]:
n_cols = 2
n_rows = len(cols_cont)//n_cols

for i in range(n_rows):
    fg,ax = plt.subplots(nrows=1,ncols=n_cols,figsize=(12, 8))
    for j in range(n_cols):
        sns.violinplot(y=cols_cont[i*n_cols+j],
                       data=df_train_raw,
                       ax=ax[j])
In [ ]: