Description

The Allstate Corporation is an American insurance company headquartered in the United States. The company also has personal lines insurance operations in Canada.

Data source (Kaggle): https://www.kaggle.com/c/allstate-claims-severity/data

Imports

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
pd.plotting.register_matplotlib_converters()

from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot') 

# random state
SEED=100
np.random.seed(SEED)

[(x.__name__,x.__version__) for x in [np,pd,sns]]
Out[1]:
[('numpy', '1.18.1'), ('pandas', '1.0.1'), ('seaborn', '0.9.0')]
In [2]:
from scipy.special import boxcox1p

Load the data

In [3]:
df_train = pd.read_csv('../data/raw/train.csv')
df_test = pd.read_csv('../data/raw/test.csv')

print(df_train.shape)
df_train.head(2)
(188318, 132)
Out[3]:
id cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 ... cont6 cont7 cont8 cont9 cont10 cont11 cont12 cont13 cont14 loss
0 1 A B A B A A A A B ... 0.718367 0.335060 0.30260 0.67135 0.83510 0.569745 0.594646 0.822493 0.714843 2213.18
1 2 A B A A A A A A B ... 0.438917 0.436585 0.60087 0.35127 0.43919 0.338312 0.366307 0.611431 0.304496 1283.60

2 rows × 132 columns

In [4]:
df_train.shape, df_test.shape
Out[4]:
((188318, 132), (125546, 131))
In [5]:
df_train.columns.difference(df_test.columns)
Out[5]:
Index(['loss'], dtype='object')

Data Manipulation

Transform skewed numerical features

In [6]:
s = df_train.skew()
s
Out[6]:
id       -0.002155
cont1     0.516424
cont2    -0.310941
cont3    -0.010002
cont4     0.416096
cont5     0.681622
cont6     0.461214
cont7     0.826053
cont8     0.676634
cont9     1.072429
cont10    0.355001
cont11    0.280821
cont12    0.291992
cont13    0.380742
cont14    0.248674
loss      3.794958
dtype: float64
In [7]:
# flag highly skewed numeric variables (Box-Cox candidates)
threshold = 0.25
cols_high_skew = s[s.abs()>threshold].index.tolist()

print(cols_high_skew)
['cont1', 'cont2', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'loss']
In [9]:
cols_cont = [i for i in df_train.columns 
             if i.startswith('cont')]
print(cols_cont)

# Box-Cox transform every continuous feature; here 0.25 is the
# Box-Cox lambda, not the skew threshold above
lam = 0.25
for c in cols_cont:
    df_train[c+'_boxcox1p'] = boxcox1p(df_train[c].to_numpy(), lam)
    df_test[c+'_boxcox1p'] = boxcox1p(df_test[c].to_numpy(), lam)
['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'cont14']
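
As a quick sanity check, the skew of the transformed features can be compared with the originals (a minimal sketch using the columns created above):

In [ ]:
# skewness before vs. after the Box-Cox transform
before = df_train[cols_cont].skew()
after = df_train[[c + '_boxcox1p' for c in cols_cont]].skew()
print(pd.DataFrame({'before': before.to_numpy(),
                    'after': after.to_numpy()}, index=cols_cont))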

Log-transform the target

In [10]:
df_train['loss_log1p'] = np.log1p(df_train['loss'])
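
Since models will be fit on loss_log1p, predictions have to be mapped back to the original scale; np.expm1 is the inverse of np.log1p. A minimal sketch (pred_log stands in for hypothetical model predictions):

In [ ]:
# invert the log1p transform with expm1
pred_log = df_train['loss_log1p'].to_numpy()   # stand-in for model output
pred_loss = np.expm1(pred_log)
np.allclose(pred_loss, df_train['loss'])       # should be True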

Categorical features

In [11]:
cols_cat = [i for i in df_train.columns
           if i.startswith('cat')]

print(cols_cat)
['cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat19', 'cat20', 'cat21', 'cat22', 'cat23', 'cat24', 'cat25', 'cat26', 'cat27', 'cat28', 'cat29', 'cat30', 'cat31', 'cat32', 'cat33', 'cat34', 'cat35', 'cat36', 'cat37', 'cat38', 'cat39', 'cat40', 'cat41', 'cat42', 'cat43', 'cat44', 'cat45', 'cat46', 'cat47', 'cat48', 'cat49', 'cat50', 'cat51', 'cat52', 'cat53', 'cat54', 'cat55', 'cat56', 'cat57', 'cat58', 'cat59', 'cat60', 'cat61', 'cat62', 'cat63', 'cat64', 'cat65', 'cat66', 'cat67', 'cat68', 'cat69', 'cat70', 'cat71', 'cat72', 'cat73', 'cat74', 'cat75', 'cat76', 'cat77', 'cat78', 'cat79', 'cat80', 'cat81', 'cat82', 'cat83', 'cat84', 'cat85', 'cat86', 'cat87', 'cat88', 'cat89', 'cat90', 'cat91', 'cat92', 'cat93', 'cat94', 'cat95', 'cat96', 'cat97', 'cat98', 'cat99', 'cat100', 'cat101', 'cat102', 'cat103', 'cat104', 'cat105', 'cat106', 'cat107', 'cat108', 'cat109', 'cat110', 'cat111', 'cat112', 'cat113', 'cat114', 'cat115', 'cat116']
In [12]:
# categorical features with more than 10 distinct levels
df_train[cols_cat].nunique()[lambda x: x>10]
Out[12]:
cat99      16
cat100     15
cat101     19
cat103     13
cat104     17
cat105     20
cat106     17
cat107     20
cat108     11
cat109     84
cat110    131
cat111     16
cat112     51
cat113     61
cat114     19
cat115     23
cat116    326
dtype: int64
In [13]:
large_cats = df_train[cols_cat].nunique()[lambda x: x>50].index.tolist()

print(large_cats)
['cat109', 'cat110', 'cat112', 'cat113', 'cat116']
In [14]:
small_cats = df_train[cols_cat].nunique()[lambda x: x<=50].index.tolist()
In [15]:
df_train[large_cats[0]].value_counts(normalize=True).mul(100)[lambda x: x>0.1]
Out[15]:
BI    81.202009
AB    11.646789
BU     1.668454
K      1.592519
G      0.718466
BQ     0.566595
N      0.244799
M      0.236833
BO     0.174704
BH     0.136471
D      0.113638
AR     0.110451
AT     0.106203
Name: cat109, dtype: float64
In [16]:
# a handful of levels carry ~99.9% of the values; group the rare ones into 'Others'.
In [17]:
cat = large_cats[0]
idx_keep = df_train[cat].value_counts(normalize=True).mul(100)[lambda x: x>0.1].index.tolist()

cond_train = df_train[cat].isin(idx_keep)
df_train.loc[~cond_train,cat] = 'Others'

cond_test = df_test[cat].isin(idx_keep)
df_test.loc[~cond_test,cat] = 'Others'

df_train[cat].nunique()
Out[17]:
14
In [18]:
# repeat for the next large-cardinality column
In [19]:
cat = large_cats[1]

# display(df_train[cat].value_counts(normalize=True).mul(100)[lambda x: x>0.1])

idx_keep = df_train[cat].value_counts(normalize=True).mul(100)[lambda x: x>0.1].index.tolist()


cond_train = df_train[cat].isin(idx_keep)
df_train.loc[~cond_train,cat] = 'Others'

cond_test = df_test[cat].isin(idx_keep)
df_test.loc[~cond_test,cat] = 'Others'

df_train[cat].nunique()
Out[19]:
40
In [20]:
cat = large_cats[2]

# display(df_train[cat].value_counts(normalize=True).mul(100)[lambda x: x>0.1])

idx_keep = df_train[cat].value_counts(normalize=True).mul(100)[lambda x: x>0.1].index.tolist()


cond_train = df_train[cat].isin(idx_keep)
df_train.loc[~cond_train,cat] = 'Others'

cond_test = df_test[cat].isin(idx_keep)
df_test.loc[~cond_test,cat] = 'Others'

df_train[cat].nunique()
Out[20]:
50
In [21]:
cat = large_cats[3]

# display(df_train[cat].value_counts(normalize=True).mul(100)[lambda x: x>0.1])

idx_keep = df_train[cat].value_counts(normalize=True).mul(100)[lambda x: x>0.1].index.tolist()

cond_train = df_train[cat].isin(idx_keep)
df_train.loc[~cond_train,cat] = 'Others'

cond_test = df_test[cat].isin(idx_keep)
df_test.loc[~cond_test,cat] = 'Others'

df_train[cat].nunique()
Out[21]:
42
In [22]:
cat = large_cats[4]

# display(df_train[cat].value_counts(normalize=True).mul(100)[lambda x: x>0.5])

idx_keep = df_train[cat].value_counts(normalize=True).mul(100)[lambda x: x>0.5].index.tolist()

cond_train = df_train[cat].isin(idx_keep)
df_train.loc[~cond_train,cat] = 'Others'

cond_test = df_test[cat].isin(idx_keep)
df_test.loc[~cond_test,cat] = 'Others'

df_train[cat].nunique()
Out[22]:
37
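
The five cells above apply the same rare-level grouping with per-column frequency thresholds. An equivalent loop, as a compact sketch (the thresholds mirror the cells above):

In [ ]:
# group rare levels into 'Others', learning the kept levels from train only
thresholds = {'cat109': 0.1, 'cat110': 0.1, 'cat112': 0.1,
              'cat113': 0.1, 'cat116': 0.5}   # percent
for cat, thr in thresholds.items():
    keep = (df_train[cat].value_counts(normalize=True)
            .mul(100)[lambda x: x > thr].index)
    df_train.loc[~df_train[cat].isin(keep), cat] = 'Others'
    df_test.loc[~df_test[cat].isin(keep), cat] = 'Others'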
In [23]:
df_dummies_train = pd.concat([
    pd.get_dummies(df_train[cat],prefix='dummy_'+cat)
    for cat in cols_cat
], axis=1)

df_dummies_train.iloc[:2,:2]
Out[23]:
dummy_cat1_A dummy_cat1_B
0 1 0
1 1 0
In [24]:
df_dummies_test = pd.concat([
    pd.get_dummies(df_test[cat],prefix='dummy_'+cat)
    for cat in cols_cat
], axis=1)
In [25]:
df_train = pd.concat([df_train, df_dummies_train ],axis=1)
df_test = pd.concat([df_test, df_dummies_test ],axis=1)
In [26]:
df_train.shape, df_test.shape
Out[26]:
((188318, 816), (125546, 811))
In [27]:
df_train.columns.difference(df_test.columns)
Out[27]:
Index(['dummy_cat101_N', 'dummy_cat101_U', 'dummy_cat102_H', 'dummy_cat102_J',
       'dummy_cat105_R', 'dummy_cat105_S', 'dummy_cat111_D', 'dummy_cat114_X',
       'dummy_cat89_I', 'dummy_cat90_G', 'dummy_cat92_F', 'loss',
       'loss_log1p'],
      dtype='object')
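
Apart from loss and loss_log1p, the train-only columns are dummies for rare levels that never occur in the test split. A minimal sketch to reconcile the feature spaces, assuming a missing dummy simply means zero:

In [ ]:
# add train-only dummy columns to test, filled with zeros
missing = (df_train.columns.difference(df_test.columns)
           .drop(['loss', 'loss_log1p']))
for c in missing:
    df_test[c] = 0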

Save the cleaned data

In [28]:
# 'id' already identifies rows, so skip writing the index
df_train.to_csv('../data/processed/train_cleaned_encoded.csv', index=False)
df_test.to_csv('../data/processed/test_cleaned_encoded.csv', index=False)
In [29]:
!ls ../data/processed/
test_cleaned_encoded.csv  train_cleaned_encoded.csv
In [30]:
!du -sh ../data/processed/train_cleaned_encoded.csv
369M	../data/processed/train_cleaned_encoded.csv
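
At 369 MB, the one-hot encoded CSV is bulky. A possible alternative, assuming pyarrow (or fastparquet) is installed, is Parquet, which compresses well and preserves dtypes:

In [ ]:
# same frames, typically a fraction of the CSV size
df_train.to_parquet('../data/processed/train_cleaned_encoded.parquet')
df_test.to_parquet('../data/processed/test_cleaned_encoded.parquet')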