The Allstate Corporation is an American insurance company based in the United States. The company also has personal lines insurance operations in Canada.
Data Source Kaggle: https://www.kaggle.com/c/allstate-claims-severity/data
import numpy as np
import pandas as pd
import seaborn as sns
pd.plotting.register_matplotlib_converters()
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot')
# Seed NumPy's global RNG so that random draws are repeatable across runs.
SEED = 100
np.random.seed(SEED)

# (name, version) pairs of the core libraries, for the notebook record.
[(lib.__name__, lib.__version__) for lib in (np, pd, sns)]
from scipy.special import boxcox1p
# Load the raw train and test splits.
raw_paths = ('../data/raw/train.csv', '../data/raw/test.csv')
df_train, df_test = (pd.read_csv(path) for path in raw_paths)

# Quick sanity checks: size of train and a peek at its first rows.
print(df_train.shape)
df_train.head(2)
df_train.shape, df_test.shape

# Columns that exist in train but not in test.
df_train.columns.difference(df_test.columns)
# Per-column sample skewness of the training frame.
# numeric_only=True is required on pandas >= 2.0, where the default changed
# to False and the string-valued columns would make skew() raise; on older
# versions it simply makes the previous implicit behaviour explicit.
s = df_train.skew(numeric_only=True)
s

# Columns whose absolute skewness exceeds the cutoff (listed for inspection).
threshold = 0.25
cols_hight_skew = s[s.abs() > threshold].index.tolist()
print(cols_hight_skew)
from scipy.special import boxcox1p
# Continuous features are the columns whose name starts with 'cont'.
cols_cont = [name for name in df_train.columns if name.startswith('cont')]
print(cols_cont)

# NOTE: `threshold` is reused here as the Box-Cox lambda handed to boxcox1p.
threshold = 0.25
for c in cols_cont:
    new_col = c + '_boxcox1p'
    # Same transform, same lambda, applied to both splits.
    for frame in (df_train, df_test):
        frame[new_col] = boxcox1p(frame[c].to_numpy(), threshold)

# log1p of 'loss', added to the training frame only.
df_train['loss_log1p'] = np.log1p(df_train['loss'])
# Categorical features are the columns whose name starts with 'cat'.
cols_cat = [name for name in df_train.columns if name.startswith('cat')]
print(cols_cat)

# Number of distinct levels per categorical column (computed once, on train).
n_levels = df_train[cols_cat].nunique()

# Columns with more than 10 levels, for inspection.
n_levels[n_levels > 10]

# Split the categorical columns at 50 distinct levels.
large_cats = n_levels[n_levels > 50].index.tolist()
print(large_cats)
small_cats = n_levels[n_levels <= 50].index.tolist()
def _lump_rare_levels(train, test, col, min_pct, other_label='Others'):
    """Collapse infrequent levels of one categorical column into a single label.

    Level frequencies are measured on ``train`` only. In both frames, every
    value of ``col`` whose level does not account for more than ``min_pct``
    percent of the train rows is overwritten in place with ``other_label``;
    this also covers test values that never occur in train.

    Args:
        train: Training frame; used to measure frequencies and modified in place.
        test: Test frame; modified in place using the train-derived level set.
        col: Name of the categorical column to process.
        min_pct: Cutoff in percent; levels with a train share strictly greater
            than this are kept.
        other_label: Replacement value for the lumped levels.
    """
    share_pct = train[col].value_counts(normalize=True).mul(100)
    keep = share_pct[share_pct > min_pct].index
    for frame in (train, test):
        frame.loc[~frame[col].isin(keep), col] = other_label


# Share (in %) of each level of the first high-cardinality column, restricted
# to levels above 0.1%.
df_train[large_cats[0]].value_counts(normalize=True).mul(100)[lambda x: x>0.1]
# very few categories have 99.9% of the values.

# Cutoffs (in % of train rows), matched by position to `large_cats`: the first
# four columns keep levels above 0.1%, the fifth uses a stricter 0.5%.
# zip() stops at the shorter sequence, so the loop never indexes past the end
# of `large_cats`, and columns beyond the fifth are left untouched as before.
min_pct_by_position = (0.1, 0.1, 0.1, 0.1, 0.5)
for cat, min_pct in zip(large_cats, min_pct_by_position):
    _lump_rare_levels(df_train, df_test, cat, min_pct)
    # Remaining number of distinct levels in train after lumping.
    print(cat, df_train[cat].nunique())
def _one_hot(frame, levels_by_col):
    """One-hot encode categorical columns against a fixed level set per column.

    Each column is cast to a categorical dtype with the supplied levels before
    encoding, so the output always has one ``dummy_<col>_<level>`` column per
    supplied level, in the supplied order. Values of ``frame`` that are not
    among the levels become missing and therefore encode as an all-zero row.

    Args:
        frame: DataFrame holding the columns to encode.
        levels_by_col: Mapping of column name to the levels to encode.

    Returns:
        DataFrame of dummy columns, indexed like ``frame``.
    """
    return pd.concat(
        [
            pd.get_dummies(
                frame[col].astype(pd.CategoricalDtype(levels)),
                prefix='dummy_' + col,
            )
            for col, levels in levels_by_col.items()
        ],
        axis=1,
    )


# The level sets come from train only (sorted, which matches the column order
# get_dummies produces on the raw column). Encoding test against the same
# levels guarantees both frames get identical dummy columns even when a level
# occurs in just one of the splits.
levels_by_col = {
    col: df_train[col].astype('category').cat.categories for col in cols_cat
}

df_dummies_train = _one_hot(df_train, levels_by_col)
df_dummies_train.iloc[:2,:2]
df_dummies_test = _one_hot(df_test, levels_by_col)

# Append the dummy columns; the original categorical columns are kept.
df_train = pd.concat([df_train, df_dummies_train], axis=1)
df_test = pd.concat([df_test, df_dummies_test], axis=1)

# Sanity checks: shapes, and columns that exist in train but not in test.
df_train.shape, df_test.shape
df_train.columns.difference(df_test.columns)
df_train.to_csv('../data/processed/train_cleaned_encoded.csv')
df_test.to_csv('../data/processed/test_cleaned_encoded.csv')
!ls ../data/processed/
!du -sh ../data/processed/train_cleaned_encoded.csv