The Allstate Corporation is an insurance company based in the United States. It also has personal-lines insurance operations in Canada.
Data source (Kaggle): https://www.kaggle.com/c/allstate-claims-severity/data
import numpy as np
import pandas as pd
import seaborn as sns
pd.plotting.register_matplotlib_converters()
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Apply the "ggplot" matplotlib style sheet to the figures created below.
plt.style.use('ggplot')

# Seed NumPy's global random state with a fixed value.
SEED = 100
np.random.seed(SEED)

# (name, version) pairs for the core libraries in use.
[(lib.__name__, lib.__version__) for lib in (np, pd, sns)]
%load_ext autoreload
%autoreload 2
from bhishan import bp
# Read the raw train and test CSV files into DataFrames.
df_train_raw, df_test_raw = map(
    pd.read_csv,
    ('../data/raw/train.csv', '../data/raw/test.csv'),
)

# Quick sanity check: dimensions and the first two rows of the training data.
print(df_train_raw.shape)
df_train_raw.head(2)
# Columns whose name starts with "cat".
cols_cat = [col for col in df_train_raw.columns if col.startswith('cat')]
len(cols_cat)

# Plot-grid layout: n_cols panels per row.
n_cols = 4
# Derive the row count from n_cols instead of repeating the literal 4, so the
# two values cannot drift apart. Floor division: only full rows are counted.
n_rows = len(cols_cat) // n_cols
%%javascript
// Set the notebook output area's auto-scroll threshold to 9999.
// NOTE(review): presumably this keeps the long plot outputs in this notebook
// from being collapsed into a scrollable box (classic Notebook front end) — confirm.
IPython.OutputArea.auto_scroll_threshold = 9999;
# Count plots for the columns in cols_cat, n_cols panels per figure.
# Walking cols_cat in chunks of n_cols (instead of over a precomputed number
# of full rows) also draws a trailing partial row when len(cols_cat) is not a
# multiple of n_cols.
for start in range(0, len(cols_cat), n_cols):
    row_cols = cols_cat[start:start + n_cols]
    # squeeze=False always yields a 2-D array of axes, so indexing works even
    # when n_cols == 1.
    fig, ax = plt.subplots(nrows=1, ncols=n_cols,
                           sharey=True,
                           squeeze=False,
                           figsize=(12, 8))
    for col, axis in zip(row_cols, ax[0]):
        sns.countplot(x=col,
                      data=df_train_raw,
                      ax=axis)
    # Hide panels left unused by a partial final row.
    for axis in ax[0][len(row_cols):]:
        axis.set_visible(False)
    # Render each figure as soon as it is complete rather than keeping every
    # figure of the loop open until the end.
    plt.show()
# Pairwise correlation matrix of the numeric (and boolean) columns.
# The columns are selected explicitly because pandas >= 2.0 raises in corr()
# on non-numeric columns instead of silently dropping them.
# NOTE(review): df_train_raw presumably holds non-numeric "cat*" columns — confirm.
df_corr = df_train_raw.select_dtypes(include=['number', 'bool']).corr()
df_corr.head()
def get_most_correlated_features(df,threshold=0.5):
    """Get the most correlated feature pairs above the given threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Only numeric (and boolean) columns take part; every
        other column is ignored.
    threshold : float, default 0.5
        Keep only pairs whose absolute correlation is strictly greater
        than this value.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature1``, ``feature2`` and ``corr``; one row per
        unordered pair of distinct features, sorted by ``corr`` in
        descending order with a fresh 0..n-1 index. Empty (but with the
        same three columns) when no pair qualifies.

    Note:
    1. Only numerical features have correlation.
    2. Here we only get absolute correlation.
    """
    # Select the numeric columns explicitly: pandas >= 2.0 raises in corr()
    # on non-numeric columns instead of silently skipping them.
    numeric = df.select_dtypes(include=['number', 'bool'])
    corr = numeric.corr().abs()

    # The strict upper triangle of the symmetric matrix holds every unordered
    # pair exactly once and excludes self-pairs. This replaces a joined-string
    # de-duplication key, which collided for names containing underscores
    # (e.g. ('a_b', 'c') and ('a', 'b_c') both mapped to 'a_b_c').
    labels = np.asarray(corr.columns)
    rows, cols = np.triu_indices(len(labels), k=1)
    pairs = pd.DataFrame({
        'feature1': labels[rows],
        'feature2': labels[cols],
        'corr': corr.values[rows, cols],
    })

    # NaN correlations compare as False here and are therefore dropped.
    pairs = pairs[pairs['corr'] > threshold]

    # Stable sort keeps tied pairs in a deterministic (column) order.
    return (pairs
            .sort_values('corr', ascending=False, kind='mergesort')
            .reset_index(drop=True))
# Feature pairs whose absolute correlation exceeds 0.5, with the "corr"
# column shaded by value.
df_corr_most = get_most_correlated_features(df_train_raw, threshold=0.5)
df_corr_most.style.background_gradient(subset=['corr'])

# One plot per correlated pair: feature1 on the x-axis, feature2 on the y-axis.
for c1, c2 in zip(df_corr_most['feature1'],
                  df_corr_most['feature2']):
    # `height` is the current name of the deprecated `size` argument.
    sns.pairplot(df_train_raw, height=6,
                 x_vars=[c1], y_vars=[c2])
    # Alternative: sns.regplot(data=df_train_raw, x=c1, y=c2)
    plt.show()
# Skewness of each numeric (and boolean) column, in ascending order.
# The columns are selected explicitly because pandas >= 2.0 raises in skew()
# on non-numeric columns instead of silently dropping them.
df_train_raw.select_dtypes(include=['number', 'bool']).skew().sort_values()
# Columns whose name starts with "cont".
cols_cont = [name for name in df_train_raw.columns
             if name.startswith('cont')]
print(len(cols_cont), cols_cont)

# Plot-grid layout: n_cols panels per row; n_rows counts the full rows only.
n_cols = 2
n_rows = len(cols_cont) // n_cols
# Violin plots for the columns in cols_cont, n_cols panels per figure.
# Walking cols_cont in chunks of n_cols (instead of over a precomputed number
# of full rows) also draws a trailing partial row when len(cols_cont) is not a
# multiple of n_cols.
for start in range(0, len(cols_cont), n_cols):
    row_cols = cols_cont[start:start + n_cols]
    # squeeze=False always yields a 2-D array of axes, so indexing works even
    # when n_cols == 1.
    fig, ax = plt.subplots(nrows=1, ncols=n_cols,
                           squeeze=False,
                           figsize=(12, 8))
    for col, axis in zip(row_cols, ax[0]):
        sns.violinplot(y=col,
                       data=df_train_raw,
                       ax=axis)
    # Hide panels left unused by a partial final row.
    for axis in ax[0][len(row_cols):]:
        axis.set_visible(False)
    # Render each figure as soon as it is complete rather than keeping every
    # figure of the loop open until the end.
    plt.show()