Description

In this section I will explorte the statistics of the features. Most of the statistical analysis such as central tendencies calculations, Moment estimations and correlations are provided by excellent library pandas_profiling but still we need to look for outliers using IQR method and KDE method.

The KDE method uses nonparametric way to estimate the outliers. It captures the outliers even in cases of bimodal distributions. So if a feature is not unimodal this method can be useful.

Imports

In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True,font_scale=1.5)

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import os
import time

# random state
SEED = 100
time_start_notebook = time.time()

# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 50)

import scipy
from scipy import stats
import IPython
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
In [8]:
# Google colab
In [9]:
%%capture
# capture will not print in notebook

import os
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    ### mount google drive
    from google.colab import drive
    drive.mount('/content/drive')

    ### load the data dir
    dat_dir = 'drive/My Drive/Colab Notebooks/data/'
    sys.path.append(dat_dir)

    ### Image dir
    img_dir = 'drive/My Drive/Colab Notebooks/images/'
    if not os.path.isdir(img_dir): os.makedirs(img_dir)
    sys.path.append(img_dir)

    ### Output dir
    out_dir = 'drive/My Drive/Colab Notebooks/outputs/'
    if not os.path.isdir(out_dir): os.makedirs(out_dir)
    sys.path.append(out_dir)

    ### Also install my custom module
    module_dir = 'drive/My Drive/Colab Notebooks/Bhishan_Modules/' 
    sys.path.append(module_dir)
    !cd drive/My Drive/Colab Notebooks/Bhishan_Modules/
    !pip install -e bhishan
    !cd -

    ### update pandas profiling
    ###profile = df_misc.profile_report(html={'style': {'full_width':True}})
    ###profile.to_file(out_dir + 'df_profile.html')
    ###profile.to_widgets() # not supported in Gcolab just use profile
    !pip install -U pandas-profiling # we need restart
    import pandas_profiling

    #### print
    print('Environment: Google Colaboratory.')

# NOTE: If we update modules in gcolab, we need to restart runtime.
In [10]:
import sklearn
import pandas_profiling

print([(x.__name__, x.__version__) for x in [sklearn, pandas_profiling]])
[('sklearn', '0.23.1'), ('pandas_profiling', '2.8.0')]
In [11]:
import bhishan
from bhishan import bp

print(bhishan.__version__)
0.3.1
In [12]:
%load_ext watermark
%watermark -a "Bhishan Poudel" -dvm
%watermark -iv
The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Bhishan Poudel 2020-06-22 

CPython 3.7.7
IPython 7.13.0

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.5.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit
scipy            1.4.1
autopep8         1.5.2
seaborn          0.10.1
pandas           1.0.3
pandas_profiling 2.8.0
bhishan          0.3.1
json             2.0.9
sklearn          0.23.1
matplotlib       3.2.1
numpy            1.18.4
IPython          7.13.0

In [13]:
%load_ext autoreload
In [14]:
%autoreload 2
In [15]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999

Load the data

In [16]:
df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/Prudential_Insurance/raw/train.csv.zip?raw=true',compression='zip')
print(df.shape)
df.head()
(59381, 128)
Out[16]:
Id Product_Info_1 Product_Info_2 Product_Info_3 Product_Info_4 Product_Info_5 Product_Info_6 Product_Info_7 Ins_Age Ht Wt BMI Employment_Info_1 Employment_Info_2 Employment_Info_3 Employment_Info_4 Employment_Info_5 Employment_Info_6 InsuredInfo_1 InsuredInfo_2 InsuredInfo_3 InsuredInfo_4 InsuredInfo_5 InsuredInfo_6 InsuredInfo_7 Insurance_History_1 Insurance_History_2 Insurance_History_3 Insurance_History_4 Insurance_History_5 Insurance_History_7 Insurance_History_8 Insurance_History_9 Family_Hist_1 Family_Hist_2 Family_Hist_3 Family_Hist_4 Family_Hist_5 Medical_History_1 Medical_History_2 Medical_History_3 Medical_History_4 Medical_History_5 Medical_History_6 Medical_History_7 Medical_History_8 Medical_History_9 Medical_History_10 Medical_History_11 Medical_History_12 Medical_History_13 Medical_History_14 Medical_History_15 Medical_History_16 Medical_History_17 Medical_History_18 Medical_History_19 Medical_History_20 Medical_History_21 Medical_History_22 Medical_History_23 Medical_History_24 Medical_History_25 Medical_History_26 Medical_History_27 Medical_History_28 Medical_History_29 Medical_History_30 Medical_History_31 Medical_History_32 Medical_History_33 Medical_History_34 Medical_History_35 Medical_History_36 Medical_History_37 Medical_History_38 Medical_History_39 Medical_History_40 Medical_History_41 Medical_Keyword_1 Medical_Keyword_2 Medical_Keyword_3 Medical_Keyword_4 Medical_Keyword_5 Medical_Keyword_6 Medical_Keyword_7 Medical_Keyword_8 Medical_Keyword_9 Medical_Keyword_10 Medical_Keyword_11 Medical_Keyword_12 Medical_Keyword_13 Medical_Keyword_14 Medical_Keyword_15 Medical_Keyword_16 Medical_Keyword_17 Medical_Keyword_18 Medical_Keyword_19 Medical_Keyword_20 Medical_Keyword_21 Medical_Keyword_22 Medical_Keyword_23 Medical_Keyword_24 Medical_Keyword_25 Medical_Keyword_26 Medical_Keyword_27 Medical_Keyword_28 Medical_Keyword_29 Medical_Keyword_30 Medical_Keyword_31 Medical_Keyword_32 Medical_Keyword_33 Medical_Keyword_34 Medical_Keyword_35 Medical_Keyword_36 Medical_Keyword_37 Medical_Keyword_38 Medical_Keyword_39 Medical_Keyword_40 Medical_Keyword_41 Medical_Keyword_42 Medical_Keyword_43 Medical_Keyword_44 Medical_Keyword_45 Medical_Keyword_46 Medical_Keyword_47 Medical_Keyword_48 Response
0 2 1 D3 10 0.076923 2 1 1 0.641791 0.581818 0.148536 0.323008 0.028 12 1 0.0 3 NaN 1 2 6 3 1 2 1 1 1 3 1 0.000667 1 1 2 2 NaN 0.598039 NaN 0.526786 4.0 112 2 1 1 3 2 2 1 NaN 3 2 3 3 240.0 3 3 1 1 2 1 2 3 NaN 1 3 3 1 3 2 3 NaN 1 3 1 2 2 1 3 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8
1 5 1 A1 26 0.076923 2 3 1 0.059701 0.600000 0.131799 0.272288 0.000 1 3 0.0 2 0.0018 1 2 6 3 1 2 1 2 1 3 1 0.000133 1 3 2 2 0.188406 NaN 0.084507 NaN 5.0 412 2 1 1 3 2 2 1 NaN 3 2 3 3 0.0 1 3 1 1 2 1 2 3 NaN 1 3 3 1 3 2 3 NaN 3 1 1 2 2 1 3 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4
2 6 1 E1 26 0.076923 2 3 1 0.029851 0.745455 0.288703 0.428780 0.030 9 1 0.0 2 0.0300 1 2 8 3 1 1 1 2 1 1 3 NaN 3 2 3 3 0.304348 NaN 0.225352 NaN 10.0 3 2 2 1 3 2 2 2 NaN 3 2 3 3 NaN 1 3 1 1 2 1 2 3 NaN 2 2 3 1 3 2 3 NaN 3 3 1 3 2 1 3 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8
3 7 1 D4 10 0.487179 2 3 1 0.164179 0.672727 0.205021 0.352438 0.042 9 1 0.0 3 0.2000 2 2 8 3 1 2 1 2 1 1 3 NaN 3 2 3 3 0.420290 NaN 0.352113 NaN 0.0 350 2 2 1 3 2 2 2 NaN 3 2 3 3 NaN 1 3 1 1 2 2 2 3 NaN 1 3 3 1 3 2 3 NaN 3 3 1 2 2 1 3 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8
4 8 1 D2 26 0.230769 2 3 1 0.417910 0.654545 0.234310 0.424046 0.027 9 1 0.0 2 0.0500 1 2 6 3 1 2 1 2 1 1 3 NaN 3 2 3 2 0.463768 NaN 0.408451 NaN NaN 162 2 2 1 3 2 2 2 NaN 3 2 3 3 NaN 1 3 1 1 2 1 2 3 NaN 2 2 3 1 3 2 3 NaN 3 3 1 3 2 1 3 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8
In [17]:
target = 'Response'

Employment Features

In [18]:
df_emp = df[df.columns[df.columns.str.startswith('Employment')]]
df_emp = df_emp.merge(df[target],
                                      left_index=True,right_index=True)
print(df_emp.shape)
df_emp.head()
(59381, 7)
Out[18]:
Employment_Info_1 Employment_Info_2 Employment_Info_3 Employment_Info_4 Employment_Info_5 Employment_Info_6 Response
0 0.028 12 1 0.0 3 NaN 8
1 0.000 1 3 0.0 2 0.0018 4
2 0.030 9 1 0.0 2 0.0300 8
3 0.042 9 1 0.0 3 0.2000 8
4 0.027 9 1 0.0 2 0.0500 8
In [19]:
df_emp.bp.get_column_descriptions()
Out[19]:
Feature Type Count Unique Missing MissingPct Zeros ZerosPct count mean std min 25% 50% 75% max
5 Employment_Info_6 float64 59381 992 10854 18.280000 4042 6.810000 48527.000000 0.361469 0.349551 0.000000 0.060000 0.250000 0.550000 1.000000
3 Employment_Info_4 float64 59381 871 6779 11.420000 44659 75.210000 52602.000000 0.006283 0.032816 0.000000 0.000000 0.000000 0.000000 1.000000
0 Employment_Info_1 float64 59381 1936 19 0.030000 3688 6.210000 59362.000000 0.077582 0.082347 0.000000 0.035000 0.060000 0.100000 1.000000
1 Employment_Info_2 int64 59381 36 0 0.000000 0 0.000000 59381.000000 8.641821 4.227082 1.000000 9.000000 9.000000 9.000000 38.000000
2 Employment_Info_3 int64 59381 2 0 0.000000 0 0.000000 59381.000000 1.300904 0.715034 1.000000 1.000000 1.000000 1.000000 3.000000
4 Employment_Info_5 int64 59381 2 0 0.000000 0 0.000000 59381.000000 2.142958 0.350033 2.000000 2.000000 2.000000 2.000000 3.000000
6 Response int64 59381 8 0 0.000000 0 0.000000 59381.000000 5.636837 2.456833 1.000000 4.000000 6.000000 8.000000 8.000000

Dataframe Statistics Profiles

We can look at the overview of the data such as histogram, missing values, correlation, skewness using pandas_profiling module. Bear in mind that it may take long time to produce results. So saving them in a output file and only producing them once is a good way to go.

In [20]:
import pandas_profiling
import functools

pandas_profiling.__version__
Out[20]:
'2.8.0'
In [21]:
try:
    display(profile)
except NameError:
    profile = df_emp.profile_report(explorative=True)
    display(profile)




In [22]:
# profile.to_file('pandas_profile.html')

Correlations

In [32]:
df_misc = df[['Ins_Age','Ht','Wt','BMI']]
df_misc.bp.plot_corr()
In [33]:
df_misc.merge(df[target],left_index=True,right_index=True).bp.plot_corr()
In [34]:
df.bp.get_high_correlated_features_df(thr=0.95,disp=True)
Out[34]:
feature1 feature2 corr
0 Insurance_History_9 Insurance_History_7 0.962528
1 Medical_History_36 Medical_History_25 0.954110
2 Medical_History_37 Medical_Keyword_11 -0.950069
3 Medical_History_26 Medical_History_36 -0.965349
4 Insurance_History_7 Insurance_History_3 -0.974910
5 Insurance_History_9 Insurance_History_3 -0.982598
6 Medical_History_25 Medical_History_26 -0.987910
7 Medical_Keyword_23 Medical_History_33 -0.993030
8 Medical_Keyword_48 Medical_History_6 -0.993101
In [35]:
cols_high_corr = df.bp.get_high_correlated_features_list(thr=0.95,)
cols_high_corr
Out[35]:
{'Insurance_History_7',
 'Insurance_History_9',
 'Medical_History_26',
 'Medical_History_36',
 'Medical_Keyword_11',
 'Medical_Keyword_23',
 'Medical_Keyword_48'}

Partial Correlation

In [39]:
df.bp.partial_corr(['BMI','Response'])
Out[39]:
BMI Response
BMI 1.000000 -0.381601
Response -0.381601 1.000000

Outliers

We can use normal distribution IQR method to get the suspected outliers. We assume data is unimodal and get 25% and 75% percentile range as q1 and q3. Then the difference of q3 and q1 is called inter-quartile range IQR. We treat values 1.5*IQR below q1 or above q3 as outliers.

If the data is multimodal we can use KDE method outlier detection. We first scale the data and fit the univariate kde distribution statsmodels.nonparametric.kde.KDEUnivariate to the data and get the predictions. If the predictions probability is less than threshold 5% (0.05) we treat them as outliers.

In [40]:
ser_outliers = df.bp.outliers_tukey('Ins_Age')
ser_outliers
Out[40]:
Ins_Age
In [ ]:
# col = 'Ins_Age'
# df1 = df.dropna(subset=[col]).reset_index(drop=True)
# idx_outliers, val_outliers = df1.bp.outliers_kde(col)
# df1.loc[idx_outliers,[col]]
In [24]:
df_emp_info1_outliers = df_emp.bp.plotly_boxplot('Employment_Info_1')
None

Kernel Density

In [63]:
df['isResponse8'] = df['Response'] == 8
df.bp.compare_kde('Ins_Age','isResponse8')
/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan/pandas_api.py:33: UserWarning:

registration of accessor <class 'bhishan.pandas_api.BPAccessor'> under name 'bp' for type <class 'pandas.core.frame.DataFrame'> is overriding a preexistingattribute with the same name.

Out[63]:
'Currently not available.'
In [60]:
df_resp8_0 = df.loc[df['isResponse8']==0]
df_resp8_1 = df.loc[df['isResponse8']==1]
target8 = "isResponse8"

plt.figure()
fig, axes = plt.subplots(figsize=(24,18))

x0 = df_resp8_0[col]
x1 = df_resp8_1[col]

sns.kdeplot(x0, bw=0.3,label=f"{target8} = 0",shade=1)
sns.kdeplot(x1, bw=0.3,label=f"{target8} = 1",shade=1)

plt.xlabel(col, fontsize=18)
plt.legend(loc='upper right',fontsize=18)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()
<Figure size 432x288 with 0 Axes>
In [59]:
from pandas.api.types import is_numeric_dtype

def compare_kde(df_,num,binn,figsize=(12,8),fontsize=14,
                odir='images',
                ofile=None,save=True,show=False):
    """Compare the KDE plots of two numerical features against binary target.

    Parameters
    -----------
    df_: pandas.DataFrame
        Input data.
    num: str
        Numerical feature.
    binn: str
        Binary feature.
    figsize: (int,int)
        Figure size.
    fontsize: int
        Size of x and y ticklabels.
    odir: str
        Name of output directory.
        This directory will be created if it does not exist.
    ofile: str
        Base name of output image.
    save: bool
        Whether or not to save the image.
    show: bool
        Whether or not to show the image.

    Examples
    ---------
    .. code-block:: python

        df = sns.load_dataset('titanic')
        df.bp.compare_kde('fare','survived')

    References
    -----------

    `stackoverflow <https://stackoverflow.com/questions/62375034/find-non-overlapping-area-between-two-kde-plots-in-python>`_
    """
    df = df_[[num,binn]].dropna(how='any')

    df_target_0 = df.loc[df[binn]==0]
    df_target_1 = df.loc[df[binn]==1]

    if not is_numeric_dtype(df[num]):
        raise AttributeError(f'"{num}" must be a NUMERIC feature.')

    if df[binn].nunique() != 2:
        raise AttributeError(f'"{binn}" must be BINARY feature.')

    x0 = df_target_0[col]
    x1 = df_target_1[col]

    kde0 = stats.gaussian_kde(x0, bw_method=0.3)
    kde1 = stats.gaussian_kde(x1, bw_method=0.3)

    xmin = min(x0.min(), x1.min())
    xmax = min(x0.max(), x1.max())
    dx = 0.2 * (xmax - xmin)    # add a 20% margin,
                                # as the kde is wider than the data
    xmin -= dx
    xmax += dx

    x = np.linspace(xmin, xmax, 500)
    kde0_x = kde0(x)
    kde1_x = kde1(x)
    inters_x = np.minimum(kde0_x, kde1_x)

    plt.plot(x, kde0_x, color='b', label='0')
    plt.fill_between(x, kde0_x, 0, color='b', alpha=0.2)
    plt.plot(x, kde1_x, color='orange', label='1')
    plt.fill_between(x, kde1_x, 0, color='orange', alpha=0.2)
    plt.plot(x, inters_x, color='r')
    plt.fill_between(x, inters_x, 0, facecolor='none',
                    edgecolor='r', hatch='xx', label='intersection')

    area_inters_x = np.trapz(inters_x, x)

    handles, labels = plt.gca().get_legend_handles_labels()
    labels[2] += f': {area_inters_x * 100:.1f} %'
    plt.legend(handles, labels, title=binn)
    plt.title(f'{num} vs {binn}')
    plt.tight_layout()

    if ofile:
        # make sure this is base name
        assert ofile == os.path.basename(ofile)
        if not os.path.isdir(odir): os.makedirs(odir)
        ofile = os.path.join(odir,ofile)
    else:
        if not os.path.isdir(odir): os.makedirs(odir)
        ofile = os.path.join(odir,f'compare_kde_{num}_vs_{binn}.png')

    if save: plt.savefig(ofile,dpi=300)
    if show: plt.show(); plt.close()
        
compare_kde(df,'Ins_Age','isResponse8')
In [ ]: