In this section I will explore the statistics of the features. Most of the statistical analysis, such as central tendency calculations, moment estimation, and correlations, is provided by the excellent pandas_profiling library,
but we still need to look for outliers using the IQR method and the KDE method.
The KDE method estimates the density of a feature nonparametrically and can flag outliers even when the distribution is bimodal, so it is useful when a feature is not unimodal.
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True,font_scale=1.5)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import os
import time
# random state
SEED = 100
time_start_notebook = time.time()
# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 50)
import scipy
from scipy import stats
import IPython
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
%%capture
# Google Colab setup (%%capture suppresses this cell's output)
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
### mount google drive
from google.colab import drive
drive.mount('/content/drive')
### load the data dir
dat_dir = 'drive/My Drive/Colab Notebooks/data/'
sys.path.append(dat_dir)
### Image dir
img_dir = 'drive/My Drive/Colab Notebooks/images/'
if not os.path.isdir(img_dir): os.makedirs(img_dir)
sys.path.append(img_dir)
### Output dir
out_dir = 'drive/My Drive/Colab Notebooks/outputs/'
if not os.path.isdir(out_dir): os.makedirs(out_dir)
sys.path.append(out_dir)
### Also install my custom module
module_dir = 'drive/My Drive/Colab Notebooks/Bhishan_Modules/'
sys.path.append(module_dir)
    # !cd does not persist between separate shell commands, so chain cd with pip install
    !cd "drive/My Drive/Colab Notebooks/Bhishan_Modules/" && pip install -e bhishan
### update pandas profiling
###profile = df_misc.profile_report(html={'style': {'full_width':True}})
###profile.to_file(out_dir + 'df_profile.html')
    ###profile.to_widgets() # not supported in Google Colab; just use profile
    !pip install -U pandas-profiling # updating requires a runtime restart
import pandas_profiling
#### print
print('Environment: Google Colaboratory.')
# NOTE: If we update modules in gcolab, we need to restart runtime.
import sklearn
import pandas_profiling
print([(x.__name__, x.__version__) for x in [sklearn, pandas_profiling]])
import bhishan
from bhishan import bp
print(bhishan.__version__)
%load_ext watermark
%watermark -a "Bhishan Poudel" -dvm
%watermark -iv
%load_ext autoreload
%autoreload 2
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999
df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/Prudential_Insurance/raw/train.csv.zip?raw=true',compression='zip')
print(df.shape)
df.head()
target = 'Response'
df_emp = df[df.columns[df.columns.str.startswith('Employment')]]
df_emp = df_emp.merge(df[target],
left_index=True,right_index=True)
print(df_emp.shape)
df_emp.head()
df_emp.bp.get_column_descriptions()
We can look at an overview of the data, such as histograms, missing values, correlations, and skewness, using the pandas_profiling module. Bear in mind that it may take a long time to produce the report, so it is a good idea to save it to an output file and generate it only once.
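One way to cache the report, shown as a minimal sketch below, is to write it to HTML once and skip regeneration when the file already exists. This assumes pandas-profiling 2.x; the file name df_emp_profile.html is only an example and is not part of the original notebook.
# Sketch: build the profile report once and cache it as HTML.
from pandas_profiling import ProfileReport
report_path = 'df_emp_profile.html'  # example path; could also live under out_dir from the Colab setup
if not os.path.exists(report_path):
    profile = ProfileReport(df_emp, explorative=True)
    profile.to_file(report_path)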
import pandas_profiling
import functools
pandas_profiling.__version__
try:
display(profile)
except NameError:
profile = df_emp.profile_report(explorative=True)
display(profile)
# profile.to_file('pandas_profile.html')
df_misc = df[['Ins_Age','Ht','Wt','BMI']]
df_misc.bp.plot_corr()
df_misc.merge(df[target],left_index=True,right_index=True).bp.plot_corr()
df.bp.get_high_correlated_features_df(thr=0.95,disp=True)
cols_high_corr = df.bp.get_high_correlated_features_list(thr=0.95,)
cols_high_corr
df.bp.partial_corr(['BMI','Response'])
We can use the normal-distribution IQR method to find suspected outliers. We assume the data is unimodal and take the 25th and 75th percentiles as q1 and q3; their difference q3 - q1 is called the inter-quartile range (IQR). Values more than 1.5*IQR below q1 or above q3 are treated as outliers.
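For illustration, here is a minimal sketch of that IQR (Tukey) rule in plain pandas. It is not the bhishan implementation used below; the helper name, the column 'Ins_Age', and the 1.5 multiplier are only taken from the description above.
# Sketch of the IQR (Tukey) rule described above (not the bhishan implementation).
def outliers_iqr_sketch(ser, k=1.5):
    q1, q3 = ser.quantile([0.25, 0.75])  # 25th and 75th percentiles
    iqr = q3 - q1                        # inter-quartile range
    lo, hi = q1 - k * iqr, q3 + k * iqr  # Tukey fences
    return ser[(ser < lo) | (ser > hi)]  # values outside the fences
outliers_iqr_sketch(df['Ins_Age'].dropna())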
If the data is multimodal we can use the KDE method for outlier detection. We first scale the data, fit the univariate KDE statsmodels.nonparametric.kde.KDEUnivariate
to it, and evaluate the estimated density at each observation.
Observations whose estimated density falls below a threshold of 5% (0.05) are treated as outliers.
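A minimal sketch of this KDE approach follows. It is not the bhishan outliers_kde implementation (which appears, commented out, further down); it assumes statsmodels is installed, reuses the MinMaxScaler imported above for the scaling step, and the 0.05 cutoff and the column 'Ins_Age' are simply taken from the description above.
from statsmodels.nonparametric.kde import KDEUnivariate
# Sketch of the KDE outlier idea described above (not the bhishan implementation).
def outliers_kde_sketch(ser, thr=0.05):
    x = ser.dropna().astype(float)
    x_scaled = MinMaxScaler().fit_transform(x.values.reshape(-1, 1)).ravel()
    kde = KDEUnivariate(x_scaled)
    kde.fit(bw="scott", fft=True)     # univariate Gaussian KDE
    dens = kde.evaluate(x_scaled)     # estimated density at each observation
    return x[np.asarray(dens) < thr]  # low-density points are suspected outliers
outliers_kde_sketch(df['Ins_Age'])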
ser_outliers = df.bp.outliers_tukey('Ins_Age')
ser_outliers
# col = 'Ins_Age'
# df1 = df.dropna(subset=[col]).reset_index(drop=True)
# idx_outliers, val_outliers = df1.bp.outliers_kde(col)
# df1.loc[idx_outliers,[col]]
df_emp_info1_outliers = df_emp.bp.plotly_boxplot('Employment_Info_1')
df['isResponse8'] = df['Response'] == 8
df.bp.compare_kde('Ins_Age','isResponse8')
df_resp8_0 = df.loc[df['isResponse8']==0]
df_resp8_1 = df.loc[df['isResponse8']==1]
target8 = "isResponse8"
col = 'Ins_Age'  # define the column here since the cell defining it above is commented out
fig, ax = plt.subplots(figsize=(24,18))
x0 = df_resp8_0[col]
x1 = df_resp8_1[col]
sns.kdeplot(x0, bw=0.3,label=f"{target8} = 0",shade=1)
sns.kdeplot(x1, bw=0.3,label=f"{target8} = 1",shade=1)
plt.xlabel(col, fontsize=18)
plt.legend(loc='upper right',fontsize=18)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()
from pandas.api.types import is_numeric_dtype
def compare_kde(df_,num,binn,figsize=(12,8),fontsize=14,
odir='images',
                ofile=None, save=True, show=False):
    """Compare the KDE of a numerical feature across the two classes of a binary target.
Parameters
-----------
df_: pandas.DataFrame
Input data.
num: str
Numerical feature.
binn: str
Binary feature.
figsize: (int,int)
Figure size.
fontsize: int
Size of x and y ticklabels.
odir: str
Name of output directory.
This directory will be created if it does not exist.
ofile: str
Base name of output image.
save: bool
Whether or not to save the image.
show: bool
Whether or not to show the image.
Examples
---------
.. code-block:: python
df = sns.load_dataset('titanic')
df.bp.compare_kde('fare','survived')
References
-----------
`stackoverflow <https://stackoverflow.com/questions/62375034/find-non-overlapping-area-between-two-kde-plots-in-python>`_
"""
df = df_[[num,binn]].dropna(how='any')
df_target_0 = df.loc[df[binn]==0]
df_target_1 = df.loc[df[binn]==1]
if not is_numeric_dtype(df[num]):
raise AttributeError(f'"{num}" must be a NUMERIC feature.')
if df[binn].nunique() != 2:
        raise AttributeError(f'"{binn}" must be a BINARY feature.')
    x0 = df_target_0[num]
    x1 = df_target_1[num]
kde0 = stats.gaussian_kde(x0, bw_method=0.3)
kde1 = stats.gaussian_kde(x1, bw_method=0.3)
xmin = min(x0.min(), x1.min())
    xmax = max(x0.max(), x1.max())
dx = 0.2 * (xmax - xmin) # add a 20% margin,
# as the kde is wider than the data
xmin -= dx
xmax += dx
x = np.linspace(xmin, xmax, 500)
kde0_x = kde0(x)
kde1_x = kde1(x)
inters_x = np.minimum(kde0_x, kde1_x)
    plt.figure(figsize=figsize)
    plt.plot(x, kde0_x, color='b', label='0')
plt.fill_between(x, kde0_x, 0, color='b', alpha=0.2)
plt.plot(x, kde1_x, color='orange', label='1')
plt.fill_between(x, kde1_x, 0, color='orange', alpha=0.2)
plt.plot(x, inters_x, color='r')
plt.fill_between(x, inters_x, 0, facecolor='none',
edgecolor='r', hatch='xx', label='intersection')
area_inters_x = np.trapz(inters_x, x)
handles, labels = plt.gca().get_legend_handles_labels()
labels[2] += f': {area_inters_x * 100:.1f} %'
plt.legend(handles, labels, title=binn)
    plt.title(f'{num} vs {binn}')
    plt.xlabel(num, fontsize=fontsize)
    plt.tick_params(axis='both', which='major', labelsize=fontsize)
    plt.tight_layout()
if ofile:
# make sure this is base name
assert ofile == os.path.basename(ofile)
if not os.path.isdir(odir): os.makedirs(odir)
ofile = os.path.join(odir,ofile)
else:
if not os.path.isdir(odir): os.makedirs(odir)
ofile = os.path.join(odir,f'compare_kde_{num}_vs_{binn}.png')
if save: plt.savefig(ofile,dpi=300)
if show: plt.show(); plt.close()
compare_kde(df,'Ins_Age','isResponse8')