This is an example tutorial showing how to use my module `bhishan`, which provides the plotly extension for pandas.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%load_ext autoreload
%load_ext watermark
%autoreload 2
%watermark -a "Bhishan Poudel" -d -v -m
%watermark -iv
Bhishan Poudel 2021-07-11 CPython 3.7.7 IPython 7.22.0 compiler : Clang 4.0.1 (tags/RELEASE_401/final) system : Darwin release : 19.6.0 machine : x86_64 processor : i386 CPU cores : 4 interpreter: 64bit seaborn 0.11.0 numpy 1.19.5 pandas 1.2.4
# Make the author's local modules importable, then load the `bp` module from
# the `bhishan` package (the DataFrame `.bp` accessor is used after this import).
# NOTE: these are machine-specific absolute paths; adjust them on other systems.
import sys
sys.path.extend([
    "/Users/poudel/Dropbox/a00_Bhishan_Modules/",
    "/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan",
])
from bhishan import bp
# Uncomment to list the example datasets seaborn can load:
# print(sns.get_dataset_names())
# Load the Titanic example dataset into a DataFrame
# (seaborn may need to download it on first use — see the seaborn docs).
df = sns.load_dataset('titanic')
# Preview the first rows to see the available columns.
df.head()
survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
# Correlation plot through the `bp` accessor (available after `from bhishan import bp`).
# NOTE(review): xrot presumably sets the x tick-label rotation — confirm in bhishan.
df.bp.plot_corr(xrot=0)
# Same plot with a target column given.
# NOTE(review): presumably focuses on correlations with 'survived' — confirm in bhishan.
df.bp.plot_corr(target='survived')
# Correlation plot for a hand-picked subset of features plus the target column.
target = 'survived'
df_few_cols = df[['age','fare']]
# df_few_cols and df[target] share df's index, so merging index-on-index
# attaches the target column to the selected features row by row.
(
    df_few_cols
    .merge(df[target], left_index=True, right_index=True)
    .bp.plot_corr()
)
# Show the correlation matrix as a table; in the output below the values
# are rounded to two decimals.
df.bp.plot_corr_style()
survived | pclass | age | sibsp | parch | fare | adult_male | alone | |
---|---|---|---|---|---|---|---|---|
survived | 1.00 | -0.34 | -0.08 | -0.04 | 0.08 | 0.26 | -0.56 | -0.20 |
pclass | -0.34 | 1.00 | -0.37 | 0.08 | 0.02 | -0.55 | 0.09 | 0.14 |
age | -0.08 | -0.37 | 1.00 | -0.31 | -0.19 | 0.10 | 0.28 | 0.20 |
sibsp | -0.04 | 0.08 | -0.31 | 1.00 | 0.41 | 0.16 | -0.25 | -0.58 |
parch | 0.08 | 0.02 | -0.19 | 0.41 | 1.00 | 0.22 | -0.35 | -0.58 |
fare | 0.26 | -0.55 | 0.10 | 0.16 | 0.22 | 1.00 | -0.18 | -0.27 |
adult_male | -0.56 | 0.09 | 0.28 | -0.25 | -0.35 | -0.18 | 1.00 | 0.40 |
alone | -0.20 | 0.14 | 0.20 | -0.58 | -0.58 | -0.27 | 0.40 | 1.00 |
# df.bp.plot_corr_sns()
# Highlight strong off-diagonal correlations (|r| > 0.5) with a salmon background;
# `v != 1` leaves the diagonal (self-correlation) unstyled.
# Numeric and boolean columns are selected explicitly because pandas >= 2.0 no
# longer drops non-numeric columns inside DataFrame.corr(); string columns such
# as 'sex' would raise there. On older pandas this picks the same columns that
# corr() used implicitly, so the resulting table is unchanged.
df.select_dtypes(include=['number', 'bool']).corr().style.apply(
    lambda row: [
        "background: salmon" if (abs(v) > 0.5 and v != 1) else ""
        for v in row
    ],
    axis=1,
)
survived | pclass | age | sibsp | parch | fare | adult_male | alone | |
---|---|---|---|---|---|---|---|---|
survived | 1.000000 | -0.338481 | -0.077221 | -0.035322 | 0.081629 | 0.257307 | -0.557080 | -0.203367 |
pclass | -0.338481 | 1.000000 | -0.369226 | 0.083081 | 0.018443 | -0.549500 | 0.094035 | 0.135207 |
age | -0.077221 | -0.369226 | 1.000000 | -0.308247 | -0.189119 | 0.096067 | 0.280328 | 0.198270 |
sibsp | -0.035322 | 0.083081 | -0.308247 | 1.000000 | 0.414838 | 0.159651 | -0.253586 | -0.584471 |
parch | 0.081629 | 0.018443 | -0.189119 | 0.414838 | 1.000000 | 0.216225 | -0.349943 | -0.583398 |
fare | 0.257307 | -0.549500 | 0.096067 | 0.159651 | 0.216225 | 1.000000 | -0.182024 | -0.271832 |
adult_male | -0.557080 | 0.094035 | 0.280328 | -0.253586 | -0.349943 | -0.182024 | 1.000000 | 0.404744 |
alone | -0.203367 | 0.135207 | 0.198270 | -0.584471 | -0.583398 | -0.271832 | 0.404744 | 1.000000 |
# Report highly correlated feature pairs. With thr=0.5 the output below lists
# exactly the pairs whose absolute correlation exceeds 0.5, plus suggested
# column lists (cols_high_corr, cols_high_corr_drop, ...).
# NOTE(review): disp=True presumably controls whether the pair table is displayed — confirm in bhishan.
df.bp.corr_high(thr=0.5,disp=True)
cols_high_corr = ['pclass', 'adult_male', 'sibsp', 'fare', 'survived', 'alone', 'parch'] cols_high_corr1 = ['fare', 'survived', 'parch', 'alone'] cols_high_corr2 = ['pclass', 'adult_male', 'alone', 'sibsp'] cols_high_corr_drop = ['fare', 'parch', 'survived']
feature1 | feature2 | corr | |
---|---|---|---|
0 | fare | pclass | -0.549500 |
1 | survived | adult_male | -0.557080 |
2 | parch | alone | -0.583398 |
3 | alone | sibsp | -0.584471 |
# Columns suggested for removal by the high-correlation report
# (copied from the `cols_high_corr_drop` line of its printed output).
cols_high_corr_drop = ['fare', 'parch', 'survived']
df1 = df.drop(columns=cols_high_corr_drop)
# Re-check the correlations after dropping: no remaining pair exceeds |r| = 0.6.
# df1 still contains string/categorical columns (e.g. 'sex', 'embarked'), so
# select numeric and boolean columns explicitly; pandas >= 2.0 raises on
# non-numeric data in corr(), while older versions dropped it silently.
df1.select_dtypes(include=['number', 'bool']).corr()
pclass | age | sibsp | adult_male | alone | |
---|---|---|---|---|---|
pclass | 1.000000 | -0.369226 | 0.083081 | 0.094035 | 0.135207 |
age | -0.369226 | 1.000000 | -0.308247 | 0.280328 | 0.198270 |
sibsp | 0.083081 | -0.308247 | 1.000000 | -0.253586 | -0.584471 |
adult_male | 0.094035 | 0.280328 | -0.253586 | 1.000000 | 0.404744 |
alone | 0.135207 | 0.198270 | -0.584471 | 0.404744 | 1.000000 |
# Partial-correlation matrix; the output below covers the numeric columns
# survived, pclass, age, sibsp, parch and fare.
# NOTE(review): the exact role of thr=0.3 and disp=True is not visible here
# (possibly a highlight threshold / display switch) — confirm in bhishan.
df.bp.partial_corr(thr=0.3,disp=True)
survived | pclass | age | sibsp | parch | fare | |
---|---|---|---|---|---|---|
survived | 1.000000 | 0.115371 | 0.111542 | -0.028674 | 0.082447 | 0.301210 |
pclass | 0.115371 | 1.000000 | 0.571910 | 0.306844 | 0.173285 | -0.315652 |
age | 0.111542 | 0.571910 | 1.000000 | -0.192800 | -0.095105 | 0.392652 |
sibsp | -0.028674 | 0.306844 | -0.192800 | 1.000000 | 0.324787 | 0.211403 |
parch | 0.082447 | 0.173285 | -0.095105 | 0.324787 | 1.000000 | 0.199136 |
fare | 0.301210 | -0.315652 | 0.392652 | 0.211403 | 0.199136 | 1.000000 |
# Partial correlation restricted to the listed columns. With only two columns
# there is nothing left to control for: the result below (0.096067) equals the
# plain fare/age correlation shown in the earlier correlation table.
df.bp.partial_corr(['fare','age'])
fare | age | |
---|---|---|
fare | 1.000000 | 0.096067 |
age | 0.096067 | 1.000000 |
# Flag outliers in the 'age' column with the accessor's Tukey-based helper.
# NOTE(review): presumably the standard 1.5*IQR fences — confirm in bhishan.
ser_outliers = df.bp.outliers_tukey('age')
# Display the flagged ages; the row labels are taken from df's own index.
ser_outliers
age | |
---|---|
33 | 66.0 |
54 | 65.0 |
96 | 71.0 |
116 | 70.5 |
280 | 65.0 |
456 | 65.0 |
493 | 71.0 |
630 | 80.0 |
672 | 70.0 |
745 | 70.0 |
851 | 74.0 |
# KDE-based outlier detection on a single column.
col = 'age'
# Remove rows with a missing value in `col`, then renumber the rows from 0.
# Because of this renumbering, the row labels printed below differ from the
# ones in the Tukey output above (e.g. age 80.0 is row 498 here, 630 in df).
# Note: this rebinds df1, which earlier held the frame with dropped columns.
df1 = df.dropna(subset=[col])
df1 = df1.reset_index(drop=True)
# The helper returns the outlier row labels together with their values.
idx_outliers, val_outliers = df1.bp.outliers_kde(col)
# Look the flagged rows up in df1, keeping only the inspected column.
df1.loc[idx_outliers, [col]]
age | |
---|---|
498 | 80.0 |
679 | 74.0 |
74 | 71.0 |
393 | 71.0 |
91 | 70.5 |
531 | 70.0 |
592 | 70.0 |
25 | 66.0 |
366 | 65.0 |
225 | 65.0 |
40 | 65.0 |
351 | 64.0 |
433 | 64.0 |
385 | 63.0 |
221 | 63.0 |
# Compare the distribution of 'age' against the 'survived' column.
# NOTE(review): presumably draws KDE curves of 'age' per class of the binary
# 'survived' column — no output is visible here; confirm in bhishan.
df.bp.compare_kde_binn('age','survived')