This is an example tutorial showing how to use my module `bhishan` for the plotly extension for pandas.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# IPython extensions: autoreload (re-import edited modules) and watermark
# (print environment information for reproducibility).
%load_ext autoreload
%load_ext watermark
# Mode 2: reload all modules before each cell is executed.
%autoreload 2
# Author, date, Python/IPython versions and machine details (see output below).
%watermark -a "Bhishan Poudel" -d -v -m
# Versions of the packages imported so far (seaborn, pandas, numpy).
%watermark -iv
Bhishan Poudel 2021-07-11 CPython 3.7.7 IPython 7.22.0 compiler : Clang 4.0.1 (tags/RELEASE_401/final) system : Darwin release : 19.6.0 machine : x86_64 processor : i386 CPU cores : 4 interpreter: 64bit seaborn 0.11.0 pandas 1.2.4 numpy 1.19.5
# Local library: put the author's module directories on the import path.
# The paths are machine-specific and are added in the order listed.
import sys
sys.path.extend((
    "/Users/poudel/Dropbox/a00_Bhishan_Modules/",
    "/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan",
))
# NOTE(review): importing bp presumably registers the `df.bp` accessor that
# the later cells call — confirm in the bhishan package.
from bhishan import bp
# Uncomment to list the dataset names seaborn can load:
# print(sns.get_dataset_names())
# Load seaborn's 'titanic' example dataset into a DataFrame.
df = sns.load_dataset('titanic')
# Preview the first five rows.
df.head()
survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
# Frequency table for 'embarked': the output below lists the count, percent,
# cumulative count and cumulative percent of each value.
# NOTE(review): style=True presumably enables the formatted display — confirm in bp.
df.bp.freq(['embarked'],style=True)
embarked | Count | Percent | Cumulative Count | Cumulative Percent | |
---|---|---|---|---|---|
0 | S | 644 | 72.44% | 644 | 72.44% |
1 | C | 168 | 18.90% | 812 | 91.34% |
2 | Q | 77 | 8.66% | 889 | 100.00% |
# Per-column summary: the output below has one row per feature with its dtype,
# count/unique/missing/zeros/ones figures, summary statistics for numeric
# columns, and sample values (smallest5, largest5, first5, last5).
df.bp.describe()
Feature | Type | N | Count | Unique | Missing | MissingPct | Zeros | ZerosPct | Ones | OnesPct | mean | std | min | max | 25% | 50% | 75% | Feature2 | smallest5 | largest5 | first5 | last5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
11 | deck | category | 891 | 203 | 7 | 688 | 77.22 | 0 | 0.00 | 0 | 0.00 | deck | ['A', 'A', 'A', 'A', 'A'] | ['G', 'G', 'G', 'G', 'F'] | [nan, 'C', nan, 'C', nan] | [nan, 'B', nan, 'C', nan] | |||||||
3 | age | float64 | 891 | 714 | 88 | 177 | 19.87 | 0 | 0.00 | 7 | 0.79 | 29.70 | 14.53 | 0.42 | 80.00 | 20.12 | 28.00 | 38.00 | age | [0.42, 0.67, 0.75, 0.75, 0.83] | [80.0, 74.0, 71.0, 71.0, 70.5] | [22.0, 38.0, 26.0, 35.0, 35.0] | [27.0, 19.0, nan, 26.0, 32.0] |
7 | embarked | object | 891 | 889 | 3 | 2 | 0.22 | 0 | 0.00 | 0 | 0.00 | embarked | ['C', 'C', 'C', 'C', 'C'] | ['S', 'S', 'S', 'S', 'S'] | ['S', 'C', 'S', 'S', 'S'] | ['S', 'S', 'S', 'C', 'Q'] | |||||||
12 | embark_town | object | 891 | 889 | 3 | 2 | 0.22 | 0 | 0.00 | 0 | 0.00 | embark_town | ['Cherbourg', 'Cherbourg', 'Cherbourg', 'Cherbourg', 'Cherbourg'] | ['Southampton', 'Southampton', 'Southampton', 'Southampton', 'Southampton'] | ['Southampton', 'Cherbourg', 'Southampton', 'Southampton', 'Southampton'] | ['Southampton', 'Southampton', 'Southampton', 'Cherbourg', 'Queenstown'] | |||||||
5 | parch | int64 | 891 | 891 | 7 | 0 | 0.00 | 678 | 76.09 | 118 | 13.24 | 0.38 | 0.81 | 0.00 | 6.00 | 0.00 | 0.00 | 0.00 | parch | [0, 0, 0, 0, 0] | [6, 5, 5, 5, 5] | [0, 0, 0, 0, 0] | [0, 0, 2, 0, 0] |
4 | sibsp | int64 | 891 | 891 | 7 | 0 | 0.00 | 608 | 68.24 | 209 | 23.46 | 0.52 | 1.10 | 0.00 | 8.00 | 0.00 | 0.00 | 1.00 | sibsp | [0, 0, 0, 0, 0] | [8, 8, 8, 8, 8] | [1, 1, 0, 1, 0] | [0, 0, 1, 0, 0] |
0 | survived | int64 | 891 | 891 | 2 | 0 | 0.00 | 549 | 61.62 | 342 | 38.38 | 0.38 | 0.49 | 0.00 | 1.00 | 0.00 | 0.00 | 1.00 | survived | [0, 0, 0, 0, 0] | [1, 1, 1, 1, 1] | [0, 1, 1, 1, 0] | [0, 1, 0, 1, 0] |
10 | adult_male | bool | 891 | 891 | 2 | 0 | 0.00 | 354 | 39.73 | 537 | 60.27 | adult_male | [False, False, False, False, False] | [True, True, True, True, True] | [True, False, False, False, True] | [True, False, False, True, True] | |||||||
14 | alone | bool | 891 | 891 | 2 | 0 | 0.00 | 354 | 39.73 | 537 | 60.27 | alone | [False, False, False, False, False] | [True, True, True, True, True] | [False, False, True, False, True] | [True, True, False, True, True] | |||||||
6 | fare | float64 | 891 | 891 | 248 | 0 | 0.00 | 15 | 1.68 | 0 | 0.00 | 32.20 | 49.69 | 0.00 | 512.33 | 7.91 | 14.45 | 31.00 | fare | [0.0, 0.0, 0.0, 0.0, 0.0] | [512.3292, 512.3292, 512.3292, 263.0, 263.0] | [7.25, 71.2833, 7.925, 53.1, 8.05] | [13.0, 30.0, 23.45, 30.0, 7.75] |
1 | pclass | int64 | 891 | 891 | 3 | 0 | 0.00 | 0 | 0.00 | 216 | 24.24 | 2.31 | 0.84 | 1.00 | 3.00 | 2.00 | 3.00 | 3.00 | pclass | [1, 1, 1, 1, 1] | [3, 3, 3, 3, 3] | [3, 1, 3, 1, 3] | [2, 1, 3, 1, 3] |
2 | sex | object | 891 | 891 | 2 | 0 | 0.00 | 0 | 0.00 | 0 | 0.00 | sex | ['female', 'female', 'female', 'female', 'female'] | ['male', 'male', 'male', 'male', 'male'] | ['male', 'female', 'female', 'female', 'male'] | ['male', 'female', 'female', 'male', 'male'] | |||||||
8 | class | category | 891 | 891 | 3 | 0 | 0.00 | 0 | 0.00 | 0 | 0.00 | class | ['First', 'First', 'First', 'First', 'First'] | ['Third', 'Third', 'Third', 'Third', 'Third'] | ['Third', 'First', 'Third', 'First', 'Third'] | ['Second', 'First', 'Third', 'First', 'Third'] | |||||||
9 | who | object | 891 | 891 | 3 | 0 | 0.00 | 0 | 0.00 | 0 | 0.00 | who | ['child', 'child', 'child', 'child', 'child'] | ['woman', 'woman', 'woman', 'woman', 'woman'] | ['man', 'woman', 'woman', 'woman', 'man'] | ['man', 'woman', 'woman', 'man', 'man'] | |||||||
13 | alive | object | 891 | 891 | 2 | 0 | 0.00 | 0 | 0.00 | 0 | 0.00 | alive | ['no', 'no', 'no', 'no', 'no'] | ['yes', 'yes', 'yes', 'yes', 'yes'] | ['no', 'yes', 'yes', 'yes', 'no'] | ['no', 'yes', 'no', 'yes', 'no'] |
# Look for duplicated columns in the titanic frame; the result shown below
# is an empty list.
df.bp.get_duplicate_columns()
[]
# Show the dtype of every column.
df.dtypes
survived int64 pclass int64 sex object age float64 sibsp int64 parch int64 fare float64 embarked object class object who object adult_male bool deck object embark_town object alive object alone bool dtype: object
# Small frame in which column 'a_dup' carries the same values as column 'a',
# used to demonstrate duplicate-column detection.
df1 = pd.DataFrame(dict(a=range(3), b=range(1, 4), a_dup=range(3)))
df1
a | b | a_dup | |
---|---|---|---|
0 | 0 | 1 | 0 |
1 | 1 | 2 | 1 |
2 | 2 | 3 | 2 |
# Same check on the toy frame: the output below reports "a == a_dup" and
# returns ['a_dup'].
df1.bp.get_duplicate_columns()
a == a_dup
['a_dup']
# Missing-value report: the output below prints the high-missing threshold
# (80%), the number of features with missing values and the high/low column
# lists, followed by a table of the columns that have missing values.
df.bp.missing()
Missing values high threshold = 80% Number of missing values features: 4 cols_missing_high = [] cols_missing_low = ['deck', 'age', 'embarked', 'embark_town']
Feature | Type | Count | Missing | Zeros | Unique | MissingPct | ZerosPct | count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
11 | deck | object | 891 | 688 | 0 | 7 | 77.216611 | 0.000000 | ||||||||
3 | age | float64 | 891 | 177 | 0 | 88 | 19.865320 | 0.000000 | 714.000000 | 29.699118 | 14.526497 | 0.420000 | 20.125000 | 28.000000 | 38.000000 | 80.000000 |
7 | embarked | object | 891 | 2 | 0 | 3 | 0.224467 | 0.000000 | ||||||||
12 | embark_town | object | 891 | 2 | 0 | 3 | 0.224467 | 0.000000 |