import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the IPython extensions used in this notebook: autoreload and watermark.
%load_ext autoreload
%load_ext watermark

# autoreload mode 2: reload all modules before executing each cell.
%autoreload 2
# Print author (-a), date (-d), Python/IPython versions (-v) and machine info (-m).
%watermark -a "Bhishan Poudel" -d -v -m
# Print the versions of the packages imported above.
%watermark -iv

Bhishan Poudel 2021-07-11 

CPython 3.7.7
IPython 7.22.0

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit
pandas  1.2.4
seaborn 0.11.0
numpy   1.19.5


# Make the author's local "bhishan" package importable by adding its
# machine-specific Dropbox locations to the import path, then pull in `bp`.
import sys
sys.path.extend([
    "/Users/poudel/Dropbox/a00_Bhishan_Modules/",
    "/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan",
])
from bhishan import bp


%%javascript
// Raise the notebook output-area auto-scroll threshold (presumably so long
// outputs render in full instead of inside a scroll box; confirm).
IPython.OutputArea.auto_scroll_threshold = 9999;


# Load seaborn's "titanic" example dataset and preview its first five rows.
df = sns.load_dataset("titanic")
df.head(n=5)


# Stretch the notebook container to 100% of the browser width so wide
# tables can use the full page.
# `display` and `HTML` come from the public `IPython.display` module:
# importing them from `IPython.core.display` is deprecated since IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


# Detailed per-column description of the dataframe using my module bp.
# NOTE(review): `.bp` is not a built-in pandas attribute; presumably it is a
# DataFrame accessor registered by importing `bhishan` (confirm).
df.bp.describe()


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


# Modelling columns: one binary target and six predictors.
target = 'survived'
features = ['pclass','age','sibsp','parch','fare','alone']

# Keep only the modelling columns and drop every row with a missing value.
df = df[[*features, target]].dropna()

# Predictors as a float32 matrix (the boolean `alone` column is cast too);
# the target stays a pandas Series so it keeps the dataframe index.
X = df[features].astype(np.float32).to_numpy()
y = df[target]

# Seeded split, stratified on the target, with the default test size.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, stratify=y, random_state=100)


# Fit a logistic regression with default hyper-parameters and a fixed seed
# on the training split; `fit` returns the estimator itself.
clf_lr = LogisticRegression(random_state=100).fit(Xtrain, ytrain)

# Hard class predictions for the held-out split.
ypreds_lr = clf_lr.predict(Xtest)


from bhishan import bp
from sklearn import metrics as skmetrics


# Get a dataframe with various scalar model-evaluation metrics.
# NOTE(review): df_eval=None presumably starts a fresh table instead of
# appending to an existing one; confirm against bp.
df_eval = bp.get_binary_classification_scalar_metrics(
    "Logistic Regression",
    clf_lr,
    Xtest,
    ytest,
    ypreds_lr,
    desc="Default",
    df_eval=None,
)


# Get the classification report as a dataframe via bp.
# NOTE(review): style_col presumably selects the column to highlight; confirm.
df_clf_report = bp.get_binary_classification_report(
    "Logistic Regression",
    ytest,
    ypreds_lr,
    desc='',
    style_col='Recall_1',
    df_clf_report=None,
)


# Plain-text per-class precision / recall / f1-score / support report
# from scikit-learn, for comparison with the bp report.
print(skmetrics.classification_report(ytest,ypreds_lr))

              precision    recall  f1-score   support

           0       0.74      0.88      0.80       106
           1       0.75      0.55      0.63        73

    accuracy                           0.74       179
   macro avg       0.75      0.71      0.72       179
weighted avg       0.74      0.74      0.73       179


# Side-by-side table of true and predicted labels; the row index is the
# one carried by the `ytest` Series.
df_out = pd.DataFrame(dict(ytest=ytest, ypreds=ypreds_lr))
df_out.head(n=5)


# Cross-tabulate actual labels (rows) against predicted labels (columns).
pd.crosstab(index=df_out['ytest'], columns=df_out['ypreds'])


# Confusion matrix from scikit-learn: rows are true classes, columns are
# predicted classes.
cm = skmetrics.confusion_matrix(y_true=ytest, y_pred=ypreds_lr)
print(cm)

[[93 13]
 [33 40]]


# Labelled confusion matrix via bp.
# The target is `survived`: class 0 means the passenger died and class 1
# means the passenger survived, so `zero` must be 'Dead' and `one` 'Alive'.
# (The labels were previously swapped, which tagged the class-0 row and its
# 93/13 counts as "Alive".)
df_cm = bp.print_confusion_matrix('Logistic Regression', ytest,ypreds_lr,
                                  zero='Dead',one='Alive')


# Plotly confusion matrix via bp.
# The target is `survived`, so class 0 is "Dead" and class 1 is "Alive".
# NOTE(review): assumes `labels` is given in class order [0, 1]; confirm
# against bp.plot_confusion_matrix_plotly.
bp.plot_confusion_matrix_plotly(ytest,ypreds_lr,labels=['Dead','Alive'])


# Predicted class probabilities for the test split, one column per class;
# show the first five rows.
yprobs = clf_lr.predict_proba(Xtest)
yprobs[0:5]

array([[0.31097563, 0.68902437],
       [0.68147744, 0.31852256],
       [0.52856291, 0.47143709],
       [0.79029474, 0.20970526],
       [0.86451932, 0.13548068]])


# help(bp.show_methods)


# Look up the AUC-related helpers available in bp.
# NOTE(review): `contains` presumably filters names by substring; confirm.
bp.show_methods(bp,contains='auc')


# Sanity check: one true label per test row and two probability columns per row.
ytest.shape, yprobs.shape

((179,), (179, 2))


# ROC curve / AUC via bp.
# NOTE(review): yprobs is the full two-column predict_proba output; confirm
# that bp.plot_roc_auc accepts it as-is rather than only the class-1 column.
bp.plot_roc_auc(ytest,yprobs)


# using scikitplot


from scikitplot import metrics as skpmetrics


# List the plotting helpers (names beginning with "plot") in scikitplot.metrics.
bp.show_methods(skpmetrics,starts='plot')


# ROC curves with scikit-plot, passing the true labels and the two-column
# predict_proba output.
skpmetrics.plot_roc(ytest,yprobs)

	survived	pclass	sex	age	sibsp	fare	embarked	class	who	adult_male	deck	embark_town	alive	alone
0	0	3	male	22.0	1	7.2500	S	Third	man	True	NaN	Southampton	no	False
1	1	1	female	38.0	1	71.2833	C	First	woman	False	C	Cherbourg	yes	False
2	1	3	female	26.0	0	7.9250	S	Third	woman	False	NaN	Southampton	yes	True
3	1	1	female	35.0	1	53.1000	S	First	woman	False	C	Southampton	yes	False
4	0	3	male	35.0	0	8.0500	S	Third	man	True	NaN	Southampton	no	True

	Feature	Type	N	Count	Unique	Missing	MissingPct	Zeros	ZerosPct	Ones	OnesPct	mean	std	min	max	25%	50%	75%	Feature2	smallest5	largest5	first5	last5
11	deck	category	891	203	7	688	77.22	0	0.00	0	0.00								deck	['A', 'A', 'A', 'A', 'A']	['G', 'G', 'G', 'G', 'F']	[nan, 'C', nan, 'C', nan]	[nan, 'B', nan, 'C', nan]
3	age	float64	891	714	88	177	19.87	0	0.00	7	0.79	29.70	14.53	0.42	80.00	20.12	28.00	38.00	age	[0.42, 0.67, 0.75, 0.75, 0.83]	[80.0, 74.0, 71.0, 71.0, 70.5]	[22.0, 38.0, 26.0, 35.0, 35.0]	[27.0, 19.0, nan, 26.0, 32.0]
7	embarked	object	891	889	3	2	0.22	0	0.00	0	0.00								embarked	['C', 'C', 'C', 'C', 'C']	['S', 'S', 'S', 'S', 'S']	['S', 'C', 'S', 'S', 'S']	['S', 'S', 'S', 'C', 'Q']
12	embark_town	object	891	889	3	2	0.22	0	0.00	0	0.00								embark_town	['Cherbourg', 'Cherbourg', 'Cherbourg', 'Cherbourg', 'Cherbourg']	['Southampton', 'Southampton', 'Southampton', 'Southampton', 'Southampton']	['Southampton', 'Cherbourg', 'Southampton', 'Southampton', 'Southampton']	['Southampton', 'Southampton', 'Southampton', 'Cherbourg', 'Queenstown']
5	parch	int64	891	891	7	0	0.00	678	76.09	118	13.24	0.38	0.81	0.00	6.00	0.00	0.00	0.00	parch	[0, 0, 0, 0, 0]	[6, 5, 5, 5, 5]	[0, 0, 0, 0, 0]	[0, 0, 2, 0, 0]
4	sibsp	int64	891	891	7	0	0.00	608	68.24	209	23.46	0.52	1.10	0.00	8.00	0.00	0.00	1.00	sibsp	[0, 0, 0, 0, 0]	[8, 8, 8, 8, 8]	[1, 1, 0, 1, 0]	[0, 0, 1, 0, 0]
0	survived	int64	891	891	2	0	0.00	549	61.62	342	38.38	0.38	0.49	0.00	1.00	0.00	0.00	1.00	survived	[0, 0, 0, 0, 0]	[1, 1, 1, 1, 1]	[0, 1, 1, 1, 0]	[0, 1, 0, 1, 0]
10	adult_male	bool	891	891	2	0	0.00	354	39.73	537	60.27								adult_male	[False, False, False, False, False]	[True, True, True, True, True]	[True, False, False, False, True]	[True, False, False, True, True]
14	alone	bool	891	891	2	0	0.00	354	39.73	537	60.27								alone	[False, False, False, False, False]	[True, True, True, True, True]	[False, False, True, False, True]	[True, True, False, True, True]
6	fare	float64	891	891	248	0	0.00	15	1.68	0	0.00	32.20	49.69	0.00	512.33	7.91	14.45	31.00	fare	[0.0, 0.0, 0.0, 0.0, 0.0]	[512.3292, 512.3292, 512.3292, 263.0, 263.0]	[7.25, 71.2833, 7.925, 53.1, 8.05]	[13.0, 30.0, 23.45, 30.0, 7.75]
1	pclass	int64	891	891	3	0	0.00	0	0.00	216	24.24	2.31	0.84	1.00	3.00	2.00	3.00	3.00	pclass	[1, 1, 1, 1, 1]	[3, 3, 3, 3, 3]	[3, 1, 3, 1, 3]	[2, 1, 3, 1, 3]
2	sex	object	891	891	2	0	0.00	0	0.00	0	0.00								sex	['female', 'female', 'female', 'female', 'female']	['male', 'male', 'male', 'male', 'male']	['male', 'female', 'female', 'female', 'male']	['male', 'female', 'female', 'male', 'male']
8	class	category	891	891	3	0	0.00	0	0.00	0	0.00								class	['First', 'First', 'First', 'First', 'First']	['Third', 'Third', 'Third', 'Third', 'Third']	['Third', 'First', 'Third', 'First', 'Third']	['Second', 'First', 'Third', 'First', 'Third']
9	who	object	891	891	3	0	0.00	0	0.00	0	0.00								who	['child', 'child', 'child', 'child', 'child']	['woman', 'woman', 'woman', 'woman', 'woman']	['man', 'woman', 'woman', 'woman', 'man']	['man', 'woman', 'woman', 'man', 'man']
13	alive	object	891	891	2	0	0.00	0	0.00	0	0.00								alive	['no', 'no', 'no', 'no', 'no']	['yes', 'yes', 'yes', 'yes', 'yes']	['no', 'yes', 'yes', 'yes', 'no']	['no', 'yes', 'no', 'yes', 'no']

	0	1	2
0	plot_calibration_curve	plot_lift_curve	plot_roc
1	plot_confusion_matrix	plot_precision_recall	plot_roc_curve
2	plot_cumulative_gain	plot_precision_recall_curve	plot_silhouette
3	plot_ks_statistic

Table of Contents

Imports¶

Load data¶

Modelling¶

Model Evaluation using module "bp"¶

Accuracy Precision and Recall¶

Classification Report¶

Confusion Matrix¶

Area Under the Curve¶

	Predicted_Alive	Predicted_Dead	Total_Alive	Correct_Alive	Incorrect_Alive	Alive_Detection	Total_Dead	Correct_Dead	Incorrect_Dead	Dead_Detection
Alive	93	13	106	93	13	87.74%	73	40	33	54.79%
Dead	33	40	106	93	13	87.74%	73	40	33	54.79%

	ytest	ypreds
9	1	1
374	0	0
733	0	0
821	1	0
528	0	0

ypreds	0	1
ytest
0	93	13
1	33	40