import time
# Wall-clock reference point; the last cell subtracts it from time.time()
# to report how long the whole notebook took to run.
time_start_notebook = time.time()


import numpy as np
import pandas as pd
import os,sys,time

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# modules
import missingno as msno
import pandas_profiling
from pandas_profiling import ProfileReport
from tqdm import tqdm

# modelling
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

# statsmodel
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.smpickle import load_pickle

# model evaluation
import scikitplot
from scikitplot import metrics as skpmetrics

# settings
# Seed value kept in one place for any step that needs reproducible randomness.
SEED = 100
# Use the fully-qualified option name. The short form 'max_columns' is
# resolved by pattern matching and becomes ambiguous (OptionError) on pandas
# versions that also define 'styler.render.max_columns'.
pd.set_option('display.max_columns', 100)

# render matplotlib figures inside the notebook
%matplotlib inline
# load the watermark extension and print the versions of the imported modules
%load_ext watermark
%watermark -iv

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
statsmodels.api  0.12.2
sklearn          0.23.1
pandas_profiling 2.11.0
pandas           1.3.0
numpy            1.19.5
scikitplot       0.3.7
missingno        0.4.2
seaborn          0.11.0


# my local library
# NOTE(review): the path below is absolute and machine-specific; on another
# machine `bhishan` presumably has to be installed or this path adjusted
# before the import can succeed.
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan")
from bhishan import bp


# list the files available in the data directory used by the cells below
!ls ~/data

X_test_under_scaled.npz       first_10.csv
X_train_under_scaled.npz      ser_dtype.csv
df_under_features.csv         ser_ytest_under.csv
diabetes_project_dataset.csv  ser_ytrain_under.csv
diabetes_project_dataset.hdf5


# table of the features used by the undersampled data set
df_features = pd.read_csv(
    os.path.expanduser("~/data/df_under_features.csv")
)
df_features.head(n=2)


# the first column of that table holds the feature names
features = df_features.iloc[:, 0].tolist()
features[:5], len(features)

(['diabetes_time', 'age', 'male', 'BMI', 'HDL'], 94)


%%time

# Load the scaled feature matrices from their NumPy .npz archives.
# np.load returns an NpzFile that keeps the archive open until it is closed,
# so each archive is opened in a `with` block; the 'data' array is read into
# memory inside the block and stays usable after the file is closed.
p = os.path.expanduser("~/data/X_train_under_scaled.npz")
with np.load(p) as npz:
    X_train_under_scaled = npz['data']

p = os.path.expanduser("~/data/X_test_under_scaled.npz")
with np.load(p) as npz:
    X_test_under_scaled = npz['data']

X_train_under_scaled.shape, X_test_under_scaled.shape

CPU times: user 8.1 ms, sys: 1.31 ms, total: 9.41 ms
Wall time: 9.18 ms

((1126, 94), (282, 94))


# Load the train/test labels; the "index" column of each CSV becomes
# the row index of the loaded frame.
ser_ytrain_under = (
    pd.read_csv(os.path.expanduser("~/data/ser_ytrain_under.csv"))
      .set_index("index")
)

ser_ytest_under = (
    pd.read_csv(os.path.expanduser("~/data/ser_ytest_under.csv"))
      .set_index("index")
)

ser_ytrain_under.head()


# Prepend a column of ones (the intercept term) to both feature matrices.
X_train = np.column_stack(
    (np.ones(len(X_train_under_scaled)), X_train_under_scaled)
)
X_test = np.column_stack(
    (np.ones(len(X_test_under_scaled)), X_test_under_scaled)
)

X_train.shape, X_test.shape

((1126, 95), (282, 95))


# Logistic regression with statsmodels. X_train already carries a leading
# column of ones, so the intercept is part of the design matrix.
model = sm.Logit(ser_ytrain_under,X_train)

# fit() prints the optimizer's convergence report as a side effect
model_fit = model.fit()
# the summary table is stored here but not displayed by this cell
summary = model_fit.summary()

Optimization terminated successfully.
         Current function value: 0.228698
         Iterations 9


# Pair each coefficient's p-value with a readable name; the first
# coefficient belongs to the column of ones, hence the 'constant' label.
df_results = pd.DataFrame({
    'feature': ['constant', *features],
    'pvalue': model_fit.pvalues,
})
df_results.head()


# order the rows by ascending p-value
df_results = df_results.sort_values(by='pvalue')
df_results.head()


# keep only the features whose p-value is below 0.05
df_res1 = df_results[df_results['pvalue'] < 0.05]

print(df_res1.shape)
df_res1

(12, 2)


# names of the features that passed the p-value filter, as a plain list
print(df_res1['feature'].to_list())

['diabetes_time', 'constant', 'BMI', 'hypertension', 'mtb_1834574', 'mtb_18407', 'mtb_18578', 'mtb_18402', 'mtb_18296', 'mtb_606773', 'mtb_1050860', 'mtb_1091716']


# predicted probability for every test row (X_test includes the ones column)
y_prob1d = model_fit.predict(X_test)
y_prob1d[:5]

array([0.9992048 , 0.00844896, 0.12970245, 0.4824638 , 0.99814307])


# hard class labels: 1 when the probability is strictly above 0.5, else 0
y_pred = (y_prob1d > 0.5).astype(np.int8)
y_pred[:5]

array([1, 0, 0, 0, 1], dtype=int8)


# flatten the test labels into a 1-D array so they line up with y_pred
# (assumes the label frame has a single column — TODO confirm)
y_test = np.array(ser_ytest_under).ravel()
y_test[:5]

array([1, 0, 0, 0, 1])


# confusion matrix of the thresholded predictions against the true labels
skpmetrics.plot_confusion_matrix(y_test, y_pred);


# list the scikit-plot metric helpers whose name contains 'curve'
bp.show_methods(skpmetrics, contains='curve')


# two probability columns for the ROC plot: 1 - p first, then p
y_prob2d = np.column_stack((1 - y_prob1d, y_prob1d))
skpmetrics.plot_roc(y_test, y_prob2d);


# Report the wall-clock time elapsed since `time_start_notebook` was set.
time_taken = time.time() - time_start_notebook
# Round to whole seconds *before* splitting into h/m/s. Formatting the
# fractional remainder with {:.0f} could otherwise print "60 secs"
# (e.g. 59.6 s) instead of carrying over into the minutes field.
total_secs = int(round(time_taken))
h, rem = divmod(total_secs, 60*60)
m, s = divmod(rem, 60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, m, s))

Time taken to run whole notebook: 0 hr 0 min 8 secs

	incident_diabetes
index
658	0
1250	1
655	0
809	1
483	0

	feature	pvalue
const	constant	1.655085e-09
x1	diabetes_time	5.913692e-44
x2	age	1.241021e-01
x3	male	4.863915e-01
x4	BMI	1.287418e-06

	feature	pvalue
x1	diabetes_time	5.913692e-44
const	constant	1.655085e-09
x4	BMI	1.287418e-06
x10	hypertension	7.084951e-06
x30	mtb_1834574	2.645495e-04

	feature	pvalue
x1	diabetes_time	5.913692e-44
const	constant	1.655085e-09
x4	BMI	1.287418e-06
x10	hypertension	7.084951e-06
x30	mtb_1834574	2.645495e-04
x70	mtb_18407	4.456362e-04
x91	mtb_18578	2.398839e-03
x68	mtb_18402	1.358803e-02
x49	mtb_18296	1.365069e-02
x32	mtb_606773	2.080227e-02
x31	mtb_1050860	3.088292e-02
x23	mtb_1091716	4.695975e-02

	0	1	2
0	binary_ks_curve	plot_calibration_curve	plot_roc_curve
1	calibration_curve	plot_lift_curve	precision_recall_curve
2	cumulative_gain_curve	plot_precision_recall_curve	roc_curve

Table of Contents

Description¶

Question 01¶

Question 02¶

Question 03¶

Import the modules¶

Load the data¶

Modelling logistic regression using statsmodels¶

Get p-values¶

Model Evaluation¶

Time Taken¶