import time
time_start_notebook = time.time()


import numpy as np
import pandas as pd
import os,sys,time

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# modules
import missingno as msno
import pandas_profiling
from pandas_profiling import ProfileReport
from tqdm import tqdm

# modelling
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

# statsmodel
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.smpickle import load_pickle

# model evaluation
import scikitplot
from scikitplot import metrics as skpmetrics

# settings
SEED = 100
pd.set_option('max_columns',100)

%matplotlib inline
%load_ext watermark
%watermark -iv

seaborn          0.11.0
missingno        0.4.2
scikitplot       0.3.7
numpy            1.19.5
statsmodels.api  0.12.2
pandas_profiling 2.11.0
pandas           1.3.0
sklearn          0.23.1


# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan")
from bhishan import bp


!ls ~/data

X_test_under_scaled.npz       first_10.csv
X_train_under_scaled.npz      ser_dtype.csv
df_under_features.csv         ser_ytest_under.csv
diabetes_project_dataset.csv  ser_ytrain_under.csv
diabetes_project_dataset.hdf5


df_features = pd.read_csv(os.path.expanduser("~/data/df_under_features.csv"))
df_features.head(2)


features = df_features.iloc[:,0].to_list()
features[:5], len(features)

(['diabetes_time', 'age', 'male', 'BMI', 'HDL'], 94)


%%time

# read numpy pickled files
p = os.path.expanduser("~/data/X_train_under_scaled.npz")
X_train_under_scaled = np.load(p)['data']

p = os.path.expanduser("~/data/X_test_under_scaled.npz")
X_test_under_scaled = np.load(p)['data']

X_train_under_scaled.shape, X_test_under_scaled.shape

CPU times: user 9.76 ms, sys: 3.34 ms, total: 13.1 ms
Wall time: 23.7 ms

((1126, 94), (282, 94))


# read pandas series
ser_ytrain_under = pd.read_csv(os.path.expanduser("~/data/ser_ytrain_under.csv"))
ser_ytrain_under = ser_ytrain_under.set_index("index")

ser_ytest_under = pd.read_csv(os.path.expanduser("~/data/ser_ytest_under.csv"))
ser_ytest_under = ser_ytest_under.set_index("index")

ser_ytrain_under.head()


print(features)

['diabetes_time', 'age', 'male', 'BMI', 'HDL', 'LDL', 'trig', 'SBP', 'DBP', 'hypertension', 'fasting', 'current_smoker', 'ex_smoker', 'exercise', 'healthy_vegetables', 'junk_food', 'total_fiber', 'mtb_1368087', 'mtb_1380093', 'mtb_1812369', 'mtb_1838668', 'mtb_1042362', 'mtb_1091716', 'mtb_1228672', 'mtb_1542487', 'mtb_1272352', 'mtb_1391826', 'mtb_1435571', 'mtb_1521753', 'mtb_1834574', 'mtb_1050860', 'mtb_606773', 'mtb_638620', 'mtb_752773', 'mtb_590255', 'mtb_709794', 'mtb_352255', 'mtb_509192', 'mtb_1230298', 'mtb_841524', 'mtb_1957718', 'mtb_1937123', 'mtb_1940724', 'mtb_18238', 'mtb_18261', 'mtb_18262', 'mtb_18266', 'mtb_18274', 'mtb_18296', 'mtb_18299', 'mtb_18323', 'mtb_18324', 'mtb_18325', 'mtb_18326', 'mtb_18327', 'mtb_18333', 'mtb_18350', 'mtb_18351', 'mtb_18359', 'mtb_18362', 'mtb_18364', 'mtb_18365', 'mtb_18382', 'mtb_18385', 'mtb_18386', 'mtb_18389', 'mtb_18398', 'mtb_18402', 'mtb_18406', 'mtb_18407', 'mtb_18415', 'mtb_18423', 'mtb_18440', 'mtb_18464', 'mtb_18468', 'mtb_18470', 'mtb_18477', 'mtb_18486', 'mtb_18488', 'mtb_18491', 'mtb_18496', 'mtb_18500', 'mtb_18509', 'mtb_18521', 'mtb_18536', 'mtb_18546', 'mtb_18555', 'mtb_18559', 'mtb_18566', 'mtb_18569', 'mtb_18578', 'mtb_18594', 'mtb_18601', 'mtb_18607']


cols_mtb = [i for i in features if i.startswith('mtb')]
cols_mtb[:2], len(cols_mtb)

(['mtb_1368087', 'mtb_1380093'], 77)


num_exclude = len(features) - len(cols_mtb)
num_exclude

17


len(features[num_exclude:])

77


X_train = np.c_[np.ones(len(X_train_under_scaled)), X_train_under_scaled[:, num_exclude:]]
X_test  = np.c_[np.ones(len(X_test_under_scaled)), X_test_under_scaled[:,num_exclude:]]

X_train.shape, X_test.shape

((1126, 78), (282, 78))


model = sm.Logit(ser_ytrain_under,X_train)

model_fit = model.fit()
summary = model_fit.summary()

Optimization terminated successfully.
         Current function value: 0.618346
         Iterations 7


df_results = pd.DataFrame({'feature': ['constant'] + features[num_exclude:],
                           'pvalue': model_fit.pvalues})
df_results.head()


# sort by pvalues
df_results = df_results.sort_values('pvalue')
df_results.head()


# select only feature that have p-value < 0.05
df_res1 = df_results.query("pvalue < 0.05")

print(df_res1.shape)
df_res1

(13, 2)


print(df_res1['feature'].to_list())

['mtb_18470', 'mtb_606773', 'mtb_1368087', 'mtb_509192', 'mtb_1230298', 'mtb_18464', 'mtb_1391826', 'mtb_1435571', 'mtb_590255', 'mtb_1940724', 'mtb_1228672', 'mtb_18350', 'mtb_18327']


y_prob1d = model_fit.predict(X_test)
y_prob1d[:5]

array([0.32742499, 0.16508122, 0.65117447, 0.5311954 , 0.8595927 ])


y_pred = (y_prob1d > 0.5).astype(np.int8)
y_pred[:5]

array([0, 0, 1, 1, 1], dtype=int8)


y_test = np.array(ser_ytest_under).ravel()
y_test[:5]

array([1, 0, 0, 0, 1])


skpmetrics.plot_confusion_matrix(y_test,y_pred);


bp.show_methods(skpmetrics,contains='curve')


y_prob2d = np.c_[1-y_prob1d, y_prob1d]
skpmetrics.plot_roc(y_test, y_prob2d );


time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))

Time taken to run whole notebook: 0 hr 8 min 42 secs

	incident_diabetes
index
658	0
1250	1
655	0
809	1
483	0

	feature	pvalue
const	constant	0.292524
x1	mtb_1368087	0.007168
x2	mtb_1380093	0.154740
x3	mtb_1812369	0.656889
x4	mtb_1838668	0.663524

	feature	pvalue
x59	mtb_18470	0.004471
x15	mtb_606773	0.006752
x1	mtb_1368087	0.007168
x21	mtb_509192	0.007968
x22	mtb_1230298	0.008953

	feature	pvalue
x59	mtb_18470	0.004471
x15	mtb_606773	0.006752
x1	mtb_1368087	0.007168
x21	mtb_509192	0.007968
x22	mtb_1230298	0.008953
x57	mtb_18464	0.009283
x10	mtb_1391826	0.010141
x11	mtb_1435571	0.012980
x18	mtb_590255	0.014789
x26	mtb_1940724	0.020747
x7	mtb_1228672	0.037823
x40	mtb_18350	0.039221
x38	mtb_18327	0.040931

	0	1	2
0	binary_ks_curve	plot_calibration_curve	plot_roc_curve
1	calibration_curve	plot_lift_curve	precision_recall_curve
2	cumulative_gain_curve	plot_precision_recall_curve	roc_curve

Table of Contents

Description¶

Question 01¶

Quesiton 02¶

Question 03¶

Import the modules¶

Load the data¶

Select only biomarkers¶

Modelling logistic regression using statsmodels¶

Get p-values¶

Model Evaluation¶

Time Taken¶

	0
0	diabetes_time
1	age