Hints:
Hints:
Hints:
import time
time_start_notebook = time.time()
import numpy as np
import pandas as pd
import os,sys,time
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# modules
import missingno as msno
import pandas_profiling
from pandas_profiling import ProfileReport
from tqdm import tqdm
# modelling
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
# statsmodel
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.smpickle import load_pickle
# model evaluation
import scikitplot
from scikitplot import metrics as skpmetrics
# settings
SEED = 100
pd.set_option('max_columns',100)
%matplotlib inline
%load_ext watermark
%watermark -iv
seaborn 0.11.0 missingno 0.4.2 scikitplot 0.3.7 numpy 1.19.5 statsmodels.api 0.12.2 pandas_profiling 2.11.0 pandas 1.3.0 sklearn 0.23.1
# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan")
from bhishan import bp
!ls ~/data
X_test_under_scaled.npz first_10.csv X_train_under_scaled.npz ser_dtype.csv df_under_features.csv ser_ytest_under.csv diabetes_project_dataset.csv ser_ytrain_under.csv diabetes_project_dataset.hdf5
df_features = pd.read_csv(os.path.expanduser("~/data/df_under_features.csv"))
df_features.head(2)
0 | |
---|---|
0 | diabetes_time |
1 | age |
features = df_features.iloc[:,0].to_list()
features[:5], len(features)
(['diabetes_time', 'age', 'male', 'BMI', 'HDL'], 94)
%%time
# read numpy pickled files
p = os.path.expanduser("~/data/X_train_under_scaled.npz")
X_train_under_scaled = np.load(p)['data']
p = os.path.expanduser("~/data/X_test_under_scaled.npz")
X_test_under_scaled = np.load(p)['data']
X_train_under_scaled.shape, X_test_under_scaled.shape
CPU times: user 9.76 ms, sys: 3.34 ms, total: 13.1 ms Wall time: 23.7 ms
((1126, 94), (282, 94))
# read pandas series
ser_ytrain_under = pd.read_csv(os.path.expanduser("~/data/ser_ytrain_under.csv"))
ser_ytrain_under = ser_ytrain_under.set_index("index")
ser_ytest_under = pd.read_csv(os.path.expanduser("~/data/ser_ytest_under.csv"))
ser_ytest_under = ser_ytest_under.set_index("index")
ser_ytrain_under.head()
incident_diabetes | |
---|---|
index | |
658 | 0 |
1250 | 1 |
655 | 0 |
809 | 1 |
483 | 0 |
print(features)
['diabetes_time', 'age', 'male', 'BMI', 'HDL', 'LDL', 'trig', 'SBP', 'DBP', 'hypertension', 'fasting', 'current_smoker', 'ex_smoker', 'exercise', 'healthy_vegetables', 'junk_food', 'total_fiber', 'mtb_1368087', 'mtb_1380093', 'mtb_1812369', 'mtb_1838668', 'mtb_1042362', 'mtb_1091716', 'mtb_1228672', 'mtb_1542487', 'mtb_1272352', 'mtb_1391826', 'mtb_1435571', 'mtb_1521753', 'mtb_1834574', 'mtb_1050860', 'mtb_606773', 'mtb_638620', 'mtb_752773', 'mtb_590255', 'mtb_709794', 'mtb_352255', 'mtb_509192', 'mtb_1230298', 'mtb_841524', 'mtb_1957718', 'mtb_1937123', 'mtb_1940724', 'mtb_18238', 'mtb_18261', 'mtb_18262', 'mtb_18266', 'mtb_18274', 'mtb_18296', 'mtb_18299', 'mtb_18323', 'mtb_18324', 'mtb_18325', 'mtb_18326', 'mtb_18327', 'mtb_18333', 'mtb_18350', 'mtb_18351', 'mtb_18359', 'mtb_18362', 'mtb_18364', 'mtb_18365', 'mtb_18382', 'mtb_18385', 'mtb_18386', 'mtb_18389', 'mtb_18398', 'mtb_18402', 'mtb_18406', 'mtb_18407', 'mtb_18415', 'mtb_18423', 'mtb_18440', 'mtb_18464', 'mtb_18468', 'mtb_18470', 'mtb_18477', 'mtb_18486', 'mtb_18488', 'mtb_18491', 'mtb_18496', 'mtb_18500', 'mtb_18509', 'mtb_18521', 'mtb_18536', 'mtb_18546', 'mtb_18555', 'mtb_18559', 'mtb_18566', 'mtb_18569', 'mtb_18578', 'mtb_18594', 'mtb_18601', 'mtb_18607']
cols_mtb = [i for i in features if i.startswith('mtb')]
cols_mtb[:2], len(cols_mtb)
(['mtb_1368087', 'mtb_1380093'], 77)
num_exclude = len(features) - len(cols_mtb)
num_exclude
17
len(features[num_exclude:])
77
X_train = np.c_[np.ones(len(X_train_under_scaled)), X_train_under_scaled[:, num_exclude:]]
X_test = np.c_[np.ones(len(X_test_under_scaled)), X_test_under_scaled[:,num_exclude:]]
X_train.shape, X_test.shape
((1126, 78), (282, 78))
model = sm.Logit(ser_ytrain_under,X_train)
model_fit = model.fit()
summary = model_fit.summary()
Optimization terminated successfully. Current function value: 0.618346 Iterations 7
df_results = pd.DataFrame({'feature': ['constant'] + features[num_exclude:],
'pvalue': model_fit.pvalues})
df_results.head()
feature | pvalue | |
---|---|---|
const | constant | 0.292524 |
x1 | mtb_1368087 | 0.007168 |
x2 | mtb_1380093 | 0.154740 |
x3 | mtb_1812369 | 0.656889 |
x4 | mtb_1838668 | 0.663524 |
# sort by pvalues
df_results = df_results.sort_values('pvalue')
df_results.head()
feature | pvalue | |
---|---|---|
x59 | mtb_18470 | 0.004471 |
x15 | mtb_606773 | 0.006752 |
x1 | mtb_1368087 | 0.007168 |
x21 | mtb_509192 | 0.007968 |
x22 | mtb_1230298 | 0.008953 |
# select only feature that have p-value < 0.05
df_res1 = df_results.query("pvalue < 0.05")
print(df_res1.shape)
df_res1
(13, 2)
feature | pvalue | |
---|---|---|
x59 | mtb_18470 | 0.004471 |
x15 | mtb_606773 | 0.006752 |
x1 | mtb_1368087 | 0.007168 |
x21 | mtb_509192 | 0.007968 |
x22 | mtb_1230298 | 0.008953 |
x57 | mtb_18464 | 0.009283 |
x10 | mtb_1391826 | 0.010141 |
x11 | mtb_1435571 | 0.012980 |
x18 | mtb_590255 | 0.014789 |
x26 | mtb_1940724 | 0.020747 |
x7 | mtb_1228672 | 0.037823 |
x40 | mtb_18350 | 0.039221 |
x38 | mtb_18327 | 0.040931 |
print(df_res1['feature'].to_list())
['mtb_18470', 'mtb_606773', 'mtb_1368087', 'mtb_509192', 'mtb_1230298', 'mtb_18464', 'mtb_1391826', 'mtb_1435571', 'mtb_590255', 'mtb_1940724', 'mtb_1228672', 'mtb_18350', 'mtb_18327']
y_prob1d = model_fit.predict(X_test)
y_prob1d[:5]
array([0.32742499, 0.16508122, 0.65117447, 0.5311954 , 0.8595927 ])
y_pred = (y_prob1d > 0.5).astype(np.int8)
y_pred[:5]
array([0, 0, 1, 1, 1], dtype=int8)
y_test = np.array(ser_ytest_under).ravel()
y_test[:5]
array([1, 0, 0, 0, 1])
skpmetrics.plot_confusion_matrix(y_test,y_pred);
bp.show_methods(skpmetrics,contains='curve')
0 | 1 | 2 | |
---|---|---|---|
0 | binary_ks_curve | plot_calibration_curve | plot_roc_curve |
1 | calibration_curve | plot_lift_curve | precision_recall_curve |
2 | cumulative_gain_curve | plot_precision_recall_curve | roc_curve |
y_prob2d = np.c_[1-y_prob1d, y_prob1d]
skpmetrics.plot_roc(y_test, y_prob2d );
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 8 min 42 secs