Hints:
Hints:
Hints:
import time
time_start_notebook = time.time()
import numpy as np
import pandas as pd
import os,sys,time
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# modules
import missingno as msno
import pandas_profiling
from pandas_profiling import ProfileReport
from tqdm import tqdm
# modelling
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
# statsmodel
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.smpickle import load_pickle
# model evaluation
import scikitplot
from scikitplot import metrics as skpmetrics
# settings
SEED = 100
# Use the fully qualified option name: the bare 'max_columns' pattern is
# ambiguous on newer pandas (it also matches 'styler.render.max_columns')
# and raises OptionError there.
pd.set_option('display.max_columns', 100)
%matplotlib inline
%load_ext watermark
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark statsmodels.api 0.12.2 sklearn 0.23.1 pandas_profiling 2.11.0 pandas 1.3.0 numpy 1.19.5 scikitplot 0.3.7 missingno 0.4.2 seaborn 0.11.0
# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan")
from bhishan import bp
!ls ~/data
X_test_under_scaled.npz first_10.csv X_train_under_scaled.npz ser_dtype.csv df_under_features.csv ser_ytest_under.csv diabetes_project_dataset.csv ser_ytrain_under.csv diabetes_project_dataset.hdf5
# Load the table that lists the feature names (one name per row).
df_features = pd.read_csv(
    filepath_or_buffer=os.path.expanduser("~/data/df_under_features.csv"),
)
df_features.head(n=2)
0 | |
---|---|
0 | diabetes_time |
1 | age |
# The first column of the table holds the feature names; keep them as a
# plain Python list and show a short preview together with the count.
features = df_features.iloc[:, 0].tolist()
(features[:5], len(features))
(['diabetes_time', 'age', 'male', 'BMI', 'HDL'], 94)
%%time
# Read the scaled feature matrices from NumPy .npz archives (array key 'data').
# np.load on an .npz returns an NpzFile that keeps the underlying file open,
# so use it as a context manager to close the handle once the array is read.
p = os.path.expanduser("~/data/X_train_under_scaled.npz")
with np.load(p) as npz:
    X_train_under_scaled = npz['data']
p = os.path.expanduser("~/data/X_test_under_scaled.npz")
with np.load(p) as npz:
    X_test_under_scaled = npz['data']
X_train_under_scaled.shape, X_test_under_scaled.shape
CPU times: user 8.1 ms, sys: 1.31 ms, total: 9.41 ms Wall time: 9.18 ms
((1126, 94), (282, 94))
# Load the train and test target tables and key each by its "index" column.
ser_ytrain_under = pd.read_csv(
    os.path.expanduser("~/data/ser_ytrain_under.csv")
).set_index("index")
ser_ytest_under = pd.read_csv(
    os.path.expanduser("~/data/ser_ytest_under.csv")
).set_index("index")
ser_ytrain_under.head()
incident_diabetes | |
---|---|
index | |
658 | 0 |
1250 | 1 |
655 | 0 |
809 | 1 |
483 | 0 |
# Prepend a column of ones (the intercept term) to each scaled feature matrix.
X_train = np.column_stack(
    [np.ones(len(X_train_under_scaled)), X_train_under_scaled]
)
X_test = np.column_stack(
    [np.ones(len(X_test_under_scaled)), X_test_under_scaled]
)
(X_train.shape, X_test.shape)
((1126, 95), (282, 95))
# Fit a logistic regression of the training target on the design matrix
# (intercept column included) and keep the textual summary for inspection.
model = sm.Logit(endog=ser_ytrain_under, exog=X_train)
model_fit = model.fit()
summary = model_fit.summary()
Optimization terminated successfully. Current function value: 0.228698 Iterations 9
# Pair every coefficient's p-value with a readable label: the intercept is
# labelled 'constant', followed by the feature names in column order.
df_results = pd.DataFrame(
    {
        'feature': ['constant', *features],
        'pvalue': model_fit.pvalues,
    }
)
df_results.head()
feature | pvalue | |
---|---|---|
const | constant | 1.655085e-09 |
x1 | diabetes_time | 5.913692e-44 |
x2 | age | 1.241021e-01 |
x3 | male | 4.863915e-01 |
x4 | BMI | 1.287418e-06 |
# Order the table from the smallest p-value to the largest.
df_results = df_results.sort_values(by='pvalue', ascending=True)
df_results.head()
feature | pvalue | |
---|---|---|
x1 | diabetes_time | 5.913692e-44 |
const | constant | 1.655085e-09 |
x4 | BMI | 1.287418e-06 |
x10 | hypertension | 7.084951e-06 |
x30 | mtb_1834574 | 2.645495e-04 |
# Keep only the rows whose p-value is below 0.05.
df_res1 = df_results.loc[df_results['pvalue'] < 0.05]
print(df_res1.shape)
df_res1
(12, 2)
feature | pvalue | |
---|---|---|
x1 | diabetes_time | 5.913692e-44 |
const | constant | 1.655085e-09 |
x4 | BMI | 1.287418e-06 |
x10 | hypertension | 7.084951e-06 |
x30 | mtb_1834574 | 2.645495e-04 |
x70 | mtb_18407 | 4.456362e-04 |
x91 | mtb_18578 | 2.398839e-03 |
x68 | mtb_18402 | 1.358803e-02 |
x49 | mtb_18296 | 1.365069e-02 |
x32 | mtb_606773 | 2.080227e-02 |
x31 | mtb_1050860 | 3.088292e-02 |
x23 | mtb_1091716 | 4.695975e-02 |
# Print the labels of the rows that passed the p-value filter.
print(df_res1['feature'].tolist())
['diabetes_time', 'constant', 'BMI', 'hypertension', 'mtb_1834574', 'mtb_18407', 'mtb_18578', 'mtb_18402', 'mtb_18296', 'mtb_606773', 'mtb_1050860', 'mtb_1091716']
# Predicted probabilities from the fitted model for every test row.
y_prob1d = model_fit.predict(exog=X_test)
y_prob1d[:5]
array([0.9992048 , 0.00844896, 0.12970245, 0.4824638 , 0.99814307])
# Convert probabilities to hard 0/1 labels with a 0.5 cut-off.
y_pred = np.where(y_prob1d > 0.5, 1, 0).astype(np.int8)
y_pred[:5]
array([1, 0, 0, 0, 1], dtype=int8)
# Flatten the test target table into a 1-D array of labels.
y_test = np.array(ser_ytest_under).reshape(-1)
y_test[:5]
array([1, 0, 0, 0, 1])
# Plot the confusion matrix of true vs. predicted test labels; the trailing
# semicolon suppresses the returned object's repr in the notebook output.
skpmetrics.plot_confusion_matrix(y_true=y_test, y_pred=y_pred);
# List the scikitplot.metrics members whose name contains 'curve'.
bp.show_methods(skpmetrics, contains='curve')
0 | 1 | 2 | |
---|---|---|---|
0 | binary_ks_curve | plot_calibration_curve | plot_roc_curve |
1 | calibration_curve | plot_lift_curve | precision_recall_curve |
2 | cumulative_gain_curve | plot_precision_recall_curve | roc_curve |
# plot_roc is given one probability column per class, so stack the
# complement (class 0) next to the predicted probability (class 1).
y_prob2d = np.column_stack([1 - y_prob1d, y_prob1d])
skpmetrics.plot_roc(y_true=y_test, y_probas=y_prob2d);
# Report the wall-clock time elapsed since the notebook started.
time_taken = time.time() - time_start_notebook
# Round to whole seconds once, before splitting into fields. Rounding each
# field separately could print e.g. "0 min 60 secs" for 59.7 seconds.
total_secs = int(round(time_taken))
h, m = divmod(total_secs, 60 * 60)  # m: seconds left after whole hours
mins, secs = divmod(m, 60)
print('Time taken to run whole notebook: {:d} hr '
      '{:d} min {:d} secs'.format(h, mins, secs))
Time taken to run whole notebook: 0 hr 0 min 8 secs