import time
# Wall-clock timestamp taken before anything else runs. Nothing in the visible
# part of the notebook reads it back — presumably it is used at the very end to
# report the total run time (TODO confirm).
time_start_notebook = time.time()


import numpy as np
import pandas as pd
import os,sys,time

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# modules
import missingno as msno
import pandas_profiling
from pandas_profiling import ProfileReport
from tqdm import tqdm

# modelling
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

# statsmodel
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.smpickle import load_pickle

# model evaluation
import scikitplot
from scikitplot import metrics as skpmetrics
import collections

# settings
SEED = 100  # shared random_state for every clustering run in this notebook
# Use the fully-qualified option name: the bare 'max_columns' pattern is
# ambiguous on newer pandas releases (it also matches
# 'styler.render.max_columns') and raises OptionError there.
pd.set_option('display.max_columns', 100)

%matplotlib inline
%load_ext watermark
%watermark -iv

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
sklearn          0.23.1
statsmodels.api  0.12.2
pandas_profiling 2.11.0
seaborn          0.11.0
pandas           1.3.0
scikitplot       0.3.7
numpy            1.19.5
missingno        0.4.2


# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan")
from bhishan import bp


!ls ~/data

X_test_under_scaled.npz       first_10.csv
X_train_under_scaled.npz      ser_dtype.csv
df_under_features.csv         ser_ytest_under.csv
diabetes_project_dataset.csv  ser_ytrain_under.csv
diabetes_project_dataset.hdf5


# Load the table of feature names and peek at the first two rows.
path_features = os.path.expanduser("~/data/df_under_features.csv")
df_features = pd.read_csv(path_features)
df_features.head(2)


# The names live in the first column; pull them out as a plain Python list.
first_column = df_features.columns[0]
features = df_features[first_column].tolist()
features[:5], len(features)

(['diabetes_time', 'age', 'male', 'BMI', 'HDL'], 94)


%%time

# Read the scaled train/test feature matrices from their .npz archives.
# np.load on an .npz returns an NpzFile that keeps the underlying file open,
# so open it in a `with` block and pull the 'data' array out before it closes.
p = os.path.expanduser("~/data/X_train_under_scaled.npz")
with np.load(p) as npz:
    X_train_under_scaled = npz['data']

p = os.path.expanduser("~/data/X_test_under_scaled.npz")
with np.load(p) as npz:
    X_test_under_scaled = npz['data']

X_train_under_scaled.shape, X_test_under_scaled.shape

CPU times: user 8.77 ms, sys: 2.32 ms, total: 11.1 ms
Wall time: 18.7 ms

((1126, 94), (282, 94))


# read pandas series
# Read each target CSV and move its "index" column back into the row index.
path_ytrain = os.path.expanduser("~/data/ser_ytrain_under.csv")
ser_ytrain_under = pd.read_csv(path_ytrain).set_index("index")

path_ytest = os.path.expanduser("~/data/ser_ytest_under.csv")
ser_ytest_under = pd.read_csv(path_ytest).set_index("index")

ser_ytrain_under.head()


print(features)

['diabetes_time', 'age', 'male', 'BMI', 'HDL', 'LDL', 'trig', 'SBP', 'DBP', 'hypertension', 'fasting', 'current_smoker', 'ex_smoker', 'exercise', 'healthy_vegetables', 'junk_food', 'total_fiber', 'mtb_1368087', 'mtb_1380093', 'mtb_1812369', 'mtb_1838668', 'mtb_1042362', 'mtb_1091716', 'mtb_1228672', 'mtb_1542487', 'mtb_1272352', 'mtb_1391826', 'mtb_1435571', 'mtb_1521753', 'mtb_1834574', 'mtb_1050860', 'mtb_606773', 'mtb_638620', 'mtb_752773', 'mtb_590255', 'mtb_709794', 'mtb_352255', 'mtb_509192', 'mtb_1230298', 'mtb_841524', 'mtb_1957718', 'mtb_1937123', 'mtb_1940724', 'mtb_18238', 'mtb_18261', 'mtb_18262', 'mtb_18266', 'mtb_18274', 'mtb_18296', 'mtb_18299', 'mtb_18323', 'mtb_18324', 'mtb_18325', 'mtb_18326', 'mtb_18327', 'mtb_18333', 'mtb_18350', 'mtb_18351', 'mtb_18359', 'mtb_18362', 'mtb_18364', 'mtb_18365', 'mtb_18382', 'mtb_18385', 'mtb_18386', 'mtb_18389', 'mtb_18398', 'mtb_18402', 'mtb_18406', 'mtb_18407', 'mtb_18415', 'mtb_18423', 'mtb_18440', 'mtb_18464', 'mtb_18468', 'mtb_18470', 'mtb_18477', 'mtb_18486', 'mtb_18488', 'mtb_18491', 'mtb_18496', 'mtb_18500', 'mtb_18509', 'mtb_18521', 'mtb_18536', 'mtb_18546', 'mtb_18555', 'mtb_18559', 'mtb_18566', 'mtb_18569', 'mtb_18578', 'mtb_18594', 'mtb_18601', 'mtb_18607']


# Biomarker columns are the ones whose name starts with the 'mtb' prefix.
cols_mtb = list(filter(lambda name: name.startswith('mtb'), features))
cols_mtb[:2], len(cols_mtb)

(['mtb_1368087', 'mtb_1380093'], 77)


# Count of non-biomarker features. The cells below use this count as a slice
# start (features[num_exclude:], X[:, num_exclude:]), which relies on every
# 'mtb' feature coming after all the other features — true for the feature
# list printed above, but not checked by the code.
num_exclude = len(features) - len(cols_mtb)
num_exclude

17


len(features[num_exclude:])

77


# Stack the train rows on top of the test rows, keeping only the columns from
# position num_exclude onwards (the biomarker columns).
X = np.concatenate(
    (
        X_train_under_scaled[:, num_exclude:],
        X_test_under_scaled[:, num_exclude:],
    ),
    axis=0,
)
X.shape

(1408, 77)


# Targets in the same train-then-test row order as X (kept as a 2-D column).
y = np.concatenate((ser_ytrain_under.values, ser_ytest_under.values), axis=0)
y.shape

(1408, 1)


features_bio = features[num_exclude:]
features_bio[:5], len(features_bio), X.shape[1]

(['mtb_1368087', 'mtb_1380093', 'mtb_1812369', 'mtb_1838668', 'mtb_1042362'],
 77,
 77)


from sklearn.cluster import KMeans


import plotly.graph_objects as go


# Fit k-means for every candidate k, recording the within-cluster sum of
# squares (inertia_) and drawing one silhouette plot per k.
k_values = list(range(2, 10))
wcss = []

for i in k_values:
    kmeans = KMeans(n_clusters=i,init="k-means++",max_iter=500,n_init=10,random_state=SEED)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    skpmetrics.plot_silhouette(X,kmeans.labels_,title=f"Num of clusters = {i}")


# The x axis must be the same k values the loop iterated over. The previous
# range(1, 10) had one more entry than wcss and labelled every point with a
# cluster count one lower than the model that produced it.
fig = go.Figure(data=go.Scatter(x=k_values, y=wcss))

fig.update_layout(title='Elbow method',
                   xaxis_title='Number of Clusters',
                   yaxis_title='WCSS')

fig['layout']['title_x'] = 0.5  # centre the title
fig.show()


from yellowbrick.cluster import KElbowVisualizer

# Same k-means settings as the manual loop, but without n_clusters: the
# visualizer is handed the k range instead.
# NOTE(review): k=(2, 10) is presumably treated as a half-open range (k = 2..9),
# matching the manual loop — confirm against the yellowbrick docs.
kmeans = KMeans(init="k-means++",max_iter=500,n_init=10,random_state=SEED)
visu = KElbowVisualizer(kmeans, k=(2, 10))
visu.fit(X)
visu.show();


# Number of clusters used for the final model.
n_clusters = 2


# Final k-means model: same hyper-parameters as the search above, fitted on
# all rows; fit_predict returns one cluster label per row of X.
kmeans_params = dict(
    n_clusters=n_clusters,
    init="k-means++",
    max_iter=500,
    n_init=10,
    random_state=SEED,
)
kmeans = KMeans(**kmeans_params)

idx_clusters = kmeans.fit_predict(X)

idx_clusters[:5]

array([0, 0, 0, 0, 0], dtype=int32)


np.bincount(idx_clusters)

array([1322,   86])


collections.Counter(idx_clusters)

Counter({0: 1322, 1: 86})


bp.show_methods(skpmetrics)


# skpmetrics.plot_silhouette?


skpmetrics.plot_silhouette(X,idx_clusters);


idx_clusters[:5]

array([0, 0, 0, 0, 0], dtype=int32)


# From question 2, the two most important biomarkers were
# 'mtb_18470' and 'mtb_606773'; they come first in the list below.
# The remaining entries are further biomarkers of interest — presumably the
# next most important ones from the same analysis (TODO confirm).

imp_feats = ['mtb_18470', 'mtb_606773', 'mtb_1368087',
                      'mtb_509192', 'mtb_1230298', 'mtb_18464',
                      'mtb_1391826', 'mtb_1435571', 'mtb_590255',
                      'mtb_1940724', 'mtb_1228672', 'mtb_18350', 'mtb_18327']


features[num_exclude:].index('mtb_1368087')

0


# Column position of each important feature inside the biomarker-only matrix.
# The slice is taken once instead of once per looked-up name.
bio_names = features[num_exclude:]
idx_imp_feats = list(map(bio_names.index, imp_feats))
idx_imp_feats[:5]

[58, 14, 0, 20, 21]


centroids = kmeans.cluster_centers_
centroids.shape

(2, 77)


# Position of each cluster centre in the plane of the first two important
# features. `centroids` has one row per cluster and one column per feature, so
# cluster k's point is (centroids[k, feat_x], centroids[k, feat_y]).
# The previous version paired cluster 0's and cluster 1's value of the *same*
# feature as one (x, y) point, which is not a centre of either cluster.
feat_x, feat_y = idx_imp_feats[0], idx_imp_feats[1]

center_x0 = centroids[0, feat_x]
center_y0 = centroids[0, feat_y]

center_x1 = centroids[1, feat_x]
center_y1 = centroids[1, feat_y]

center_x0, center_y0

(0.22513945817974862, 0.7682914905100586)


# Scatter of the first two important biomarkers, coloured by k-means label,
# with each cluster's centre overlaid.
ix, iy = idx_imp_feats[0], idx_imp_feats[1]

plt.figure(figsize=(12,8))
plt.scatter(x=X[:,ix],
            y=X[:,iy],
            c=['red' if i == 0 else 'green' for i in idx_clusters])

# Read each centre straight from its own centroid row (row k = cluster k) so
# the marker really is cluster k's (x, y) position in this feature plane,
# independent of how the center_* helper variables were computed.
plt.scatter(centroids[0,ix],centroids[0,iy],color='blue',s=80)
plt.scatter(centroids[1,ix],centroids[1,iy],color='black',s=80)

plt.xlabel(imp_feats[0])
plt.ylabel(imp_feats[1])
plt.title("Clustering of two important features")
plt.show()

print("center of labels 0 (red) is blue and center of labels 1 (green) is black.")

center of labels 0 (red) is blue and center of labels 1 (green) is black.


# Positions (within imp_feats) of the two features to plot against each other.
m,n = 0,2 # important feature index
ix, iy = idx_imp_feats[m], idx_imp_feats[n]

plt.figure(figsize=(12,8))
# Use m and n for the data too, so changing them above changes the whole plot
# (the axes were previously hard-coded to positions 0 and 2).
plt.scatter(x=X[:,ix],
            y=X[:,iy],
            c=['red' if i == 0 else 'green' for i in idx_clusters])

# Cluster k's centre is row k of `centroids`, read at the two plotted feature
# columns. The previous version plotted (cluster 0 value, cluster 1 value) of
# a single feature, which is not a point in this plane.
plt.scatter(centroids[0,ix],centroids[0,iy],color='blue',s=80,alpha=0.5)
plt.scatter(centroids[1,ix],centroids[1,iy],color='black',s=80,alpha=0.5)

plt.xlabel(imp_feats[m])
plt.ylabel(imp_feats[n])
plt.title("Clustering of two important features")
plt.show()


import plotly.figure_factory as ff
import scipy.cluster.hierarchy as sch


# Ward-linkage dendrogram built from every biomarker column.
linkage_all = sch.linkage(X, method='ward')
plt.figure(figsize=(10, 7))
plt.title("Dendrograms")
dend = sch.dendrogram(linkage_all)


# Same dendrogram restricted to the important-feature columns only.
linkage_imp = sch.linkage(X[:, idx_imp_feats], method='ward')
plt.figure(figsize=(10, 7))
plt.title("Dendrograms")
dend = sch.dendrogram(linkage_imp)


# Complete-linkage hierarchy over all biomarker columns, drawn as a dendrogram
# truncated to its top p=5 nodes (truncate_mode="lastp"). This only limits
# what is displayed; no cluster labels are assigned here.
hc_complete = sch.linkage(X, "complete")

plt.figure(figsize=(15, 10))
plt.title("Hierarchical Clustering")
plt.xlabel("Observations")
plt.ylabel("Distance")
sch.dendrogram(hc_complete,
           truncate_mode="lastp",
           p=5,
           show_contracted=True,
           leaf_font_size=10)
plt.show()


sys.path.append(os.path.expanduser("~/Dropbox/a01_Resources/kmeans_interp"))

from kmeans_feature_imp import KMeansInterp


def _fit_and_plot_feature_importances(n_clusters, top_n=15):
    """Fit a KMeansInterp model on X and plot per-cluster feature weights.

    One bar chart is drawn per cluster, showing the first `top_n` entries of
    that cluster's list in ``feature_importances_`` (presumably already sorted
    by descending weight — the chart title calls them the highest weights;
    TODO confirm against kmeans_feature_imp).

    Parameters
    ----------
    n_clusters : int
        Number of clusters to fit.
    top_n : int, default 15
        How many (feature, weight) pairs to show per cluster.

    Returns
    -------
    KMeansInterp
        The fitted model, so callers can read ``labels_`` etc.
    """
    model = KMeansInterp(n_clusters=n_clusters,
                         random_state=SEED,
                         ordered_feature_names=features_bio,
                         feature_importance_method='wcss_min',
                        )
    model.fit(X)

    for cluster_label, feature_weights in model.feature_importances_.items():
        df_feature_weight = pd.DataFrame(feature_weights[:top_n], columns=["Feature", "Weight"])
        fig, ax = plt.subplots(figsize=(14,6))
        # Draw on the axes created above rather than on whatever is current.
        sns.barplot(x="Feature", y="Weight", data=df_feature_weight, ax=ax)
        plt.xticks(rotation=-45, ha="left")
        ax.tick_params(axis='both', which='major', labelsize=22)
        plt.title(f'Highest Weight Features in Cluster {cluster_label}', fontsize='xx-large')
        plt.xlabel('Feature', fontsize=18)
        plt.ylabel('Weight', fontsize=18)

        plt.show()

        print('\n\n')

    return model


# Using 2 clusters
kmeans_int = _fit_and_plot_feature_importances(n_clusters=2)

labels = kmeans_int.labels_


# Using 5 clusters (this rebinds kmeans_int and labels to the 5-cluster model)
kmeans_int = _fit_and_plot_feature_importances(n_clusters=5)

labels = kmeans_int.labels_

	incident_diabetes
index
658	0
1250	1
655	0
809	1
483	0

	0	1	2
0	LabelEncoder	itertools	plot_roc_curve
1	absolute_import	label_binarize	plot_silhouette
2	auc	plot_calibration_curve	precision_recall_curve
3	average_precision_score	plot_confusion_matrix	print_function
4	binary_ks_curve	plot_cumulative_gain	roc_curve
5	calibration_curve	plot_ks_statistic	silhouette_samples
6	confusion_matrix	plot_lift_curve	silhouette_score
7	cumulative_gain_curve	plot_precision_recall	unicode_literals
8	deprecated	plot_precision_recall_curve	unique_labels
9	division	plot_roc	validate_labels
10	interp

Table of Contents

Description

Question 01

Question 02

Question 03

Import the modules

Load the data

Select only biomarkers

Modelling kmeans clustering (no train-test split for unsupervised algos)

find best number of clusters

silhouette score plot

Hierarchical clustering

Feature importance of clustering

Using 2 clusters

Using 5 clusters

	0
0	diabetes_time
1	age