Hints:
Hints:
Hints:
References:
import time
# Record the wall-clock time (time.time()) at which the notebook started running.
time_start_notebook = time.time()
import numpy as np
import pandas as pd
import os,sys,time
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# modules
import missingno as msno
import pandas_profiling
from pandas_profiling import ProfileReport
from tqdm import tqdm
# modelling
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
# statsmodel
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.smpickle import load_pickle
# model evaluation
import scikitplot
from scikitplot import metrics as skpmetrics
import collections
# settings
# Global random seed; passed as random_state to the clustering estimators below.
SEED = 100
# Use the fully qualified option name.  The bare pattern 'max_columns' is
# ambiguous on pandas versions that also register 'styler.render.max_columns'
# and makes set_option raise OptionError.
pd.set_option('display.max_columns', 100)
%matplotlib inline
%load_ext watermark
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark sklearn 0.23.1 statsmodels.api 0.12.2 pandas_profiling 2.11.0 seaborn 0.11.0 pandas 1.3.0 scikitplot 0.3.7 numpy 1.19.5 missingno 0.4.2
# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan")
from bhishan import bp
!ls ~/data
X_test_under_scaled.npz first_10.csv X_train_under_scaled.npz ser_dtype.csv df_under_features.csv ser_ytest_under.csv diabetes_project_dataset.csv ser_ytrain_under.csv diabetes_project_dataset.hdf5
# Load the table whose first column holds the feature names.
path_features = os.path.expanduser("~/data/df_under_features.csv")
df_features = pd.read_csv(path_features)
df_features.head(2)
0 | |
---|---|
0 | diabetes_time |
1 | age |
# The feature names are the values of the first column of df_features.
first_column = df_features.iloc[:, 0]
features = first_column.tolist()
features[:5], len(features)
(['diabetes_time', 'age', 'male', 'BMI', 'HDL'], 94)
%%time
# Read the scaled train/test feature matrices stored as .npz archives.
# np.load on an .npz returns an NpzFile that keeps the underlying file open;
# the with-blocks close it as soon as the 'data' array has been read.
p = os.path.expanduser("~/data/X_train_under_scaled.npz")
with np.load(p) as npz:
    X_train_under_scaled = npz['data']
p = os.path.expanduser("~/data/X_test_under_scaled.npz")
with np.load(p) as npz:
    X_test_under_scaled = npz['data']
X_train_under_scaled.shape, X_test_under_scaled.shape
CPU times: user 8.77 ms, sys: 2.32 ms, total: 11.1 ms Wall time: 18.7 ms
((1126, 94), (282, 94))
# Read the train/test target tables and use their "index" column as the index.
# NOTE: despite the ser_ prefix these are DataFrames (read_csv returns a frame).
ser_ytrain_under = pd.read_csv(
    os.path.expanduser("~/data/ser_ytrain_under.csv"),
    index_col="index",
)
ser_ytest_under = pd.read_csv(
    os.path.expanduser("~/data/ser_ytest_under.csv"),
    index_col="index",
)
ser_ytrain_under.head()
incident_diabetes | |
---|---|
index | |
658 | 0 |
1250 | 1 |
655 | 0 |
809 | 1 |
483 | 0 |
# Print the complete list of feature names.
print(features)
['diabetes_time', 'age', 'male', 'BMI', 'HDL', 'LDL', 'trig', 'SBP', 'DBP', 'hypertension', 'fasting', 'current_smoker', 'ex_smoker', 'exercise', 'healthy_vegetables', 'junk_food', 'total_fiber', 'mtb_1368087', 'mtb_1380093', 'mtb_1812369', 'mtb_1838668', 'mtb_1042362', 'mtb_1091716', 'mtb_1228672', 'mtb_1542487', 'mtb_1272352', 'mtb_1391826', 'mtb_1435571', 'mtb_1521753', 'mtb_1834574', 'mtb_1050860', 'mtb_606773', 'mtb_638620', 'mtb_752773', 'mtb_590255', 'mtb_709794', 'mtb_352255', 'mtb_509192', 'mtb_1230298', 'mtb_841524', 'mtb_1957718', 'mtb_1937123', 'mtb_1940724', 'mtb_18238', 'mtb_18261', 'mtb_18262', 'mtb_18266', 'mtb_18274', 'mtb_18296', 'mtb_18299', 'mtb_18323', 'mtb_18324', 'mtb_18325', 'mtb_18326', 'mtb_18327', 'mtb_18333', 'mtb_18350', 'mtb_18351', 'mtb_18359', 'mtb_18362', 'mtb_18364', 'mtb_18365', 'mtb_18382', 'mtb_18385', 'mtb_18386', 'mtb_18389', 'mtb_18398', 'mtb_18402', 'mtb_18406', 'mtb_18407', 'mtb_18415', 'mtb_18423', 'mtb_18440', 'mtb_18464', 'mtb_18468', 'mtb_18470', 'mtb_18477', 'mtb_18486', 'mtb_18488', 'mtb_18491', 'mtb_18496', 'mtb_18500', 'mtb_18509', 'mtb_18521', 'mtb_18536', 'mtb_18546', 'mtb_18555', 'mtb_18559', 'mtb_18566', 'mtb_18569', 'mtb_18578', 'mtb_18594', 'mtb_18601', 'mtb_18607']
# Keep only the feature names that start with the 'mtb' prefix.
cols_mtb = [name for name in features if name.startswith('mtb')]
cols_mtb[:2], len(cols_mtb)
(['mtb_1368087', 'mtb_1380093'], 77)
# Number of features that are NOT 'mtb' columns.  It is used below as a column
# offset, which relies on all 'mtb' columns coming after every other column
# (the printed feature list is consistent with that ordering).
num_exclude = len(features) - len(cols_mtb)
num_exclude
17
# Sanity check: the tail after the offset should have as many names as cols_mtb.
len(features[num_exclude:])
77
# Stack the train and test rows, keeping only the columns from num_exclude on.
X = np.concatenate(
    (
        X_train_under_scaled[:, num_exclude:],
        X_test_under_scaled[:, num_exclude:],
    ),
    axis=0,
)
X.shape
(1408, 77)
# Stack the train and test targets in the same row order as X.
y = np.concatenate((ser_ytrain_under.values, ser_ytest_under.values), axis=0)
y.shape
(1408, 1)
# Names of the columns kept in X (same slice offset as used to build X).
features_bio = features[num_exclude:]
# Show the first names and confirm the name count equals the column count of X.
features_bio[:5], len(features_bio), X.shape[1]
(['mtb_1368087', 'mtb_1380093', 'mtb_1812369', 'mtb_1838668', 'mtb_1042362'], 77, 77)
from sklearn.cluster import KMeans
import plotly.graph_objects as go
# Elbow method: fit KMeans for every k in k_values, record the within-cluster
# sum of squares (inertia_) and draw a silhouette plot for each fit.
k_values = list(range(2, 10))
wcss = []
for i in k_values:
    kmeans = KMeans(n_clusters=i,init="k-means++",max_iter=500,n_init=10,random_state=SEED)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    skpmetrics.plot_silhouette(X,kmeans.labels_,title=f"Num of clusters = {i}")

# Plot WCSS against the k values that were actually fitted.  The x-axis used to
# be range(1, 10): nine positions for eight WCSS values, which labelled every
# point with a cluster count one lower than the real one.
fig = go.Figure(data=go.Scatter(x=k_values, y=wcss))
fig.update_layout(title='Elbow method',
                  xaxis_title='Number of Clusters',
                  yaxis_title='WCSS')
fig['layout']['title_x'] = 0.5
fig.show()
from yellowbrick.cluster import KElbowVisualizer
# Elbow analysis via yellowbrick's KElbowVisualizer, configured with k=(2, 10).
kmeans = KMeans(
    init="k-means++",
    max_iter=500,
    n_init=10,
    random_state=SEED,
)
visu = KElbowVisualizer(kmeans, k=(2, 10))
visu.fit(X)
visu.show();
# Chosen number of clusters for the final model.
n_clusters = 2
kmeans = KMeans(
    n_clusters=n_clusters,
    init="k-means++",
    max_iter=500,
    n_init=10,
    random_state=SEED,
)
# Fit on X and keep the cluster label assigned to each row.
idx_clusters = kmeans.fit_predict(X)
idx_clusters[:5]
array([0, 0, 0, 0, 0], dtype=int32)
# Number of rows assigned to each cluster label.
np.bincount(idx_clusters)
array([1322, 86])
# Same cluster-size count, shown as a label -> count mapping.
collections.Counter(idx_clusters)
Counter({0: 1322, 1: 86})
# List the names available in scikitplot.metrics (helper from the local bhishan library).
bp.show_methods(skpmetrics)
0 | 1 | 2 | |
---|---|---|---|
0 | LabelEncoder | itertools | plot_roc_curve |
1 | absolute_import | label_binarize | plot_silhouette |
2 | auc | plot_calibration_curve | precision_recall_curve |
3 | average_precision_score | plot_confusion_matrix | print_function |
4 | binary_ks_curve | plot_cumulative_gain | roc_curve |
5 | calibration_curve | plot_ks_statistic | silhouette_samples |
6 | confusion_matrix | plot_lift_curve | silhouette_score |
7 | cumulative_gain_curve | plot_precision_recall | unicode_literals |
8 | deprecated | plot_precision_recall_curve | unique_labels |
9 | division | plot_roc | validate_labels |
10 | interp |
# skpmetrics.plot_silhouette?
# Silhouette plot for the cluster labels produced by the final KMeans fit.
skpmetrics.plot_silhouette(X,idx_clusters);
idx_clusters[:5]
array([0, 0, 0, 0, 0], dtype=int32)
# From question 2, the two most important biomarkers were
# 'mtb_18470' and 'mtb_606773'; they come first in the list below,
# followed by further biomarkers of interest.
imp_feats = ['mtb_18470', 'mtb_606773', 'mtb_1368087',
'mtb_509192', 'mtb_1230298', 'mtb_18464',
'mtb_1391826', 'mtb_1435571', 'mtb_590255',
'mtb_1940724', 'mtb_1228672', 'mtb_18350', 'mtb_18327']
# Position of one biomarker inside the sliced feature list (i.e. its column in X).
features[num_exclude:].index('mtb_1368087')
0
# Column position in X of every important biomarker.  features_bio is the same
# features[num_exclude:] slice, built once earlier in the notebook.
idx_imp_feats = [features_bio.index(name) for name in imp_feats]
idx_imp_feats[:5]
[58, 14, 0, 20, 21]
# Cluster centres of the fitted KMeans model: one row per cluster, one column per feature.
centroids = kmeans.cluster_centers_
centroids.shape
(2, 77)
# Rows of `centroids` are clusters and columns are features, so a cluster's
# position in the (imp_feats[0], imp_feats[1]) plane takes BOTH coordinates
# from the same row.  Previously each pair mixed row 0 and row 1 of a single
# feature, which is not a point in that plane.
center_x0 = centroids[0,idx_imp_feats[0]]  # cluster 0, x = imp_feats[0]
center_y0 = centroids[0,idx_imp_feats[1]]  # cluster 0, y = imp_feats[1]
center_x1 = centroids[1,idx_imp_feats[0]]  # cluster 1, x = imp_feats[0]
center_y1 = centroids[1,idx_imp_feats[1]]  # cluster 1, y = imp_feats[1]
center_x0, center_y0
(0.22513945817974862, 0.7682914905100586)
# Scatter of the two most important biomarkers, coloured by cluster label
# (label 0 -> red, anything else -> green).
plt.figure(figsize=(12,8))
plt.scatter(x=X[:,idx_imp_feats[0]],
            y=X[:,idx_imp_feats[1]],
            c=['red' if i == 0 else 'green' for i in idx_clusters])
# Draw each centroid at its own (x, y) in this plane: both coordinates are read
# from the SAME row of `centroids` (row = cluster), one per plotted feature.
# Reading them here keeps the markers correct regardless of how the center_*
# helper variables were computed.
plt.scatter(centroids[0,idx_imp_feats[0]],centroids[0,idx_imp_feats[1]],color='blue',s=80)
plt.scatter(centroids[1,idx_imp_feats[0]],centroids[1,idx_imp_feats[1]],color='black',s=80)
plt.xlabel(imp_feats[0])
plt.ylabel(imp_feats[1])
plt.title("Clustering of two important features")
plt.show()
print("center of labels 0 (red) is blue and center of labels 1 (green) is black.")
center of labels 0 (red) is blue and center of labels 1 (green) is black.
m,n = 0,2 # positions in imp_feats of the two biomarkers to plot
# Resolve both positions to column indices of X once, and use them everywhere
# so the data, the centroids and the axis labels always refer to the same pair.
col_x, col_y = idx_imp_feats[m], idx_imp_feats[n]
plt.figure(figsize=(12,8))
plt.scatter(x=X[:,col_x],
            y=X[:,col_y],
            c=['red' if i == 0 else 'green' for i in idx_clusters])
# One marker per cluster: both coordinates come from the same row of
# `centroids` (row = cluster).  The earlier version paired row 0 with row 1 of
# a single feature, which does not correspond to any centroid in this plane.
plt.scatter(centroids[0,col_x],centroids[0,col_y],color='blue',s=80,alpha=0.5)
plt.scatter(centroids[1,col_x],centroids[1,col_y],color='black',s=80,alpha=0.5)
plt.xlabel(imp_feats[m])
plt.ylabel(imp_feats[n])
plt.title("Clustering of two important features")
plt.show()
import plotly.figure_factory as ff
import scipy.cluster.hierarchy as sch
# Dendrogram of a Ward-linkage hierarchical clustering over every column of X.
plt.figure(figsize=(10, 7))
plt.title("Dendrograms")
linkage_ward_all = sch.linkage(X, method='ward')
dend = sch.dendrogram(linkage_ward_all)
# Same Ward-linkage dendrogram, restricted to the important-biomarker columns.
plt.figure(figsize=(10, 7))
plt.title("Dendrograms")
linkage_ward_imp = sch.linkage(X[:, idx_imp_feats], method='ward')
dend = sch.dendrogram(linkage_ward_imp)
# Complete-linkage hierarchy over X, drawn as a dendrogram truncated to the
# last p=5 merged clusters.
hc_complete = sch.linkage(X, "complete")

plt.figure(figsize=(15, 10))
plt.title("Hierarchical Clustering")
plt.xlabel("Observations")
plt.ylabel("Distance")
sch.dendrogram(
    hc_complete,
    truncate_mode="lastp",
    p=5,
    show_contracted=True,
    leaf_font_size=10,
)
plt.show()
sys.path.append(os.path.expanduser("~/Dropbox/a01_Resources/kmeans_interp"))
from kmeans_feature_imp import KMeansInterp
# Fit the interpretable KMeans wrapper with 2 clusters and plot, for each
# cluster, its 15 highest-weighted features.
kmeans_int = KMeansInterp(
    n_clusters=2,
    random_state=SEED,
    ordered_feature_names=features_bio,
    feature_importance_method='wcss_min',
)
kmeans_int.fit(X)
labels = kmeans_int.labels_

for cluster_label, feature_weights in kmeans_int.feature_importances_.items():
    # Only the first 15 (feature, weight) pairs are plotted.
    top_weights = feature_weights[:15]
    df_feature_weight = pd.DataFrame(top_weights, columns=["Feature", "Weight"])

    fig, ax = plt.subplots(figsize=(14,6))
    sns.barplot(x="Feature", y="Weight", data=df_feature_weight, ax=ax)
    plt.xticks(rotation=-45, ha="left");
    ax.tick_params(axis='both', which='major', labelsize=22)
    ax.set_title(f'Highest Weight Features in Cluster {cluster_label}', fontsize='xx-large')
    ax.set_xlabel('Feature', fontsize=18)
    ax.set_ylabel('Weight', fontsize=18)
    plt.show();
    print('\n\n')
# Repeat the interpretable KMeans analysis with 5 clusters and plot, for each
# cluster, its 15 highest-weighted features.
kmeans_int = KMeansInterp(
    n_clusters=5,
    random_state=SEED,
    ordered_feature_names=features_bio,
    feature_importance_method='wcss_min',
)
kmeans_int.fit(X)
labels = kmeans_int.labels_

for cluster_label, feature_weights in kmeans_int.feature_importances_.items():
    # Only the first 15 (feature, weight) pairs are plotted.
    top_weights = feature_weights[:15]
    df_feature_weight = pd.DataFrame(top_weights, columns=["Feature", "Weight"])

    fig, ax = plt.subplots(figsize=(14,6))
    sns.barplot(x="Feature", y="Weight", data=df_feature_weight, ax=ax)
    plt.xticks(rotation=-45, ha="left");
    ax.tick_params(axis='both', which='major', labelsize=22)
    ax.set_title(f'Highest Weight Features in Cluster {cluster_label}', fontsize='xx-large')
    ax.set_xlabel('Feature', fontsize=18)
    ax.set_ylabel('Weight', fontsize=18)
    plt.show();
    print('\n\n')