import time
time_start_notebook = time.time()
import numpy as np
import pandas as pd
import os,sys,time
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import pandas_profiling
# settings
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth', 500)
# modelling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# visualization
import plotly_express as px
import scikitplot.metrics as skpmetrics
from sklearn.decomposition import PCA
%matplotlib inline
%load_ext watermark
%watermark -iv
seaborn 0.11.0 autopep8 1.5.2 json 2.0.9 plotly_express 0.4.1 pandas_profiling 2.11.0 numpy 1.21.5 pandas 1.3.4
# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/")
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan")
from bhishan import bp
!head -n 5 data/ds2.csv
df2 = pd.read_csv('data/ds2.csv',index_col=0)
df2.head(2).append(df2.tail(2))
X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | |
---|---|---|---|---|---|---|---|---|---|---|
1 | 23.778224 | 13.319974 | 15.565124 | -3.713626 | 7.296793 | -19.371013 | -0.894130 | -6.110282 | -28.959316 | 2.851336 |
2 | 16.602950 | 23.311281 | 21.099052 | -0.304154 | -3.218990 | 2.357643 | 12.027277 | 7.070349 | -5.762185 | -23.050198 |
1999 | -10.355833 | 15.105070 | -5.705684 | 7.196082 | 0.849879 | 19.485150 | 11.989341 | 26.697433 | -0.763111 | -5.759242 |
2000 | 16.033693 | 14.308373 | 12.222013 | -9.343368 | -2.822996 | -15.729261 | 6.376017 | -6.680791 | -14.146463 | -2.226976 |
df2.bp.describe()
Type | N | Count | Unique | MissingPct | ZerosPct | OnesPct | Missing | Zeros | Ones | min | max | mean | std | 25% | 50% | 75% | Feature2 | smallest5 | largest5 | first5 | last5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Feature | ||||||||||||||||||||||
X1 | float64 | 2,000 | 2,000 | 2,000 | 0.00% | 0.00% | 0.00% | 0 | 0 | 0 | -25.82 | 32.27 | 8.68 | 11.97 | 0.23 | 12.75 | 17.36 | X1 | [-25.82419907, -24.37879263, -23.89911608, -23.66685647, -23.08376372] | [32.26857023, 31.12255512, 30.19006047, 29.85020973, 29.77035767] | [23.77822414, 16.60295038, 12.08468269, 13.04453404, 8.314115245] | [7.938971333, 12.29648557, 9.958680838, -10.35583339, 16.03369349] |
X2 | float64 | 2,000 | 2,000 | 2,000 | 0.00% | 0.00% | 0.00% | 0 | 0 | 0 | -8.50 | 32.91 | 11.72 | 6.66 | 7.16 | 11.90 | 16.28 | X2 | [-8.497561602, -6.855363588, -6.751928113, -6.705537319, -6.54759643] | [32.90991722, 31.30134697, 30.32065178, 29.96522989, 29.55579554] | [13.31997449, 23.31128056, 19.71044344, 10.74903959, 6.748794241] | [14.41503188, 9.133867169, 2.660312782, 15.10506998, 14.30837267] |
X3 | float64 | 2,000 | 2,000 | 2,000 | 0.00% | 0.00% | 0.00% | 0 | 0 | 0 | -23.67 | 31.23 | 9.25 | 9.86 | 2.65 | 11.42 | 16.50 | X3 | [-23.66643904, -20.75344786, -17.75174619, -16.45316054, -16.26374389] | [31.23055042, 30.69793197, 28.96127037, 28.79428566, 28.79169463] | [15.56512378, 21.09905242, 9.837101706, 5.884406563, 5.38853505] | [10.41080529, 6.703856472, 20.83385608, -5.705684287, 12.22201313] |
X4 | float64 | 2,000 | 2,000 | 2,000 | 0.00% | 0.00% | 0.00% | 0 | 0 | 0 | -29.43 | 26.42 | -2.68 | 10.63 | -10.65 | -2.63 | 5.34 | X4 | [-29.42965545, -27.94150064, -27.69128769, -27.34122324, -27.20449902] | [26.42279821, 24.28815641, 23.8585608, 23.56018018, 22.96591578] | [-3.713626311, -0.30415371, -1.081917765, -11.70352522, -0.000289643] | [0.596848467, 3.074667502, -13.24425303, 7.19608166, -9.343367904] |
X5 | float64 | 2,000 | 2,000 | 2,000 | 0.00% | 0.00% | 0.00% | 0 | 0 | 0 | -22.03 | 29.31 | 2.77 | 8.80 | -4.10 | 2.48 | 9.66 | X5 | [-22.03332927, -20.68389731, -17.63413688, -17.0258761, -16.64291807] | [29.31201017, 27.34896054, 25.87985344, 25.78957145, 25.5320661] | [7.296793243, -3.218989513, -1.201942452, -4.13435848, -4.724787417] | [0.754864377, -4.969635466, 19.29171982, 0.849879169, -2.822995904] |
X6 | float64 | 2,000 | 2,000 | 2,000 | 0.00% | 0.00% | 0.00% | 0 | 0 | 0 | -35.26 | 31.73 | 0.08 | 15.38 | -14.00 | 1.50 | 14.05 | X6 | [-35.26401872, -31.90162658, -29.91583821, -29.56397555, -29.52522987] | [31.72704204, 31.51417444, 31.01172321, 30.20981047, 29.72397965] | [-19.37101306, 2.357643428, 9.738018678, -22.3446658, -16.3468122] | [-22.31866776, 19.24915693, -9.732882147, 19.48514959, -15.72926074] |
X7 | float64 | 2,000 | 2,000 | 2,000 | 0.00% | 0.00% | 0.00% | 0 | 0 | 0 | -21.43 | 32.08 | 8.20 | 10.43 | -0.81 | 8.53 | 17.14 | X7 | [-21.42853758, -16.52825669, -16.32840346, -15.71479108, -15.49190401] | [32.08429705, 31.06235839, 30.32677104, 30.25014934, 29.93066368] | [-0.894129725, 12.02727714, 16.1259204, -1.26334898, 3.293600093] | [1.326245718, 13.64344771, -3.350363874, 11.98934128, 6.376017442] |
X8 | float64 | 2,000 | 2,000 | 2,000 | 0.00% | 0.00% | 0.00% | 0 | 0 | 0 | -16.81 | 36.85 | 8.71 | 9.88 | 1.48 | 9.63 | 16.08 | X8 | [-16.81114581, -16.45460245, -16.44206238, -15.68374798, -15.50747378] | [36.84792164, 36.3199105, 34.56662439, 34.23336165, 32.29530674] | [-6.110282234, 7.070348879, 19.11939125, 0.493710809, -10.84827261] | [-5.919270185, 8.458499608, 3.186218322, 26.69743265, -6.680790611] |
X9 | float64 | 2,000 | 2,000 | 2,000 | 0.00% | 0.00% | 0.00% | 0 | 0 | 0 | -36.07 | 13.55 | -12.86 | 8.88 | -19.43 | -14.42 | -6.53 | X9 | [-36.06514959, -33.89000207, -32.12712241, -31.70787312, -31.19797019] | [13.55370484, 12.38259851, 11.98014196, 11.97818464, 11.16452922] | [-28.95931643, -5.762184567, -15.58212228, -15.3053474, -17.28549147] | [-14.14123269, -18.05844344, -18.59882188, -0.763110534, -14.14646294] |
X10 | float64 | 2,000 | 2,000 | 2,000 | 0.00% | 0.00% | 0.00% | 0 | 0 | 0 | -36.47 | 32.64 | -1.34 | 14.53 | -13.22 | -2.09 | 10.56 | X10 | [-36.46808278, -32.9266076, -32.29774236, -31.92839559, -30.74190331] | [32.64178858, 32.14936889, 31.75232344, 29.81904268, 29.29564822] | [2.851335711, -23.05019771, -12.29253529, 6.799086974, 6.034213769] | [2.284455017, -22.98328057, 20.42371636, -5.759242305, -2.22697571] |
profile = pandas_profiling.ProfileReport(df2)
profile.to_file('html/ds2_statistics_summary.html')
profile
df2.bp.plot_corr()
notes = """
- NO missing values
- NO duplicate rows
- Some variables are correlated e.g. x1 is positively correlated with x3 (+0.74)
and x10 is negatively correlated with x7 (-0.77).
feature distributions:
bi-modal like : x1 x3 x6 x7
gaussian-like : x2 x4 x5
non-gaussian : x8 x9 x10
""";
References:
df2.head(2)
X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | |
---|---|---|---|---|---|---|---|---|---|---|
1 | 23.778224 | 13.319974 | 15.565124 | -3.713626 | 7.296793 | -19.371013 | -0.894130 | -6.110282 | -28.959316 | 2.851336 |
2 | 16.602950 | 23.311281 | 21.099052 | -0.304154 | -3.218990 | 2.357643 | 12.027277 | 7.070349 | -5.762185 | -23.050198 |
df2.bp.plot_num_num('X1','X2')
features = list(df2.columns)
fig = px.scatter_matrix(df2,dimensions=features)
fig.update_traces(diagonal_visible=False)
fig['layout']['width'] = 1400
fig['layout']['height'] = 1400
fig.show()
# normalize the data before doing pca
X = df2.copy()
scaler = StandardScaler()
X = scaler.fit_transform(X)
X[:2,:]
array([[ 1.26164299, 0.24094577, 0.64025978, -0.09733976, 0.51394301, -1.26522949, -0.87255599, -1.50047008, -1.81321481, 0.28840666], [ 0.66214652, 1.74256925, 1.20156839, 0.22362707, -0.68125609, 0.148326 , 0.36709589, -0.16626457, 0.79942637, -1.49415533]])
pca = PCA(n_components=3)
X_pca3 = pca.fit_transform(X)
total_var = pca.explained_variance_ratio_.sum() * 100
fig = px.scatter(
X_pca3, x=0, y=1,
title=f'Total Explained Variance: {total_var:.2f}%',
labels={'0': 'PC 1', '1': 'PC 2'}
)
fig.show()
note = """
We can see 4 different clusters using just two principal components.
"""
pca = PCA(n_components=3)
X_pca3 = pca.fit_transform(X)
total_var = pca.explained_variance_ratio_.sum() * 100
fig = px.scatter_3d(
X_pca3, x=0, y=1, z=2,
title=f'Total Explained Variance: {total_var:.2f}%',
labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'}
)
fig.show()
note = """
3d is less easier to see the clusters.
We can still see 4 clusters.
"""
# Plotting explained variance
pca = PCA()
pca.fit(X)
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)
px.area(
x=range(1, exp_var_cumul.shape[0] + 1),
y=exp_var_cumul,
labels={"x": "# Components", "y": "Explained Variance"}
)
note = """
When we use 4 compoents, it explains 87% of the total variance in the dataset
and after that explained variance increases very slowy. We can choose n_compoents=4
for our analysis.
"""
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import scikitplot.metrics as skpmetrics
scaler = StandardScaler()
X = scaler.fit_transform(df2)
X.std(axis=0)
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
# within-cluster sum of squares
wcss = {}
sils = {}
for i in range(2, 8):
kmeans = KMeans(n_clusters=i,init="k-means++",max_iter=500,n_init=10,random_state=SEED)
kmeans.fit(X)
# wcss
w = kmeans.inertia_
wcss[i] = w
# silhoutte score
s = silhouette_score(X, kmeans.labels_, metric='euclidean')
sils[i] = s
title = f"Num of clusters = {i}, WCSS = {w:,.0f} Silhouette Score = {s:.4f}"
skpmetrics.plot_silhouette(X,kmeans.labels_,title=title)
ax = pd.Series(wcss).rename('components').to_frame('wcss').plot.bar()
bp.add_text_barplot(ax,fmt="{:,.0f}")
ax.set_xlabel('Number of components')
ax.set_ylabel("WCSS")
plt.title('Elbow Method');
ax = pd.Series(sils).rename('components').to_frame('silhouette_score').plot.bar()
bp.add_text_barplot(ax,fmt="{:.4f}")
ax.set_xlabel('Number of components')
ax.set_ylabel("silhouette_score")
plt.tight_layout()
note = """
We will use n_components = 4 for the analysis.
"""
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters,random_state=SEED,init='k-means++')
kmeans_labels = kmeans.fit_predict(X)
pd.Series(kmeans_labels).value_counts()
1 510 0 510 3 491 2 489 dtype: int64
df22 = df2.copy()
df22['Cluster']=kmeans_labels
df22.head(2)
X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | Cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|
1 | 23.778224 | 13.319974 | 15.565124 | -3.713626 | 7.296793 | -19.371013 | -0.894130 | -6.110282 | -28.959316 | 2.851336 | 1 |
2 | 16.602950 | 23.311281 | 21.099052 | -0.304154 | -3.218990 | 2.357643 | 12.027277 | 7.070349 | -5.762185 | -23.050198 | 2 |
for feature in features:
grid= sns.FacetGrid(df22, col='Cluster',hue='Cluster')
grid.map(plt.hist, feature)
centroids = kmeans.cluster_centers_
pca_2 = PCA(n_components=2)
pca_2_result = pca_2.fit(X)
centroids_pca = pca_2.transform(centroids)
X.shape, centroids_pca.shape, centroids_pca # 4 clusters and 4 cetroids
((2000, 10), (4, 2), array([[ 3.19511363, 0.97178709], [-1.77987661, -1.01179325], [ 0.9048578 , -1.52019541], [-2.37117382, 1.55555743]]))
# use each cluster centroid to define the cluster characteristics
df2_centroids = pd.DataFrame(data=scaler.inverse_transform(centroids),
columns=df2.columns)
df2_centroids.head()
X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | -9.712850 | 16.744653 | -4.848168 | 10.163147 | 6.201955 | 16.198990 | 16.227576 | 19.646245 | -1.272241 | -7.630330 |
1 | 15.120425 | 10.638894 | 11.000549 | -5.586927 | -5.583719 | -18.456556 | 1.620782 | -4.188975 | -18.836370 | 3.444770 |
2 | 17.024321 | 14.365331 | 14.990692 | 0.006055 | -2.236760 | 12.068201 | 18.013695 | 10.415960 | -12.135521 | -19.262561 |
3 | 12.775766 | 4.976265 | 16.369598 | -15.674341 | 12.888714 | -9.357903 | -3.074523 | 9.061394 | -19.410610 | 18.075984 |
# we can use pca for visualization
pca = PCA(n_components=2)
X_pca2 = pca.fit_transform(X)
X_pca2[:5]
array([[-2.54928204, -0.73283501], [ 0.91067867, -2.03319601], [ 1.24618523, -0.86629786], [-1.75156251, -0.31326997], [-1.49226908, -0.71042546]])
centroids_pca
array([[ 3.19511363, 0.97178709], [-1.77987661, -1.01179325], [ 0.9048578 , -1.52019541], [-2.37117382, 1.55555743]])
plt.scatter(X_pca2[:, 0],X_pca2[:, 1],c=kmeans_labels,cmap='plasma')
plt.scatter(centroids_pca[:, 0], centroids_pca[:, 1],
marker='x', s=150,
color='black', zorder=10,lw=3)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA visualization',fontweight='bold')
plt.show()
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 0 min 55 secs