import time
time_start_notebook = time.time()

import numpy as np
import pandas as pd
import os,sys,time

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import pandas_profiling

# settings
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth', 500)

# modelling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# visualization
import plotly_express as px
import scikitplot.metrics as skpmetrics
from sklearn.decomposition import PCA


%matplotlib inline
%load_ext watermark
%watermark -iv

seaborn          0.11.0
autopep8         1.5.2
json             2.0.9
plotly_express   0.4.1
pandas_profiling 2.11.0
numpy            1.21.5
pandas           1.3.4


# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/")
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan")
from bhishan import bp


!head -n 5 data/ds2.csv


df2 = pd.read_csv('data/ds2.csv',index_col=0)
df2.head(2).append(df2.tail(2))


df2.bp.describe()


profile = pandas_profiling.ProfileReport(df2)
profile.to_file('html/ds2_statistics_summary.html')
profile


df2.bp.plot_corr()


notes = """
- NO missing values
- NO duplicate rows
- Some variables are correlated e.g. x1 is positively correlated with x3 (+0.74)
  and x10 is negatively correlated with x7 (-0.77).
  
feature distributions:  
bi-modal like : x1 x3 x6 x7
gaussian-like : x2 x4 x5
non-gaussian  : x8 x9 x10

""";


df2.head(2)


df2.bp.plot_num_num('X1','X2')


features = list(df2.columns)

fig = px.scatter_matrix(df2,dimensions=features)
fig.update_traces(diagonal_visible=False)
fig['layout']['width'] = 1400
fig['layout']['height'] = 1400
fig.show()


# normalize the data before doing pca
X = df2.copy()
scaler = StandardScaler()
X = scaler.fit_transform(X)
X[:2,:]

array([[ 1.26164299,  0.24094577,  0.64025978, -0.09733976,  0.51394301,
        -1.26522949, -0.87255599, -1.50047008, -1.81321481,  0.28840666],
       [ 0.66214652,  1.74256925,  1.20156839,  0.22362707, -0.68125609,
         0.148326  ,  0.36709589, -0.16626457,  0.79942637, -1.49415533]])


pca = PCA(n_components=3)
X_pca3 = pca.fit_transform(X)

total_var = pca.explained_variance_ratio_.sum() * 100

fig = px.scatter(
    X_pca3, x=0, y=1,
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2'}
)
fig.show()


note = """
We can see 4 different clusters using just two principal components.
"""


pca = PCA(n_components=3)
X_pca3 = pca.fit_transform(X)

total_var = pca.explained_variance_ratio_.sum() * 100

fig = px.scatter_3d(
    X_pca3, x=0, y=1, z=2,
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'}
)
fig.show()


note = """
3d is less easier to see the clusters.
We can still see 4 clusters.
"""


# Plotting explained variance
pca = PCA()
pca.fit(X)
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)

px.area(
    x=range(1, exp_var_cumul.shape[0] + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"}
)


note = """
When we use 4 compoents, it explains 87% of the total variance in the dataset
and after that explained variance increases very slowy. We can choose n_compoents=4
for our analysis.
"""


from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import scikitplot.metrics as skpmetrics


scaler = StandardScaler()
X = scaler.fit_transform(df2)

X.std(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])


# within-cluster sum of squares
wcss = {}
sils = {}
for i in range(2, 8):
    kmeans = KMeans(n_clusters=i,init="k-means++",max_iter=500,n_init=10,random_state=SEED)
    kmeans.fit(X)
    
    # wcss
    w = kmeans.inertia_
    wcss[i] = w
    
    # silhoutte score
    s = silhouette_score(X, kmeans.labels_, metric='euclidean')
    sils[i] = s
    
    title = f"Num of clusters = {i}, WCSS = {w:,.0f} Silhouette Score = {s:.4f}"
    skpmetrics.plot_silhouette(X,kmeans.labels_,title=title)


ax = pd.Series(wcss).rename('components').to_frame('wcss').plot.bar()
bp.add_text_barplot(ax,fmt="{:,.0f}")
ax.set_xlabel('Number of components')
ax.set_ylabel("WCSS")
plt.title('Elbow Method');


ax = pd.Series(sils).rename('components').to_frame('silhouette_score').plot.bar()
bp.add_text_barplot(ax,fmt="{:.4f}")
ax.set_xlabel('Number of components')
ax.set_ylabel("silhouette_score")
plt.tight_layout()


note = """
We will use n_components = 4 for the analysis.

"""


n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters,random_state=SEED,init='k-means++')
kmeans_labels = kmeans.fit_predict(X)
pd.Series(kmeans_labels).value_counts()

1    510
0    510
3    491
2    489
dtype: int64


df22 = df2.copy()
df22['Cluster']=kmeans_labels

df22.head(2)


for feature in features:
    grid= sns.FacetGrid(df22, col='Cluster',hue='Cluster')
    grid.map(plt.hist, feature)


centroids = kmeans.cluster_centers_

pca_2 = PCA(n_components=2)
pca_2_result = pca_2.fit(X)
centroids_pca = pca_2.transform(centroids)

X.shape, centroids_pca.shape, centroids_pca # 4 clusters and 4 cetroids

((2000, 10),
 (4, 2),
 array([[ 3.19511363,  0.97178709],
        [-1.77987661, -1.01179325],
        [ 0.9048578 , -1.52019541],
        [-2.37117382,  1.55555743]]))


# use each cluster centroid to define the cluster characteristics
df2_centroids = pd.DataFrame(data=scaler.inverse_transform(centroids),
                             columns=df2.columns)
df2_centroids.head()


# we can use pca for visualization


pca = PCA(n_components=2)
X_pca2 = pca.fit_transform(X)

X_pca2[:5]

array([[-2.54928204, -0.73283501],
       [ 0.91067867, -2.03319601],
       [ 1.24618523, -0.86629786],
       [-1.75156251, -0.31326997],
       [-1.49226908, -0.71042546]])


centroids_pca

array([[ 3.19511363,  0.97178709],
       [-1.77987661, -1.01179325],
       [ 0.9048578 , -1.52019541],
       [-2.37117382,  1.55555743]])


plt.scatter(X_pca2[:, 0],X_pca2[:, 1],c=kmeans_labels,cmap='plasma')

plt.scatter(centroids_pca[:, 0], centroids_pca[:, 1],
            marker='x', s=150,
            color='black', zorder=10,lw=3)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA visualization',fontweight='bold')
plt.show()


time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))

Time taken to run whole notebook: 0 hr 0 min 55 secs

	X1	X2	X3	X4	X5	X6	X7	X8	X9	X10
1	23.778224	13.319974	15.565124	-3.713626	7.296793	-19.371013	-0.894130	-6.110282	-28.959316	2.851336
2	16.602950	23.311281	21.099052	-0.304154	-3.218990	2.357643	12.027277	7.070349	-5.762185	-23.050198
1999	-10.355833	15.105070	-5.705684	7.196082	0.849879	19.485150	11.989341	26.697433	-0.763111	-5.759242
2000	16.033693	14.308373	12.222013	-9.343368	-2.822996	-15.729261	6.376017	-6.680791	-14.146463	-2.226976

	Type	N	Count	Unique	MissingPct	ZerosPct	OnesPct	Missing	Zeros	Ones	min	max	mean	std	25%	50%	75%	Feature2	smallest5	largest5	first5	last5
Feature
X1	float64	2,000	2,000	2,000	0.00%	0.00%	0.00%	0	0	0	-25.82	32.27	8.68	11.97	0.23	12.75	17.36	X1	[-25.82419907, -24.37879263, -23.89911608, -23.66685647, -23.08376372]	[32.26857023, 31.12255512, 30.19006047, 29.85020973, 29.77035767]	[23.77822414, 16.60295038, 12.08468269, 13.04453404, 8.314115245]	[7.938971333, 12.29648557, 9.958680838, -10.35583339, 16.03369349]
X2	float64	2,000	2,000	2,000	0.00%	0.00%	0.00%	0	0	0	-8.50	32.91	11.72	6.66	7.16	11.90	16.28	X2	[-8.497561602, -6.855363588, -6.751928113, -6.705537319, -6.54759643]	[32.90991722, 31.30134697, 30.32065178, 29.96522989, 29.55579554]	[13.31997449, 23.31128056, 19.71044344, 10.74903959, 6.748794241]	[14.41503188, 9.133867169, 2.660312782, 15.10506998, 14.30837267]
X3	float64	2,000	2,000	2,000	0.00%	0.00%	0.00%	0	0	0	-23.67	31.23	9.25	9.86	2.65	11.42	16.50	X3	[-23.66643904, -20.75344786, -17.75174619, -16.45316054, -16.26374389]	[31.23055042, 30.69793197, 28.96127037, 28.79428566, 28.79169463]	[15.56512378, 21.09905242, 9.837101706, 5.884406563, 5.38853505]	[10.41080529, 6.703856472, 20.83385608, -5.705684287, 12.22201313]
X4	float64	2,000	2,000	2,000	0.00%	0.00%	0.00%	0	0	0	-29.43	26.42	-2.68	10.63	-10.65	-2.63	5.34	X4	[-29.42965545, -27.94150064, -27.69128769, -27.34122324, -27.20449902]	[26.42279821, 24.28815641, 23.8585608, 23.56018018, 22.96591578]	[-3.713626311, -0.30415371, -1.081917765, -11.70352522, -0.000289643]	[0.596848467, 3.074667502, -13.24425303, 7.19608166, -9.343367904]
X5	float64	2,000	2,000	2,000	0.00%	0.00%	0.00%	0	0	0	-22.03	29.31	2.77	8.80	-4.10	2.48	9.66	X5	[-22.03332927, -20.68389731, -17.63413688, -17.0258761, -16.64291807]	[29.31201017, 27.34896054, 25.87985344, 25.78957145, 25.5320661]	[7.296793243, -3.218989513, -1.201942452, -4.13435848, -4.724787417]	[0.754864377, -4.969635466, 19.29171982, 0.849879169, -2.822995904]
X6	float64	2,000	2,000	2,000	0.00%	0.00%	0.00%	0	0	0	-35.26	31.73	0.08	15.38	-14.00	1.50	14.05	X6	[-35.26401872, -31.90162658, -29.91583821, -29.56397555, -29.52522987]	[31.72704204, 31.51417444, 31.01172321, 30.20981047, 29.72397965]	[-19.37101306, 2.357643428, 9.738018678, -22.3446658, -16.3468122]	[-22.31866776, 19.24915693, -9.732882147, 19.48514959, -15.72926074]
X7	float64	2,000	2,000	2,000	0.00%	0.00%	0.00%	0	0	0	-21.43	32.08	8.20	10.43	-0.81	8.53	17.14	X7	[-21.42853758, -16.52825669, -16.32840346, -15.71479108, -15.49190401]	[32.08429705, 31.06235839, 30.32677104, 30.25014934, 29.93066368]	[-0.894129725, 12.02727714, 16.1259204, -1.26334898, 3.293600093]	[1.326245718, 13.64344771, -3.350363874, 11.98934128, 6.376017442]
X8	float64	2,000	2,000	2,000	0.00%	0.00%	0.00%	0	0	0	-16.81	36.85	8.71	9.88	1.48	9.63	16.08	X8	[-16.81114581, -16.45460245, -16.44206238, -15.68374798, -15.50747378]	[36.84792164, 36.3199105, 34.56662439, 34.23336165, 32.29530674]	[-6.110282234, 7.070348879, 19.11939125, 0.493710809, -10.84827261]	[-5.919270185, 8.458499608, 3.186218322, 26.69743265, -6.680790611]
X9	float64	2,000	2,000	2,000	0.00%	0.00%	0.00%	0	0	0	-36.07	13.55	-12.86	8.88	-19.43	-14.42	-6.53	X9	[-36.06514959, -33.89000207, -32.12712241, -31.70787312, -31.19797019]	[13.55370484, 12.38259851, 11.98014196, 11.97818464, 11.16452922]	[-28.95931643, -5.762184567, -15.58212228, -15.3053474, -17.28549147]	[-14.14123269, -18.05844344, -18.59882188, -0.763110534, -14.14646294]
X10	float64	2,000	2,000	2,000	0.00%	0.00%	0.00%	0	0	0	-36.47	32.64	-1.34	14.53	-13.22	-2.09	10.56	X10	[-36.46808278, -32.9266076, -32.29774236, -31.92839559, -30.74190331]	[32.64178858, 32.14936889, 31.75232344, 29.81904268, 29.29564822]	[2.851335711, -23.05019771, -12.29253529, 6.799086974, 6.034213769]	[2.284455017, -22.98328057, 20.42371636, -5.759242305, -2.22697571]

	X1	X2	X3	X4	X5	X6	X7	X8	X9	X10
1	23.778224	13.319974	15.565124	-3.713626	7.296793	-19.371013	-0.894130	-6.110282	-28.959316	2.851336
2	16.602950	23.311281	21.099052	-0.304154	-3.218990	2.357643	12.027277	7.070349	-5.762185	-23.050198

	X1	X2	X3	X4	X5	X6	X7	X8	X9	X10	Cluster
1	23.778224	13.319974	15.565124	-3.713626	7.296793	-19.371013	-0.894130	-6.110282	-28.959316	2.851336	1
2	16.602950	23.311281	21.099052	-0.304154	-3.218990	2.357643	12.027277	7.070349	-5.762185	-23.050198	2

	X1	X2	X3	X4	X5	X6	X7	X8	X9	X10
0	-9.712850	16.744653	-4.848168	10.163147	6.201955	16.198990	16.227576	19.646245	-1.272241	-7.630330
1	15.120425	10.638894	11.000549	-5.586927	-5.583719	-18.456556	1.620782	-4.188975	-18.836370	3.444770
2	17.024321	14.365331	14.990692	0.006055	-2.236760	12.068201	18.013695	10.415960	-12.135521	-19.262561
3	12.775766	4.976265	16.369598	-15.674341	12.888714	-9.357903	-3.074523	9.061394	-19.410610	18.075984

Table of Contents

Description¶

Load the libraries¶

Clustering of the data¶

data description¶

Visualization of clusters¶

Find number of clusters¶

Visualization of clusters with centroids¶

Time Taken¶