Table of Contents

Introduction

Problem Statement: Cluster Multiple Sequence Alignment (MSA) sequences for 400 Covid-19 samples.
Additional useful information:

Report

Project Description

In this project I clustered the given Multiple Sequence Alignment (MSA) of 400 Covid-19 samples. For the clustering, I used k-means with k-means++ initialization to reduce the risk of choosing poor initial points: k-means is sensitive to the initially chosen centroids, and k-means++ picks better-separated initial centroids.

K-means is a distance-based algorithm: we specify the number of clusters, and it assigns each data point to one of them. The number of clusters can be chosen using the elbow method or silhouette scores. There is also a multitude of other clustering algorithms to choose from, such as the distance-based k-medoids or density-based algorithms like DBSCAN and OPTICS. Here, I used k-means with 5 clusters and the Euclidean distance as the distance metric, as sketched below.
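As a minimal sketch of this step (the array `X` is a placeholder standing in for the encoded sequence matrix, not the original notebook's variable):

```python
import numpy as np
from sklearn.cluster import KMeans

# Placeholder for the numerically encoded, scaled sequence matrix
X = np.random.rand(400, 100)

# k-means++ picks well-separated initial centroids; KMeans uses Euclidean distance
kmeans = KMeans(n_clusters=5, init="k-means++", n_init=10, random_state=42)
labels = kmeans.fit_predict(X)  # cluster index (0..4) for every sample
```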

About the distance metric: absolute (Manhattan) or cosine distances could also be used here, but I used the Euclidean distance for simplicity. We should also note that, for L2-normalized data, the squared Euclidean distance is a monotonic function of the cosine distance, so the two metrics rank points the same way and the choice matters less in that situation.
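A quick illustrative check of that relation for unit-length vectors (not part of the original notebook):

```python
import numpy as np

rng = np.random.default_rng(0)
u, v = rng.random(10), rng.random(10)
u, v = u / np.linalg.norm(u), v / np.linalg.norm(v)  # L2-normalize

squared_euclidean = np.sum((u - v) ** 2)
cosine_distance = 1.0 - np.dot(u, v)

# For unit vectors: ||u - v||^2 = 2 * (1 - u.v)
print(np.isclose(squared_euclidean, 2 * cosine_distance))  # True
```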

The k-means algorithm comes with its own problem: we need to choose the best number of clusters. To do that, I used the elbow method. First, I computed the within-cluster sum of squares (WCSS) for each candidate number of clusters and plotted the number of clusters against the WCSS. Wherever the curve makes a sharp turn (an elbow-like shape), we choose that number as the best number of clusters; see the sketch below.
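A sketch of the elbow computation (in scikit-learn, the fitted model's `inertia_` attribute is the WCSS; `X_reduced` is a placeholder for the preprocessed data):

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

X_reduced = np.random.rand(400, 50)  # placeholder for the normalized, PCA-transformed data

wcss = []
ks = range(2, 21)
for k in ks:
    km = KMeans(n_clusters=k, init="k-means++", n_init=10, random_state=42).fit(X_reduced)
    wcss.append(km.inertia_)  # inertia_ is the within-cluster sum of squares (WCSS)

plt.plot(list(ks), wcss, marker="o")
plt.xlabel("Number of clusters k")
plt.ylabel("WCSS")
plt.title("Elbow method")
plt.show()
```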

Apart from the WCSS, we can also use the silhouette score to choose the best number of clusters. The silhouette score for a data point is s = (b - a) / max(a, b), where a is the mean distance from that point to all other points in its own cluster, and b is the mean distance to the points of the nearest other cluster. For a single cluster, the silhouette score is the mean of the silhouette scores of all the points belonging to it.
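Scikit-learn exposes the mean silhouette over all samples as `silhouette_score`; a hedged sketch with placeholder data standing in for the preprocessed sequence matrix:

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

X = np.random.rand(400, 50)  # placeholder for the preprocessed sequence matrix

for k in range(2, 8):
    labels = KMeans(n_clusters=k, init="k-means++", n_init=10, random_state=0).fit_predict(X)
    print(k, silhouette_score(X, labels))  # higher mean silhouette = better-separated clusters
```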

Methodology

Here, we have an MSA of 400 Covid-19 samples, each about 34k characters long. DNA has a double-helix structure with four nitrogenous bases: Adenine (A), Thymine (T), Cytosine (C), and Guanine (G).

In the alignment, N means one of the four bases is present at that position but could not be determined with full accuracy, . means the character is the same across all samples, and - means no base was detected at that position (a gap).

First of all, since the data is text, we need to convert it to numeric form. We could use k-mers with a bag-of-words representation (CountVectorizer), but that would give a very high-dimensional array, and clustering techniques may then suffer from the curse of dimensionality. So I used a simple ASCII encoding of the characters; for example, the character . has the ASCII code 46. Later I normalize these numbers and apply PCA. A sketch of this encoding is given below.
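A minimal sketch of the encoding, assuming the aligned sequences are available as a list of equal-length strings called `sequences` (an assumption about the data loading, not the original code):

```python
import numpy as np

sequences = ["ATGCN.-", "ATGCC.-"]  # placeholder; the real MSA has 400 sequences of ~34k chars

# Map every character to its ASCII code, e.g. "." -> 46, "A" -> 65, "-" -> 45
X = np.array([[ord(ch) for ch in seq] for seq in sequences], dtype=float)
print(X.shape)  # (n_samples, sequence_length)
```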

To reduce the feature dimension, I used Principal Component Analysis (PCA). PCA projects the data onto a lower-dimensional space whose axes are orthogonal and chosen to retain as much of the variance in the data as possible. PCA works better when all features are on a similar scale; otherwise, features with larger numeric ranges dominate the principal components. To avoid that, we must scale the dataset before PCA. Here, I used min-max scaling of the dataset and kept 90% of the variance using the top 253 eigenvectors, ranked by their eigenvalues; a sketch of this step follows.
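A sketch of the scaling and PCA step (passing a float between 0 and 1 to `n_components` tells scikit-learn to keep enough components to explain that fraction of the variance; the figure of 253 components is the report's result, which this placeholder data will not reproduce):

```python
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

X = np.random.rand(400, 1000)  # placeholder for the ASCII-encoded matrix

X_scaled = MinMaxScaler().fit_transform(X)  # min-max scale each feature to [0, 1]
pca = PCA(n_components=0.90)                # keep enough components for 90% of the variance
X_reduced = pca.fit_transform(X_scaled)
print(X_reduced.shape[1], "components retained")
```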

On this normalized, PCA-transformed dataset, I ran k-means with different numbers of clusters (2 to 20); then, looking at the elbow plot, I chose 5 clusters.
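With k chosen, the model is refit on the reduced data; a minimal sketch, continuing from the hypothetical `X_reduced` of the previous sketches:

```python
from sklearn.cluster import KMeans

# X_reduced: the normalized, PCA-transformed matrix from the sketch above
final_km = KMeans(n_clusters=5, init="k-means++", n_init=10, random_state=42)
cluster_labels = final_km.fit_predict(X_reduced)  # one cluster index per sample
```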

Finally, to see how well the clustering did, I made a 2-dimensional plot of the data coloured by the cluster labels predicted by k-means, as sketched below.
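A sketch of such a plot, using the first two principal components as the 2-D coordinates (an assumption about which two dimensions are plotted):

```python
import matplotlib.pyplot as plt

# X_reduced and cluster_labels come from the sketches above
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=cluster_labels, cmap="tab10", s=15)
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.title("K-means clusters in the first two principal components")
plt.show()
```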

Results

The number of data points assigned to each cluster index is given below:

Cluster Count
3 171
0 161
2 51
4 9
1 8
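A table like this can be produced from the predicted labels, for example (a sketch, not the original code):

```python
import numpy as np

values, counts = np.unique(cluster_labels, return_counts=True)
for cluster, count in sorted(zip(values, counts), key=lambda t: -t[1]):
    print(cluster, count)
```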

Here, I have also included a visualization of the clusters in the 2-dimensional space obtained from the normalized PCA data:

Load the modules

Look at the data file

Modelling

Convert Sequences to Numerical Data

k-mer and bag-of-words embedding

Integer ASCII Encoding (I used this)

Normalize the data

Clustering of the Data

Clustering on the full dimension (400 × 34k)

Visualization using PCA on un-scaled original data

Reduce the dimension of the data, normalize, and cluster

Find the best number of clusters

The Silhouette Coefficient is defined for each sample and is composed of two scores:

a: the mean distance between a sample and all other points in the same cluster.
b: the mean distance between a sample and all other points in the next nearest cluster.

The Silhouette Coefficient s for a single sample is then given as:

s = (b-a) / max(a,b)

The Silhouette Coefficient for a set of samples is given as the mean of the Silhouette Coefficient for each sample.

Time Taken