In this project we use the training data from the Kaggle competition Toxic Comment Classification Challenge by Jigsaw. We split this raw training data into our own train and test sets and evaluate model performance on the held-out test set.
The dataset consists of comments from Wikipedia talk-page edits, each annotated with the following labels: toxic, severe_toxic, obscene, threat, insult, identity_hate.
This is a multi-label (not multi-class) classification problem: each text row carries six binary labels, and any number of them (including none) can be 1 at the same time.
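To make the multi-label structure concrete, here is a minimal toy sketch (illustrative data only, not the actual dataset) showing that a single comment may carry zero, one, or several labels:
import pandas as pd

# Toy frame (not the real data): each row is a comment, each column a binary label.
toy = pd.DataFrame({'toxic':         [1, 0, 1],
                    'severe_toxic':  [0, 0, 1],
                    'obscene':       [1, 0, 1],
                    'threat':        [0, 0, 0],
                    'insult':        [0, 0, 1],
                    'identity_hate': [0, 0, 0]})
print(toy.sum(axis=1).tolist())  # [2, 0, 4] -> number of active labels per comment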
Keras Modelling Resources
import os
import sys
import time
time_start_notebook = time.time()
%%capture
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install -U scikit-learn
    !pip install watermark
    !pip install tqdm
    !pip install scikit-plot
    ## mount google drive
    from google.colab import drive
    drive.mount('/content/drive')
    ## load the data dir
    ## dat_dir is train/test data from github
    colab_dat_dir = 'drive/MyDrive/Colab Notebooks/data/'
    sys.path.append(colab_dat_dir)
    ## Image dir
    colab_img_dir = 'drive/MyDrive/Colab Notebooks/images/'
    if not os.path.isdir(colab_img_dir): os.makedirs(colab_img_dir)
    sys.path.append(colab_img_dir)
    ## Output dir
    colab_out_dir = 'drive/MyDrive/Colab Notebooks/outputs/'
    if not os.path.isdir(colab_out_dir): os.makedirs(colab_out_dir)
    sys.path.append(colab_out_dir)
# data science
import numpy as np
import pandas as pd
from tqdm import tqdm
# visualization
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# mixed
import os
import time
from pprint import pprint
import joblib
# random state
SEED=100
np.random.seed(SEED)
# machine learning
import sklearn
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
# deep learning
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, load_model
from keras.optimizers import Adam
from keras.layers import Input, Dense, GRU, Dropout
from keras.layers import Bidirectional, Embedding, SpatialDropout1D, concatenate, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers.convolutional import Conv1D
from keras.callbacks import Callback
from keras.callbacks import EarlyStopping, ModelCheckpoint
# nlp
import nltk
import gensim.models as gsm
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from difflib import SequenceMatcher
from nltk.corpus import words as nltk_words
# model eval
import scikitplot as skplt
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2020-12-01

CPython 3.7.7
IPython 7.19.0

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

joblib     0.17.0
keras      2.4.3
nltk       3.4.4
sklearn    0.23.2
numpy      1.19.4
matplotlib 3.2.1
seaborn    0.10.1
pandas     1.1.1
watermark  2.0.2
scikitplot 0.3.7
def show_methods(obj, ncols=4):
    lst = [i for i in dir(obj) if i[0]!='_' ]
    df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
    return df
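For example, the helper can be used like this (illustrative call; the exact listing depends on the installed library versions):
# List the public attributes of a pandas DataFrame in 4 columns.
show_methods(pd.DataFrame, ncols=4).head()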
# data
dat_dir = os.path.join('..','data')
path_data_raw = os.path.join(dat_dir, 'raw', 'jigsaw_toxic.csv.zip')
path_data_train = os.path.join(dat_dir, 'raw', 'train.csv.zip')
path_data_test = os.path.join(dat_dir, 'raw', 'test.csv.zip')
path_data_sample = os.path.join(dat_dir, 'raw', 'sample.csv')
compression = 'zip'
if ENV_COLAB:
    dat_dir = "https://github.com/bhishanpdl/Datasets/blob/master/Projects/Jigsaw_Toxic_Comment_Classification/"
    r = '?raw=true'
    path_data_train = dat_dir + 'train.csv.zip' + r
    path_data_test = dat_dir + 'test.csv.zip' + r
    compression = 'zip'
df_train = pd.read_csv(path_data_train,compression=compression)
df_test = pd.read_csv(path_data_test,compression=compression)
print(f"""
df_train  : {df_train.shape}
df_test   : {df_test.shape}
Features: {df_train.columns.tolist()}
""")
display(df_train.head(2).append(df_train.tail(2)))
df_train  : (127656, 8)
df_test   : (31915, 8)
Features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
| | id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate |
|---|---|---|---|---|---|---|---|---|
| 0 | 8d603d50affa1126 | "\nYes, aside, thank you for trying to answer ... | 0 | 0 | 0 | 0 | 0 | 0 | 
| 1 | 8fb3576937b9e0d0 | March 2010 (UTC)\n\nThanks! and understood abo... | 0 | 0 | 0 | 0 | 0 | 0 | 
| 127654 | 95df37d4a69b607d | I am assuming that there is no point trying to... | 0 | 0 | 0 | 0 | 0 | 0 | 
| 127655 | 668ba87c1b6a3f31 | "\nPlus, take a look! Have I made any outing ... | 0 | 0 | 0 | 0 | 0 | 0 | 
# debug
# df_train = df_train.sample(n=50_000,random_state=SEED)
dir_embed_file = os.path.join(os.path.expanduser('~'),'Datasets','NLP')
if ENV_COLAB:
    dir_embed_file = colab_dat_dir + 'NLP'
    print(dir_embed_file)
path_embed_file = os.path.join(dir_embed_file,'crawl-300d-2M.vec')
print(path_embed_file)
/Users/poudel/Datasets/NLP/crawl-300d-2M.vec
!ls "$dir_embed_file"
crawl-300d-2M.vec crawl-300d-2M.vec.zip
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')
def standardize_text(df, text_field):
    """
    Use a few regular expressions to clean up our text data.
    """
    df[text_field] = df[text_field].str.replace(r"http\S+", "", regex=True)
    df[text_field] = df[text_field].str.replace(r"http", "", regex=True)
    df[text_field] = df[text_field].str.replace(r"@\S+", "", regex=True)
    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ", regex=True)
    df[text_field] = df[text_field].str.replace(r"@", "at", regex=True)
    df[text_field] = df[text_field].str.lower()
    return df
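A quick illustrative run on a throwaway frame (toy text, not the competition data):
# URLs and @-mentions are removed, characters outside the whitelist become
# spaces, and everything is lower-cased.
toy = pd.DataFrame({'comment_text': ['Check http://example.com NOW!!',
                                     '@user this is GREAT :)']})
standardize_text(toy, 'comment_text')['comment_text'].tolist()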
def similarity(s1, s2):
    """
    Find Similarity between two strings.
    Return a measure of the sequences' similarity (float in [0,1]).
    """
    return SequenceMatcher(None, s1, s2).ratio()
def is_english_word(dictionary, word):
    """
    Check whether a word exists in the embedding vocabulary
    (used here as a proxy for being a valid English word).
    """
    return word in dictionary
def normalize_bad_word(word_list, dictionary, bad_words, threshold):
    """
    Return a list of normalized words.

    Any token that contains a known bad word is replaced by that bad word;
    remaining tokens are kept only if they appear in the embedding vocabulary.
    NOTE: `threshold` is currently unused because the similarity-based match
    is commented out in favour of a simple substring check.
    """
    res = []
    for word in word_list:
        found = False
        normalized_bad_word = ""
        for badword in bad_words:
            # if similarity(badword, word) > threshold:
            if badword in word:
                found = True
                normalized_bad_word = badword
                break
        if found:
            res.append(normalized_bad_word)
        elif is_english_word(dictionary, word):
            res.append(word)
    return res
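An illustrative call with a toy vocabulary standing in for the FastText dictionary: tokens containing a known bad word are mapped back to it, and out-of-vocabulary tokens that match nothing are dropped.
# Toy vocabulary and bad-word list for illustration only.
toy_dictionary = {'you': None, 'really': None}
toy_bad_words = ['suck', 'crap']
normalize_bad_word(['you', 'really', 'suckkk', 'zzzzz'],
                   toy_dictionary, toy_bad_words, 0.5)
# -> ['you', 'really', 'suck']   ('zzzzz' matches nothing and is dropped)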
%%time
# TRAIN
processing = """
1. fill nans by _NA_
2. standardize text
(r"http\S+", "")
(r"http", "")
(r"@\S+", "")
(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
(r"@", "at")
lower()
3. tokenize (RegexpTokenizer(r'[a-zA-Z]+'))
4. remove stopwords
5. normalize bad words
6. save clean file
7. combine tokens by " "
"""
def create_tokens(dfx,col_txt,
                  tokenizer,
                  stop_words,
                  dictionary,
                  bad_words,
                  print_=True
                  ):
    dfx = dfx.copy()
    # fill nans
    dfx[col_txt] = dfx[col_txt].fillna('_NA_')
    
    # standardize
    dfx = standardize_text(dfx, col_txt)
    
    # create tokens column
    dfx["tokens"] = dfx[col_txt].apply(tokenizer.tokenize)
    # remove stopwords
    dfx["tokens"] = dfx["tokens"].apply(
        lambda vec: [word for word in vec if word not in stop_words])
    # normalize bad words    
    dfx["tokens"] = dfx["tokens"].apply(
        lambda vec: normalize_bad_word(vec, dictionary, bad_words, 0.5))
    # print info
    if print_:
        all_words = [word for tokens in dfx["tokens"] for word in tokens]
        sent_lengths = [len(tokens) for tokens in dfx["tokens"]]
        VOCAB = sorted(list(set(all_words)))
        
        print(f"""
        Total words     : {len(all_words):,}
        Vocab size      : {len(VOCAB):,}
        Len longest sent: {max(sent_lengths)}
        Mean sent length: {np.mean(sent_lengths):.2f}
        """)
    # join words by space
    dfx["tokens"] = dfx["tokens"].apply(lambda vec :' '.join(vec))
    
    return dfx
CPU times: user 3 µs, sys: 0 ns, total: 3 µs Wall time: 4.05 µs
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/poudel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
True
%%time
print('Preparing Dictionary...')
# Read the FastText word vectors (space delimited strings) into a dictionary from word->vector
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(path_embed_file, encoding="utf8"))
print("embeddings_index size: ", len(embeddings_index))
dictionary = dict.fromkeys(embeddings_index, None)
print("Dictionary size: ", len(dictionary))
# Wall time: 1min 49s
Preparing Dictionary...
embeddings_index size:  2000000
Dictionary size:  2000000
CPU times: user 1min 49s, sys: 3.89 s, total: 1min 53s
Wall time: 1min 59s
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
bad_words = ['sex', 'suck', 'anal', 'penis', 'shit', 'fuck', 'damn',
    'bitch', 'crap', 'piss', 'dick', 'darn', 'cock', 'pussy','ass',
    'asshole', 'fag', 'bastard', 'slut', 'douche','bastard', 'darn',
    'bloody', 'bugger', 'bollocks', 'arsehole','nigger', 'nigga',
    'moron', 'gay', 'antisemitism', 'anti','nazi', 'poop']
print("num bad words: ", len(bad_words))
num bad words: 34
# This will create new column "tokens"
df_train = create_tokens(df_train,
                         col_txt='comment_text',
                         tokenizer=tokenizer,
                         stop_words=stop_words,
                         dictionary=dictionary,
                         bad_words=bad_words,
                         print_=True
                         )
df_test = create_tokens(df_test,
                         col_txt='comment_text',
                         tokenizer=tokenizer,
                         stop_words=stop_words,
                         dictionary=dictionary,
                         bad_words=bad_words,
                         print_=True
                         )
df_train.head()
        Total words     : 4,279,556
        Vocab size      : 86,638
        Len longest sent: 1250
        Mean sent length: 33.52
        
        Total words     : 1,044,479
        Vocab size      : 46,933
        Len longest sent: 1250
        Mean sent length: 32.73
        
| | id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate | tokens |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 8d603d50affa1126 | "\nyes, aside, thank you for trying to answer ... | 0 | 0 | 0 | 0 | 0 | 0 | yes aside thank trying answer block related co... | 
| 1 | 8fb3576937b9e0d0 | march 2010 (utc)\n\nthanks! and understood abo... | 0 | 0 | 0 | 0 | 0 | 0 | march utc thanks understood tags advice better... | 
| 2 | 379440e04fb68e27 | "\n\n the outfield \n\nhahaha compassion is ... | 0 | 0 | 0 | 0 | 0 | 0 | outfield hahaha ass vested considering er kind... | 
| 3 | 6be4446aac8ae028 | opposition is a source of strength i believe ... | 0 | 0 | 0 | 0 | 0 | 0 | opposition source strength believe al said | 
| 4 | 1a2ff7ed958506a3 | please discontinue making those unsupported ch... | 0 | 0 | 0 | 0 | 0 | 0 | please discontinue making unsupported changes ... | 
max_features = 80_000  # how many unique words to use (i.e. number of rows in the embedding matrix)
maxlen = 150           # max number of words from a comment to use
embed_size = 300       # dimensionality of each word vector
%%time
from keras.preprocessing.text import Tokenizer
# Turn each comment into a list of word indexes
# of equal length (with truncation or padding as needed)
list_classes = ["toxic", "severe_toxic", "obscene",
                "threat", "insult", "identity_hate"]
list_sentences_train = list(df_train["tokens"].to_numpy())
list_sentences_test  = list(df_test["tokens"].to_numpy())
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train)
                       + list(list_sentences_test))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
Xtrain_text = pad_sequences(list_tokenized_train, maxlen=maxlen)
Xtest_text = pad_sequences(list_tokenized_test, maxlen=maxlen)
CPU times: user 8.45 s, sys: 74.9 ms, total: 8.53 s Wall time: 9.12 s
%%time
path_missing_words = '../outputs/word_not_found.csv'
# BUILD EMBEDDING MATRIX    
print('Preparing embedding matrix...')
word_index = tokenizer.word_index
print("word_index size: ", len(word_index))
words_not_found = []
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():        
    if i >= max_features: 
        continue
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0 :
        embedding_matrix[i-1] = embedding_vector
    else:
        words_not_found.append(word)
if(len(words_not_found)>0):        
    print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
    print("sample words not found: ", np.random.choice(words_not_found, 10))
    df_missing_words = pd.DataFrame(words_not_found)
    df_missing_words.to_csv(path_missing_words, header=None, index=False)
    
    
# prints
print(f'Number of words: {nb_words}')
Preparing embedding matrix...
word_index size:  94298
Number of words: 80000
CPU times: user 278 ms, sys: 378 ms, total: 656 ms
Wall time: 708 ms
dict(list(word_index.items())[0: 3]) 
# number starts with 1 not 0, so we use,
# embedding_matrix[i-1] = embedding_vector
{'article': 1, 'page': 2, 'wikipedia': 3}
dict(list(word_index.items())[-3:])
{'piezo': 94296, 'abouta': 94297, 'andhave': 94298}
len(word_index), embedding_matrix.shape
(94298, (80000, 300))
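A small self-contained illustration (toy sentences, not the notebook's data) of why the word index starts at 1: Keras reserves index 0 for the padding added by pad_sequences.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

toy_tok = Tokenizer(num_words=10)
toy_tok.fit_on_texts(['the cat sat', 'the dog'])
print(toy_tok.word_index)   # {'the': 1, 'cat': 2, 'sat': 3, 'dog': 4}
print(pad_sequences(toy_tok.texts_to_sequences(['the dog sat']), maxlen=5))
# [[0 0 1 4 3]] -- leading zeros are padding; real words keep indices >= 1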
from keras.callbacks import Callback

class RocAucEvaluation(Callback):
    """Print the validation ROC AUC at the end of every `interval` epochs."""
    def __init__(self, validation_data=(), interval=1):
        super().__init__()
        self.interval = interval
        # validation_data is an (X_val, y_val) tuple
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch + 1, score))
def get_model(maxlen, max_features, embed_size, embedding_matrix,
              lr=0.0, lr_d=0.0, units=0, dr=0.0):
    """Get the tensorflow model.
    References:
    - http://konukoii.com/blog/2018/02/19/twitter-sentiment-analysis-using-combined-lstm-cnn-models/
    - For text, CNN -> LSTM (or GRU) doesn't seem to work well, but LSTM -> CNN works really well.
    """
    inp = Input(shape=(maxlen,))
    # Ref: https://keras.io/api/layers/core_layers/embedding/
    # embedding needs input_dim =  maximum integer index + 1.
    x = Embedding(max_features, embed_size,
                  weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(units, return_sequences=True, dropout=dr, recurrent_dropout=dr))(x)
    x = Conv1D(filters=64, kernel_size=2, padding='valid', kernel_initializer="he_uniform")(x)
    x = Dropout(dr)(x)
    # x = MaxPooling1D(pool_size=2)(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(6, activation="sigmoid")(conc)
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                    optimizer=Adam(lr=lr, decay=lr_d),
                    metrics=['accuracy']) 
    return model
model = get_model(maxlen,nb_words,embed_size,embedding_matrix,
                  lr=1e-3,lr_d=0,units=128, dr=0.5)
model.summary()
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_1 (InputLayer)            [(None, 150)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 150, 300)     24000000    input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d (SpatialDropo (None, 150, 300)     0           embedding[0][0]                  
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 150, 256)     330240      spatial_dropout1d[0][0]          
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 149, 64)      32832       bidirectional[0][0]              
__________________________________________________________________________________________________
dropout (Dropout)               (None, 149, 64)      0           conv1d[0][0]                     
__________________________________________________________________________________________________
global_average_pooling1d (Globa (None, 64)           0           dropout[0][0]                    
__________________________________________________________________________________________________
global_max_pooling1d (GlobalMax (None, 64)           0           dropout[0][0]                    
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 128)          0           global_average_pooling1d[0][0]   
                                                                 global_max_pooling1d[0][0]       
__________________________________________________________________________________________________
dense (Dense)                   (None, 6)            774         concatenate[0][0]                
==================================================================================================
Total params: 24,363,846
Trainable params: 24,363,846
Non-trainable params: 0
__________________________________________________________________________________________________
Xtrain = Xtrain_text
Xtest  = Xtest_text
list_classes = ["toxic", "severe_toxic", "obscene",
                "threat", "insult", "identity_hate"]
ytrain = df_train[list_classes].to_numpy()
ytest  = df_test[list_classes].to_numpy()
print(f"""
df_train :{df_train.shape}   df_test:{df_test.shape}
Xtrain   :{Xtrain.shape} Xtest  :{Xtest.shape}
ytrain   :{ytrain.shape}   ytest  :{ytest.shape}
 
""")
df_train :(127656, 9)   df_test:(31915, 9)
Xtrain   :(127656, 150) Xtest  :(31915, 150)
ytrain   :(127656, 6)   ytest  :(31915, 6)
%%time
batch_size = 128
epochs = 2
interval=1
path_model_checkpoint = '../outputs/keras_gru_fasttext_badwords.h5'
X_tra, X_val, y_tra, y_val = train_test_split(Xtrain, ytrain,
                                        test_size=0.1, random_state=SEED)
validation_data = (X_val, y_val)  # passed as an (X, y) tuple to RocAucEvaluation
# callbacks
cb_rocauc = RocAucEvaluation(
    validation_data=validation_data,
    interval=interval)
cb_early = EarlyStopping(
            monitor='val_accuracy',  # keras 2.4 logs 'val_accuracy', not 'val_acc'
            patience=5,
            mode='max',
            verbose=1)
cb_check = ModelCheckpoint(
            path_model_checkpoint,
            monitor='val_accuracy',
            save_best_only=True,
            mode='max',
            verbose=0)
callbacks = [cb_rocauc, cb_early, cb_check]
path_keras_model = '../outputs/keras_fasttext_model.h5'
DO_TRAIN = False
if DO_TRAIN:
    history = model.fit(X_tra, y_tra,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_data=validation_data,
                        callbacks=callbacks,
                        verbose=2,
                        shuffle=True
                       )
    model.save(path_keras_model)
# Wall time: 20min 18s
CPU times: user 40.9 ms, sys: 5.86 ms, total: 46.8 ms Wall time: 75.2 ms
!du -sh $path_keras_model
280M ../outputs/keras_fasttext_model.h5
%%time
path_yprobs_keras = '../outputs/yprobs_keras_fasttext.npz'
if DO_TRAIN:
    model = keras.models.load_model(path_keras_model)
    yprobs = model.predict(Xtest_text,batch_size=1024,verbose=2)
    np.savez_compressed(path_yprobs_keras, yprobs=yprobs)
# Wall time: 1min 33s
CPU times: user 3 µs, sys: 1 µs, total: 4 µs Wall time: 6.2 µs
yprobs = np.load(path_yprobs_keras)['yprobs']
yprobs[:5]
array([[0.01838818, 0.00341764, 0.02119815, 0.00906962, 0.01449135,
        0.00419179],
       [0.01054636, 0.00153401, 0.01218945, 0.00453353, 0.00753298,
        0.00132418],
       [0.00564146, 0.00059021, 0.00540674, 0.00278851, 0.00426516,
        0.00168946],
       [0.00870907, 0.00088844, 0.00853044, 0.00390193, 0.0061962 ,
        0.0016318 ],
       [0.01302436, 0.00158405, 0.00876519, 0.00363001, 0.00854704,
        0.00241932]], dtype=float32)
labels = ['toxic','severe_toxic','obscene',
          'threat','insult','identity_hate']
# One-hot the single highest-probability label per comment
# (note: this forces exactly one positive prediction per row).
ypreds = (yprobs==yprobs.max(axis=1,keepdims=True)).astype(np.int8)
ypreds[:5]
array([[0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0]], dtype=int8)
ytest.shape, yprobs.shape, ypreds.shape
((31915, 6), (31915, 6), (31915, 6))
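The one-hot step above keeps only the single highest-probability label per comment, which effectively turns the multi-label problem into a single-label one and likely contributes to the low precision reported below. A hedged alternative sketch (not what this notebook uses): threshold each probability independently (0.5 is an arbitrary, untuned cut-off) and report per-label ROC AUC from the raw probabilities, which is the competition's metric.
# Alternative evaluation sketch: independent 0.5 threshold per label, plus
# per-label ROC AUC computed from the raw probabilities.
ypreds_thresh = (yprobs >= 0.5).astype(np.int8)

for j, name in enumerate(labels):
    auc = roc_auc_score(ytest[:, j], yprobs[:, j])
    print(f"{name:15s} AUC = {auc:.4f}   predicted positives = {ypreds_thresh[:, j].sum()}")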
from sklearn.metrics import multilabel_confusion_matrix
mcm = multilabel_confusion_matrix(ytest, ypreds)
mcm
array([[[ 6919, 21904],
        [ 1211,  1881]],
       [[31602,     0],
        [  313,     0]],
       [[23886,  6362],
        [  707,   960]],
       [[31804,    12],
        [   99,     0]],
       [[30234,    96],
        [ 1585,     0]],
       [[30950,   696],
        [  264,     5]]])
ytest.shape[0], mcm.sum(axis=1).sum(axis=1)
(31915, array([31915, 31915, 31915, 31915, 31915, 31915]))
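Each 2x2 block of mcm is the binary confusion matrix for one label, in sklearn's [[TN, FP], [FN, TP]] layout. A small sketch to attach the label names:
# Unpack each per-label 2x2 block into a row of counts.
df_mcm = pd.DataFrame([[tn, fp, fn, tp] for (tn, fp), (fn, tp) in mcm],
                      columns=['TN', 'FP', 'FN', 'TP'], index=labels)
df_mcm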
for i in range(6):
    skplt.metrics.plot_confusion_matrix(ytest[:,i],ypreds[:,i],title=labels[i])
r = sklearn.metrics.classification_report(ytest,ypreds)
print(r)
              precision    recall  f1-score   support
           0       0.08      0.61      0.14      3092
           1       0.00      0.00      0.00       313
           2       0.13      0.58      0.21      1667
           3       0.00      0.00      0.00        99
           4       0.00      0.00      0.00      1585
           5       0.01      0.02      0.01       269
   micro avg       0.09      0.41      0.15      7025
   macro avg       0.04      0.20      0.06      7025
weighted avg       0.07      0.41      0.11      7025
 samples avg       0.09      0.05      0.06      7025
/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
# check the evaluation for toxic (0th label)
df_toxic = pd.DataFrame({'ytest': ytest[:,0], 'ypred': ypreds[:,0]})
df_toxic.query('ytest == 1').head(4)
| | ytest | ypred |
|---|---|---|
| 13 | 1 | 0 | 
| 22 | 1 | 1 | 
| 31 | 1 | 1 | 
| 32 | 1 | 0 | 
pd.crosstab(df_toxic['ytest'],df_toxic['ypred'])
| ytest \ ypred | 0 | 1 |
|---|---|---|
| 0 | 6919 | 21904 | 
| 1 | 1211 | 1881 | 
pre0 = sklearn.metrics.precision_score(ytest[:,0],ypreds[:,0])
pre1 = sklearn.metrics.precision_score(ytest[:,1],ypreds[:,1])
pre2 = sklearn.metrics.precision_score(ytest[:,2],ypreds[:,2])
pre0, pre1, pre2
/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
(0.07908345595963842, 0.0, 0.13111171810980607)
coo = ytest.T.dot(ypreds)
coo
array([[1881,    0, 1193,    2,    2,   14],
       [ 138,    0,  174,    0,    0,    1],
       [ 703,    0,  960,    0,    0,    4],
       [  76,    0,   21,    0,    0,    2],
       [ 894,    0,  688,    0,    0,    3],
       [ 208,    0,   56,    0,    0,    5]])
df_coo = pd.DataFrame(coo, columns=labels,index=labels)
df_coo.to_csv('../outputs/keras_confusion_matrix.csv',index=False)
df_coo.style.background_gradient()
| | toxic | severe_toxic | obscene | threat | insult | identity_hate |
|---|---|---|---|---|---|---|
| toxic | 1881 | 0 | 1193 | 2 | 2 | 14 | 
| severe_toxic | 138 | 0 | 174 | 0 | 0 | 1 | 
| obscene | 703 | 0 | 960 | 0 | 0 | 4 | 
| threat | 76 | 0 | 21 | 0 | 0 | 2 | 
| insult | 894 | 0 | 688 | 0 | 0 | 3 | 
| identity_hate | 208 | 0 | 56 | 0 | 0 | 5 | 
df_coo2 = df_coo.copy()
df_coo2['Total'] = df_coo2.sum(axis=1)
df_coo2.loc[len(df_coo2),:] = df_coo2.sum(axis=0)
df_coo2.index = df_coo.index.tolist() + ['Total']
df_coo2 = df_coo2.astype(int)
# rows (index) are true labels, columns are predicted labels
df_coo2
| | toxic | severe_toxic | obscene | threat | insult | identity_hate | Total |
|---|---|---|---|---|---|---|---|
| toxic | 1881 | 0 | 1193 | 2 | 2 | 14 | 3092 | 
| severe_toxic | 138 | 0 | 174 | 0 | 0 | 1 | 313 | 
| obscene | 703 | 0 | 960 | 0 | 0 | 4 | 1667 | 
| threat | 76 | 0 | 21 | 0 | 0 | 2 | 99 | 
| insult | 894 | 0 | 688 | 0 | 0 | 3 | 1585 | 
| identity_hate | 208 | 0 | 56 | 0 | 0 | 5 | 269 | 
| Total | 3900 | 0 | 3092 | 2 | 2 | 29 | 7025 | 
from util_ds import highlight_rcd
highlight_rcd(df_coo2)
| | toxic | severe_toxic | obscene | threat | insult | identity_hate | Total |
|---|---|---|---|---|---|---|---|
| toxic | 1881 | 0 | 1193 | 2 | 2 | 14 | 3092 | 
| severe_toxic | 138 | 0 | 174 | 0 | 0 | 1 | 313 | 
| obscene | 703 | 0 | 960 | 0 | 0 | 4 | 1667 | 
| threat | 76 | 0 | 21 | 0 | 0 | 2 | 99 | 
| insult | 894 | 0 | 688 | 0 | 0 | 3 | 1585 | 
| identity_hate | 208 | 0 | 56 | 0 | 0 | 5 | 269 | 
| Total | 3900 | 0 | 3092 | 2 | 2 | 29 | 7025 | 
from util_multilabel import plot_confusion_matrix
plot_confusion_matrix(coo, target_names=labels,
                      cmap='Reds',normalize=False)
from util_plotly import plot_coo_matrix
plot_coo_matrix(labels,coo)
/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/plotly/graph_objs/_deprecations.py:410: DeprecationWarning: plotly.graph_objs.Margin is deprecated. Please replace it with one of the following more specific types - plotly.graph_objs.layout.Margin
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 2 min 41 secs