import os
import sys
import time

time_start_notebook = time.time()


%%capture
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    ## install modules
    !pip install -U sklearn
    !pip install watermark
    !pip install tqdm
    !pip install scikit-plot


import numpy as np
import pandas as pd
from tqdm import tqdm

# visualization
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# mixed
import os
import time
from pprint import pprint
import joblib

# random state
SEED=100
np.random.seed(SEED)

# machine learning
import sklearn
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# deep learning
import tensorflow
import keras
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

# model eval
import scikitplot as skplt

# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv

Bhishan Poudel 2020-12-01 

CPython 3.7.7
IPython 7.19.0

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

seaborn    0.10.1
tensorflow 2.3.0
watermark  2.0.2
matplotlib 3.2.1
pandas     1.1.1
scikitplot 0.3.7
sklearn    0.23.2
keras      2.4.3
joblib     0.17.0
numpy      1.19.4


def show_methods(obj, ncols=4):
    lst = [i for i in dir(obj) if i[0]!='_' ]
    df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
    return df


# data
dat_dir = os.path.join('..','data')

path_data_raw = os.path.join(dat_dir, 'raw', 'jigsaw_toxic.csv.zip')
path_data_train = os.path.join(dat_dir, 'raw', 'train.csv.zip')
path_data_test = os.path.join(dat_dir, 'raw', 'test.csv.zip')
path_data_sample = os.path.join(dat_dir, 'raw', 'sample.csv')
compression = 'zip'


if ENV_COLAB:
    dat_dir = os.path.join('..','data')
    r = '?raw=true'

    path_data_raw = os.path.join(dat_dir, 'raw', 'jigsaw_toxic.csv.zip')
    path_data_train = os.path.join(dat_dir, 'raw', 'train.csv.zip')
    path_data_test = os.path.join(dat_dir, 'raw', 'test.csv.zip')
    path_data_sample = os.path.join(dat_dir, 'raw', 'sample.csv')
    compression = 'zip'


df_train = pd.read_csv(path_data_train,compression=compression)
df_test = pd.read_csv(path_data_test,compression=compression)
print(df_train.shape)
print(df_train.columns)

display(df_train.head(2).append(df_train.tail(2)))

(127656, 8)
Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')


dir_embed_file = os.path.join(os.path.expanduser('~'),'Datasets','NLP')


path_embed_file = os.path.join(dir_embed_file,'crawl-300d-2M.vec')

print(path_embed_file)

/Users/poudel/Datasets/NLP/crawl-300d-2M.vec


maincol = 'comment_text'
targets = ["toxic", "severe_toxic", "obscene", "threat",
           "insult", "identity_hate"]
Xtrain = df_train[maincol].to_numpy()
ytrain = df_train[targets].to_numpy()

Xtest = df_test[maincol].to_numpy()
ytest = df_test[targets].to_numpy()


max_features = 30_000
maxlen = 100
embed_size = 300


%%time
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(Xtrain) + list(Xtest))

Xtrain = tokenizer.texts_to_sequences(Xtrain)
Xtest = tokenizer.texts_to_sequences(Xtest)

Xtrain = sequence.pad_sequences(Xtrain, maxlen=maxlen)
Xtest = sequence.pad_sequences(Xtest, maxlen=maxlen)

CPU times: user 23.6 s, sys: 189 ms, total: 23.8 s
Wall time: 25.4 s


def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')


%%time
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' '))
                        for o in open(path_embed_file))

word_index = tokenizer.word_index
num_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((num_words, embed_size))

for word, i in tqdm(word_index.items()):
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
# Wall time: 1min 46s

100%|██████████| 210337/210337 [00:00<00:00, 436136.92it/s]

CPU times: user 2min 54s, sys: 8.18 s, total: 3min 2s
Wall time: 3min 38s


from keras.callbacks import Callback
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


def get_model(maxlen, max_features, embed_size,embedding_matrix):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size,
                  weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(80, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(6, activation="sigmoid")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

model = get_model(maxlen, max_features, embed_size,embedding_matrix)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_1 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 100, 300)     9000000     input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d (SpatialDropo (None, 100, 300)     0           embedding[0][0]                  
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 100, 160)     183360      spatial_dropout1d[0][0]          
__________________________________________________________________________________________________
global_average_pooling1d (Globa (None, 160)          0           bidirectional[0][0]              
__________________________________________________________________________________________________
global_max_pooling1d (GlobalMax (None, 160)          0           bidirectional[0][0]              
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 320)          0           global_average_pooling1d[0][0]   
                                                                 global_max_pooling1d[0][0]       
__________________________________________________________________________________________________
dense (Dense)                   (None, 6)            1926        concatenate[0][0]                
==================================================================================================
Total params: 9,185,286
Trainable params: 9,185,286
Non-trainable params: 0
__________________________________________________________________________________________________


%%time
batch_size = 32
epochs = 2
interval=1

X_tra, X_val, y_tra, y_val = train_test_split(Xtrain, ytrain,
                                        test_size=0.1, random_state=SEED)

validation_data=(X_val, y_val) # names must be X_val and y_val
                               # as used in class RocAucEvaluation

# callbacks
cb_rocauc = RocAucEvaluation(
    validation_data=(Xtest, ytest),
    interval=interval)

path_keras_model = '../outputs/keras_model.h5'


DO_TRAIN = False
if DO_TRAIN:
    history = model.fit(X_tra, y_tra,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_data=validation_data,
                        callbacks=[cb_rocauc],
                        verbose=2
                       )
    model.save(path_keras_model)

# CPU times: user 1h 2min 2s, sys: 23min 57s, total: 1h 26min
# Wall time: 31min 44s

CPU times: user 47 ms, sys: 34.9 ms, total: 81.9 ms
Wall time: 129 ms


# !du -sh $path_keras_model
# 120M	../outputs/keras_model.h5


%%time

path_yprobs_keras = '../outputs/yprobs_keras.npz'
if DO_TRAIN:
    model = keras.models.load_model(path_keras_model)
    yprobs = model.predict(Xtest,batch_size=1024,verbose=2)
    np.savez_compressed(path_yprobs_keras, yprobs=yprobs)

# Wall time: 1min 33s

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 8.82 µs


yprobs = np.load(path_yprobs_keras)['yprobs']
yprobs[:5]

array([[1.6023219e-03, 1.8933415e-04, 8.7299943e-04, 5.9655915e-05,
        6.2906742e-04, 9.4687384e-05],
       [2.7573109e-04, 6.9994370e-05, 3.5297871e-04, 7.7578179e-06,
        4.9650669e-04, 1.8649160e-05],
       [4.0832162e-04, 4.5213445e-05, 4.0152669e-04, 1.5648015e-05,
        2.9772520e-04, 3.9751460e-05],
       [1.1048913e-03, 1.0009683e-04, 2.7105212e-04, 2.1384854e-05,
        1.6108155e-04, 4.2530606e-05],
       [4.1490495e-03, 6.4332758e-05, 1.1196733e-03, 7.6643073e-06,
        2.8944016e-04, 2.1070426e-05]], dtype=float32)


labels = ['toxic', 'severe_toxic', 'obscene', 'threat',
          'insult', 'identity_hate']


ypreds = (yprobs==yprobs.max(axis=1,keepdims=True)).astype(np.int8)
ypreds[:5]

array([[1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0]], dtype=int8)


ytest.shape, yprobs.shape, ypreds.shape

((31915, 6), (31915, 6), (31915, 6))


from sklearn.metrics import multilabel_confusion_matrix

mcm = multilabel_confusion_matrix(ytest, ypreds)
mcm

array([[[ 3212, 25611],
        [   66,  3026]],

       [[31602,     0],
        [  313,     0]],

       [[27486,  2762],
        [ 1620,    47]],

       [[31809,     7],
        [   98,     1]],

       [[30001,   329],
        [ 1584,     1]],

       [[31514,   132],
        [  269,     0]]])


ytest.shape[0], mcm.sum(axis=1).sum(axis=1)

(31915, array([31915, 31915, 31915, 31915, 31915, 31915]))


for i in range(6):
    skplt.metrics.plot_confusion_matrix(ytest[:,i],ypreds[:,i],title=labels[i])


r = sklearn.metrics.classification_report(ytest,ypreds)
print(r)

              precision    recall  f1-score   support

           0       0.11      0.98      0.19      3092
           1       0.00      0.00      0.00       313
           2       0.02      0.03      0.02      1667
           3       0.12      0.01      0.02        99
           4       0.00      0.00      0.00      1585
           5       0.00      0.00      0.00       269

   micro avg       0.10      0.44      0.16      7025
   macro avg       0.04      0.17      0.04      7025
weighted avg       0.05      0.44      0.09      7025
 samples avg       0.10      0.06      0.07      7025

/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))


# check the evaluation for toxic (0th label)
df_toxic = pd.DataFrame({'ytest': ytest[:,0], 'ypred': ypreds[:,0]})
df_toxic.query('ytest == 1').head(4)


pd.crosstab(df_toxic['ytest'],df_toxic['ypred'])


pre0 = sklearn.metrics.precision_score(ytest[:,0],ypreds[:,0])
pre1 = sklearn.metrics.precision_score(ytest[:,1],ypreds[:,1])
pre2 = sklearn.metrics.precision_score(ytest[:,2],ypreds[:,2])

pre0, pre1, pre2

/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

(0.10566749310332786, 0.0, 0.01673193307226771)


coo = ytest.T.dot(ypreds)
coo

array([[3026,    0,   58,    2,    4,    2],
       [ 311,    0,    1,    1,    0,    0],
       [1618,    0,   47,    1,    1,    0],
       [  98,    0,    0,    1,    0,    0],
       [1570,    0,   14,    0,    1,    0],
       [ 267,    0,    1,    0,    1,    0]])


df_coo = pd.DataFrame(coo, columns=labels,index=labels)
df_coo.to_csv('../outputs/keras_confusion_matrix.csv',index=False)
df_coo.style.background_gradient()

# only diagonal should be dark color


df_coo2 = df_coo.copy()
df_coo2['Total'] = df_coo2.sum(axis=1)
df_coo2.loc[len(df_coo2),:] = df_coo2.sum(axis=0)
df_coo2.index = df_coo.index.tolist() + ['Total']
df_coo2 = df_coo2.astype(int)

# horizontal is true, vertical is predicted
# df_coo2


from util_ds import highlight_rcd
highlight_rcd(df_coo2)


from util_multilabel import plot_confusion_matrix


plot_confusion_matrix(coo, target_names=labels,
                      cmap='Reds',normalize=False)


labels = ['toxic','severe_toxic','obscene',
          'threat','insult','identity_hate']


from util_plotly import plot_coo_matrix
plot_coo_matrix(labels,coo)

/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/plotly/graph_objs/_deprecations.py:410: DeprecationWarning:

plotly.graph_objs.Margin is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.Margin


time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))

Time taken to run whole notebook: 0 hr 4 min 17 secs

	id	comment_text
0	8d603d50affa1126	"\nYes, aside, thank you for trying to answer ...
1	8fb3576937b9e0d0	March 2010 (UTC)\n\nThanks! and understood abo...
127654	95df37d4a69b607d	I am assuming that there is no point trying to...
127655	668ba87c1b6a3f31	"\nPlus, take a look! Have I made any outing ...

Description¶

Load the libraries¶

Useful Functions¶

Parameters¶

Load the Data¶

Word Embeddings¶

Data Processing¶

Parameters¶

Text Data Processing¶

Model Evaluation¶

multilabel confusion matrix¶

classification report¶

Co-occurrence Matrix¶

Plotly Visualization¶

Time Taken¶

	toxic	obscene	threat	insult	identity_hate
toxic	3026	58	2	4	2
severe_toxic	311	1	1	0	0
obscene	1618	47	1	1	0
threat	98	0	1	0	0
insult	1570	14	0	1	0
identity_hate	267	1	0	1	0

	toxic	obscene	threat	insult	identity_hate	Total
toxic	3026	58	2	4	2	3092
severe_toxic	311	1	1	0	0	313
obscene	1618	47	1	1	0	1667
threat	98	0	1	0	0	99
insult	1570	14	0	1	0	1585
identity_hate	267	1	0	1	0	269
Total	6890	121	5	7	2	7025

	ytest	ypred
13	1	1
22	1	1
31	1	1
32	1	1

ypred	0	1
ytest
0	3212	25611
1	66	3026