In this project, we use the data from the Kaggle competition Toxic Comment Classification Challenge by Jigsaw, and we use only its training data. We then split this raw training data into our own train and test sets and evaluate model performance on the test set.
The dataset consists of comments from Wikipedia talk-page edits, and each comment is annotated with the following labels: toxic, severe_toxic, obscene, threat, insult, identity_hate.
This is a multi-label (not multi-class) classification problem: each text row carries six binary labels, and any number of them (none, one, or several) can be 1.
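A quick way to see this multi-label structure (once df_train is loaded below) is to count how many of the six labels each comment carries; a minimal illustrative sketch, using the label columns defined later in this notebook:
# number of positive labels per comment: 0, 1, or several (multi-label, not multi-class)
cols_label = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
print(df_train[cols_label].sum(axis=1).value_counts().sort_index())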
import os
import sys
import time
time_start_notebook = time.time()
%tensorflow_version 1.x
# bert server needs tf 1.x
TensorFlow 1.x selected.
%%capture
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install watermark
    !pip install scikit-plot
    !pip install lrcurve
    #!pip install tensorflow-addons[tensorflow] # addons need tensorflow > 2
    !pip install bert-serving-client
    !pip install -U bert-serving-server[http]
# data science
import numpy as np
import pandas as pd
# visualization
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# mixed
import os
import time
import random
import subprocess
import joblib
from tqdm import tqdm, trange
from pprint import pprint
# random state
SEED=100
np.random.seed(SEED)
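For fuller reproducibility, one might also seed Python's built-in RNG and TensorFlow's graph-level RNG; an optional sketch, assuming the TF 1.x API used in this notebook:
# optional: seed the remaining RNGs as well (TF 1.x graph-level seed)
import random
import tensorflow as tf
random.seed(SEED)
tf.set_random_seed(SEED)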
# machine learning
import sklearn
# deep learning
import tensorflow as tf
# import tensorflow_addons as tfa # tfa needs tensorflow > 2
from keras import backend as K
import lrcurve
# model eval
import scikitplot as skplt
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2020-12-05

CPython 3.6.9
IPython 5.5.0

compiler   : GCC 8.4.0
system     : Linux
release    : 4.19.112+
machine    : x86_64
processor  : x86_64
CPU cores  : 2
interpreter: 64bit

WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/watermark/watermark.py:237: The name tf.VERSION is deprecated. Please use tf.version.VERSION instead.

sklearn    0.22.2.post1
pandas     1.1.4
matplotlib 3.2.2
joblib     0.17.0
watermark  2.0.2
scikitplot 0.3.7
tensorflow 1.15.2
numpy      1.18.5
seaborn    0.11.0
Using TensorFlow backend.
def show_methods(obj, ncols=4, contains=None):
    """Show the public attributes of obj as a table with ncols columns."""
    lst = [i for i in dir(obj) if i[0] != '_']
    if contains is not None:
        lst = [i for i in lst if contains in i]
    df = pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
    return df
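For example (an illustrative call only), the helper can list pandas' read_* functions four columns wide:
# example usage of show_methods: browse the pandas readers
show_methods(pd, ncols=4, contains='read')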
!nvidia-smi
Sat Dec  5 03:08:33 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.45.01    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P0    28W /  70W |   1311MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
%%capture
download_train_test = False
if download_train_test:
    !wget https://github.com/bhishanpdl/Datasets/blob/master/Projects/Jigsaw_Toxic_Comment_Classification/train.csv.zip?raw=true
    !unzip train.csv.zip?raw=true
    !wget https://github.com/bhishanpdl/Datasets/blob/master/Projects/Jigsaw_Toxic_Comment_Classification/test.csv.zip?raw=true
    !unzip test.csv.zip?raw=true
os.listdir()
['.config', 'tmpSFdqY6', 'out.file', 'uncased_L-12_H-768_A-12', 'train.csv', 'tmpnRuMAo', 'tmpwNQgfx', 'x_train.npz', 'tmp2dX35X', 'test.csv.zip?raw=true', 'tmpciV4sx', 'uncased_L-12_H-768_A-12.zip', 'x_test.npz', 'test.csv', 'tmpnMCIdP', 'tmpNSYuIf', 'tmp35folG', 'tmpc7BNQf', 'train.csv.zip?raw=true', 'tmprZreQ6', 'sample_data']
df_train = pd.read_csv('train.csv')
df_train.head()
| | id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate |
|---|---|---|---|---|---|---|---|---|
0 | 8d603d50affa1126 | "\nYes, aside, thank you for trying to answer ... | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 8fb3576937b9e0d0 | March 2010 (UTC)\n\nThanks! and understood abo... | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 379440e04fb68e27 | "\n\n The Outfield \n\nHahaha - compassion is ... | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 6be4446aac8ae028 | Opposition is a source of strength. I believe ... | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 1a2ff7ed958506a3 | Please discontinue making those unsupported ch... | 0 | 0 | 0 | 0 | 0 | 0 |
df_test = pd.read_csv('test.csv')
df_test.head()
| | id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate |
|---|---|---|---|---|---|---|---|---|
0 | 70bbc3e96dd459b1 | Hammed it is, cheers! | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0b2e86f819b4b9a4 | Not a problem, sorry for the inconvenience and... | 0 | 0 | 0 | 0 | 0 | 0 |
2 | fb7a63a8e287b2d1 | Sources for Gambia at the 2000 Summer Olympics... | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 72beff75685cb8dc | Added some criticism \n\nI added some much nee... | 0 | 0 | 0 | 0 | 0 | 0 |
4 | f7c526a05d03f697 | Who cares? It's just a song, it's a great song... | 0 | 0 | 0 | 0 | 0 | 0 |
col_text = 'comment_text'
col_txt = col_text
# unique text
df_train[col_text].nunique(), df_train.shape[0]
(127656, 127656)
# null values
df_train.isnull().sum()
id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64
cols_label = ['toxic', 'severe_toxic', 'obscene',
'threat', 'insult', 'identity_hate']
print('Count of 1 per label: \n', df_train[cols_label].sum(), '\n')
print('Count of 0 per label: \n', df_train[cols_label].eq(0).sum())
Count of 1 per label:
 toxic            12202
severe_toxic      1282
obscene           6782
threat             379
insult            6292
identity_hate     1136
dtype: int64

Count of 0 per label:
 toxic            115454
severe_toxic     126374
obscene          120874
threat           127277
insult           121364
identity_hate    126520
dtype: int64
df_train[cols_label].sum().plot.bar(title='Count of 1s',color='tomato');
df_train[cols_label].eq(0).sum().plot.bar(title='Count of 0s');
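The bar plots above show a strong class imbalance; the positive rate per label (an illustrative check, not part of the original flow) makes the skew explicit:
# fraction of positive (1) examples per label, e.g. toxic ~0.096 vs threat ~0.003
pos_rate = df_train[cols_label].mean().sort_values(ascending=False)
print(pos_rate)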
# shuffle data
df_train = df_train.sample(frac=1,random_state=SEED).reset_index(drop=True)
df_train[col_text] = df_train[col_text].str.lower()
df_test[col_text] = df_test[col_text].str.lower()
from bert_serving.client import BertClient
x_train = df_train[col_text].tolist()
x_test = df_test[col_text].tolist()
do_tokenize = False
if do_tokenize:
    !wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
    !unzip uncased_L-12_H-768_A-12.zip
    !nohup bert-serving-start -model_dir=./uncased_L-12_H-768_A-12 -max_seq_len 50 > out.file 2>&1 &

    # Start the BERT client
    bc = BertClient()
    x_train = bc.encode(x_train)
    x_test = bc.encode(x_test)

    # persist data
    np.savez_compressed('x_train.npz', x_train=x_train)
    np.savez_compressed('x_test.npz', x_test=x_test)

    # print(bc.status)
    # print()
    # print(bc.server_status)
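Before encoding the full corpus, it can help to confirm that the BERT server responds; a minimal sketch (the bc_check client name is illustrative and only meaningful while the server started above is running):
# optional sanity check of the bert-serving server
if do_tokenize:
    bc_check = BertClient()
    sample_vecs = bc_check.encode(['hello world', 'bert as a service'])
    print(sample_vecs.shape)  # expected: (2, 768) for uncased_L-12_H-768_A-12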
x_train = np.load('x_train.npz')['x_train']
x_test = np.load('x_test.npz')['x_test']
df_train.shape, x_train.shape
((127656, 8), (127656, 768))
!du -sh x_train.npz
!du -sh x_test.npz
347M	x_train.npz
87M	x_test.npz
class_names = ['toxic', 'severe_toxic', 'obscene',
'threat', 'insult', 'identity_hate']
x_train = x_train  # BERT sentence embeddings (features)
x_test = x_test    # BERT sentence embeddings (features)
y_train = df_train[class_names].to_numpy()
y_test = df_test[class_names].to_numpy()
df_train.shape, np.array(x_train).shape
((127656, 8), (127656, 768))
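A small sanity check (added here for illustration) that the embedding matrices and label matrices align row-wise:
# embeddings and labels must have the same number of rows
assert x_train.shape[0] == y_train.shape[0]
assert x_test.shape[0] == y_test.shape[0]
print(y_train.shape, y_test.shape)  # expected: (127656, 6) (31915, 6)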
import tensorflow as tf # bert server needs 1.x
import keras.backend as K
def binary_focal_loss(gamma=2., alpha=.25):
    """
    Binary form of focal loss.

        FL(p_t) = -alpha_t * (1 - p_t)**gamma * log(p_t)

    where p = sigmoid(x), and p_t = p or 1 - p depending on whether the label is 1 or 0, respectively.

    References:
        https://arxiv.org/pdf/1708.02002.pdf

    Usage:
        model.compile(loss=[binary_focal_loss(alpha=.25, gamma=2)], metrics=["accuracy"], optimizer=adam)
    """
    def binary_focal_loss_fixed(y_true, y_pred):
        """
        :param y_true: A tensor of the same shape as `y_pred`
        :param y_pred: A tensor resulting from a sigmoid
        :return: Output tensor.
        """
        y_true = tf.cast(y_true, tf.float32)
        # Define epsilon so that back-propagation does not produce NaN for the 0-divisor case
        epsilon = K.epsilon()
        # Add the epsilon to the prediction value
        # y_pred = y_pred + epsilon
        # Clip the prediction value to avoid log(0)
        y_pred = K.clip(y_pred, epsilon, 1.0 - epsilon)
        # Calculate p_t
        p_t = tf.where(K.equal(y_true, 1), y_pred, 1 - y_pred)
        # Calculate alpha_t
        alpha_factor = K.ones_like(y_true) * alpha
        alpha_t = tf.where(K.equal(y_true, 1), alpha_factor, 1 - alpha_factor)
        # Calculate cross entropy
        cross_entropy = -K.log(p_t)
        weight = alpha_t * K.pow((1 - p_t), gamma)
        # Calculate focal loss
        loss = weight * cross_entropy
        # Sum the losses over the labels, then average over the mini-batch
        loss = K.mean(K.sum(loss, axis=1))
        return loss

    return binary_focal_loss_fixed
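As a quick, hedged sanity check (not part of the original run), the loss can be evaluated on toy tensors in TF 1.x graph mode; a confident, correct prediction should give a small value:
# evaluate the focal loss on a toy example
y_true_toy = K.constant([[1., 0., 0., 0., 0., 0.]])
y_prob_toy = K.constant([[0.9, 0.1, 0.1, 0.1, 0.1, 0.1]])
print(K.eval(binary_focal_loss(gamma=2., alpha=.25)(y_true_toy, y_prob_toy)))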
N_FEATS = np.array(x_train).shape[-1]
NUM_CLASSES = 6
class MultiLabelClassifier:
    def __init__(self):
        self.model = tf.keras.Sequential()
        self.model.add(tf.keras.Input(shape=(N_FEATS,)))
        self.model.add(tf.keras.layers.Dense(128, activation='relu'))
        self.model.add(tf.keras.layers.Dropout(rate=0.2))
        self.model.add(tf.keras.layers.Dense(64, activation='relu'))
        self.model.add(tf.keras.layers.Dense(32, activation='relu'))
        self.model.add(tf.keras.layers.Dropout(rate=0.8))
        self.model.add(tf.keras.layers.Dense(NUM_CLASSES, activation='sigmoid'))
        self.model.summary()

    def train(self, x_train, y_train, x_test, y_test,
              optimizer='nadam',
              loss=[tf.keras.losses.BinaryCrossentropy()],
              metrics=[tf.keras.metrics.AUC(name='auc')],
              batch_size=128, epochs=100):
        if metrics == 'auc':
            metrics = [tf.keras.metrics.AUC(name='auc')]

        # compile
        self.model.compile(optimizer, loss, metrics)
        history = self.model.fit(
            x_train, y_train,
            batch_size, epochs,
            validation_split=0.2,
            callbacks=[lrcurve.KerasLearningCurve()],
            verbose=0
        )
        results = self.model.evaluate(x_test, y_test, batch_size)
        ypreds = self.model.predict(x_test)
        for name, value in zip(self.model.metrics_names, results):
            print("%s: %.3f" % (name, value))
        return history, results, ypreds
WARNING:tensorflow:From /tensorflow-1.15.2/python3.6/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
%%time
# loss = [binary_focal_loss(gamma=3.0, alpha=0.5)] # this gave me f1-score = 0 for all
loss=[tf.keras.losses.BinaryCrossentropy()]
batch_size = 1024 # keep it large for imbalanced data
epochs = 40
classifier = MultiLabelClassifier()
history, results, yprobs = classifier.train(
    x_train, y_train,
    x_test, y_test,
    optimizer='adam',
    loss=loss,
    metrics='auc',
    batch_size=batch_size,
    epochs=epochs
)
print(f"""
x_train: {len(x_train)}
x_test : {len(x_test)}
""")
# NOTE: The validation loss should be decreasing
WARNING:tensorflow:Large dropout rate: 0.8 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
dense (Dense)                (None, 128)               98432
_________________________________________________________________
dropout (Dropout)            (None, 128)               0
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 198
=================================================================
Total params: 108,966
Trainable params: 108,966
Non-trainable params: 0
_________________________________________________________________

WARNING:tensorflow:From /tensorflow-1.15.2/python3.6/tensorflow_core/python/ops/nn_impl.py:183: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
31915/31915 [==============================] - 0s 4us/sample - loss: 0.0775 - auc: 0.9508
loss: 0.078
auc: 0.951

x_train: 127656
x_test : 31915

CPU times: user 29.4 s, sys: 1.91 s, total: 31.4 s
Wall time: 26.9 s
from sklearn import metrics as skmetrics
from sklearn.metrics import multilabel_confusion_matrix
print(yprobs[:5])
# mark the label(s) with the highest predicted probability as 1
ypreds = (yprobs == yprobs.max(axis=1, keepdims=True)).astype(np.int8)
# with this rule, a row whose probabilities are all equal gets flagged for
# every label; zero out such degenerate all-ones rows
ypreds[np.all(ypreds, axis=1, keepdims=False)] = 0
print(ypreds[:5])
f1= skmetrics.f1_score(df_test[class_names], ypreds,average='micro')
acc = skmetrics.accuracy_score(df_test[class_names], ypreds)
print(f'F1-score (micro) : {f1:.4f}')
print(f'Accuracy (overall): {acc:.4f}')
mcm = multilabel_confusion_matrix(df_test[class_names], ypreds)
# check that the true-positive cell (bottom-right of each 2x2 matrix) is NOT ZERO
print(mcm)
[[6.9469213e-05 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]
 [2.9802322e-08 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]
 [1.8524528e-03 0.0000000e+00 1.1026859e-06 0.0000000e+00 5.9604645e-08 0.0000000e+00]
 [2.5524199e-03 0.0000000e+00 2.2351742e-06 0.0000000e+00 3.2782555e-07 0.0000000e+00]]
[[1 0 0 0 0 0]
 [0 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]]
F1-score (micro) : 0.1784
Accuracy (overall): 0.1719
[[[ 4315 24508]
  [    4  3088]]

 [[31602     0]
  [  313     0]]

 [[30248     0]
  [ 1667     0]]

 [[31816     0]
  [   99     0]]

 [[30330     0]
  [ 1585     0]]

 [[31646     0]
  [  269     0]]]
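Note that the max-probability rule above flags at most one label per row, which is why five of the six labels end up with no positive predictions. A hedged alternative (not the approach used here) is to threshold each sigmoid output independently at 0.5:
# alternative thresholding: treat each label as an independent binary decision
ypreds_05 = (yprobs >= 0.5).astype(np.int8)
f1_05 = skmetrics.f1_score(df_test[class_names], ypreds_05, average='micro')
print(f'F1-score (micro) with 0.5 threshold: {f1_05:.4f}')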
from scikitplot.metrics import plot_confusion_matrix
for i, class_name in enumerate(class_names):
    plot_confusion_matrix(df_test[class_name], ypreds[:, i], title=class_name)
clf_report = skmetrics.classification_report(
df_test[class_names], ypreds,
target_names=class_names)
print(clf_report)
               precision    recall  f1-score   support

        toxic       0.11      1.00      0.20      3092
 severe_toxic       0.00      0.00      0.00       313
      obscene       0.00      0.00      0.00      1667
       threat       0.00      0.00      0.00        99
       insult       0.00      0.00      0.00      1585
identity_hate       0.00      0.00      0.00       269

    micro avg       0.11      0.44      0.18      7025
    macro avg       0.02      0.17      0.03      7025
 weighted avg       0.05      0.44      0.09      7025
  samples avg       0.10      0.06      0.07      7025
/usr/local/lib/python3.6/dist-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.6/dist-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.6/dist-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
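Since the network was trained with an AUC metric, a threshold-free per-label view can complement the report above; a minimal sketch using the raw probabilities:
# per-label ROC-AUC from the sigmoid outputs
for i, name in enumerate(class_names):
    auc_i = skmetrics.roc_auc_score(df_test[name], yprobs[:, i])
    print(f'{name:15s} ROC-AUC: {auc_i:.3f}')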
coo = np.array(df_test[class_names]).T.dot(np.array(ypreds))
df_coo = pd.DataFrame(coo, columns=class_names,index=class_names)
df_coo
| | toxic | severe_toxic | obscene | threat | insult | identity_hate |
|---|---|---|---|---|---|---|
toxic | 3088 | 0 | 0 | 0 | 0 | 0 |
severe_toxic | 313 | 0 | 0 | 0 | 0 | 0 |
obscene | 1666 | 0 | 0 | 0 | 0 | 0 |
threat | 99 | 0 | 0 | 0 | 0 | 0 |
insult | 1583 | 0 | 0 | 0 | 0 | 0 |
identity_hate | 269 | 0 | 0 | 0 | 0 | 0 |
df_coo2 = df_coo.copy()
df_coo2['Total'] = df_coo2.sum(axis=1)
df_coo2.loc[len(df_coo2),:] = df_coo2.sum(axis=0)
df_coo2.index = df_coo.index.tolist() + ['Total']
df_coo2 = df_coo2.astype(int)
# rows are true labels, columns are predicted labels
df_coo2
| | toxic | severe_toxic | obscene | threat | insult | identity_hate | Total |
|---|---|---|---|---|---|---|---|
toxic | 3088 | 0 | 0 | 0 | 0 | 0 | 3088 |
severe_toxic | 313 | 0 | 0 | 0 | 0 | 0 | 313 |
obscene | 1666 | 0 | 0 | 0 | 0 | 0 | 1666 |
threat | 99 | 0 | 0 | 0 | 0 | 0 | 99 |
insult | 1583 | 0 | 0 | 0 | 0 | 0 | 1583 |
identity_hate | 269 | 0 | 0 | 0 | 0 | 0 | 269 |
Total | 7018 | 0 | 0 | 0 | 0 | 0 | 7018 |
def highlight_diagf(dfx, color="khaki"):
    def highlight_diag(dfy):
        a = np.full(dfy.shape, "", dtype="<U24")
        np.fill_diagonal(a, f"background-color: {color}")
        df1 = pd.DataFrame(a, index=dfy.index, columns=dfy.columns)
        return df1

    return dfx.style.apply(highlight_diag, axis=None)
highlight_diagf(df_coo2)
| | toxic | severe_toxic | obscene | threat | insult | identity_hate | Total |
|---|---|---|---|---|---|---|---|
toxic | 3088 | 0 | 0 | 0 | 0 | 0 | 3088 |
severe_toxic | 313 | 0 | 0 | 0 | 0 | 0 | 313 |
obscene | 1666 | 0 | 0 | 0 | 0 | 0 | 1666 |
threat | 99 | 0 | 0 | 0 | 0 | 0 | 99 |
insult | 1583 | 0 | 0 | 0 | 0 | 0 | 1583 |
identity_hate | 269 | 0 | 0 | 0 | 0 | 0 | 269 |
Total | 7018 | 0 | 0 | 0 | 0 | 0 | 7018 |