import os
import sys
import time

time_start_notebook = time.time()


%tensorflow_version 1.x

# bert server needs tf 1.x

TensorFlow 1.x selected.


%%capture
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    ## install modules
    !pip install watermark
    !pip install scikit-plot


%%capture

# Run this only once

# use gpu and tf 1.x to use bert server

!pip install bert-serving-client
!pip install -U bert-serving-server[http]

!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip uncased_L-12_H-768_A-12.zip

!ls

sample_data  uncased_L-12_H-768_A-12  uncased_L-12_H-768_A-12.zip


# data science
import numpy as np
import pandas as pd

# visualization
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# mixed
import os
import time
import random
import subprocess
import joblib
from tqdm import tqdm, trange
from pprint import pprint

# random state
SEED=100
np.random.seed(SEED)

# machine learning
import sklearn
import xgboost as xgb


# model eval
import scikitplot as skplt

# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv

Bhishan Poudel 2020-12-04 

CPython 3.6.9
IPython 5.5.0

compiler   : GCC 8.4.0
system     : Linux
release    : 4.19.112+
machine    : x86_64
processor  : x86_64
CPU cores  : 2
interpreter: 64bit

numpy      1.18.5
joblib     0.17.0
sklearn    0.22.2.post1
seaborn    0.11.0
scikitplot 0.3.7
watermark  2.0.2
matplotlib 3.2.2
pandas     1.1.4
xgboost    0.90


def show_methods(obj, ncols=4,contains=None):
    lst = [i for i in dir(obj) if i[0]!='_' ]
    if contains is not None:
        lst = [i for i in lst if contains in i]
    df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
    return df


!nvidia-smi

Fri Dec  4 18:33:58 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.45.01    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8    10W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+


TREE_METHOD = 'auto'
try:
    import tensorflow as tf
    has_gpu = tf.test.gpu_device_name()
    TREE_METHOD = 'gpu_hist' if has_gpu else 'auto'
except:
    TREE_METHOD = 'auto'

print(TREE_METHOD)

gpu_hist


%%capture
!wget https://github.com/bhishanpdl/Datasets/blob/master/Projects/Jigsaw_Toxic_Comment_Classification/train.csv.zip?raw=true
!unzip train.csv.zip?raw=true

!wget https://github.com/bhishanpdl/Datasets/blob/master/Projects/Jigsaw_Toxic_Comment_Classification/test.csv.zip?raw=true
!unzip test.csv.zip?raw=true


os.listdir()

['.config',
 'uncased_L-12_H-768_A-12',
 'train.csv',
 'test.csv.zip?raw=true',
 'uncased_L-12_H-768_A-12.zip',
 'test.csv',
 'train.csv.zip?raw=true',
 'sample_data']


df_train = pd.read_csv('train.csv')
df_train.head()


df_test = pd.read_csv('test.csv')
df_test.head()


col_text = 'comment_text'
col_txt = col_text


# unique text
df_train[col_text].nunique(), df_train.shape[0]

(127656, 127656)


# null values
df_train.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64


cols_label = ['toxic', 'severe_toxic', 'obscene',
              'threat', 'insult', 'identity_hate']


print('Count of 1 per label: \n', df_train[cols_label].sum(), '\n') 
print('Count of 0 per label: \n', df_train[cols_label].eq(0).sum())

Count of 1 per label: 
 toxic            12202
severe_toxic      1282
obscene           6782
threat             379
insult            6292
identity_hate     1136
dtype: int64 

Count of 0 per label: 
 toxic            115454
severe_toxic     126374
obscene          120874
threat           127277
insult           121364
identity_hate    126520
dtype: int64


df_train[cols_label].sum().plot.bar(title='Count of 1s',color='tomato');


df_train[cols_label].eq(0).sum().plot.bar(title='Count of 0s');


# shuffle data
df_train = df_train.sample(frac=1,random_state=SEED).reset_index(drop=True)


from bert_serving.client import BertClient


!nohup bert-serving-start -model_dir=./uncased_L-12_H-768_A-12 -max_seq_len 50 > out.file 2>&1 &


# Start the BERT client
bc = BertClient()


# bc.encode(['this is example text'])


show_methods(bc)


print(bc.status)
print()
print(bc.server_status)

{'identity': b'8ca80a19-50b8-41e8-be2a-ad7f6f1445f7', 'num_request': 2, 'num_pending_request': 0, 'pending_request': set(), 'output_fmt': 'ndarray', 'port': 5555, 'port_out': 5556, 'server_ip': 'localhost', 'client_version': '1.10.0', 'timeout': -1}

{'client': '8ca80a19-50b8-41e8-be2a-ad7f6f1445f7', 'num_process': 2, 'ventilator -> worker': ['ipc://tmp1bOvpn/socket', 'ipc://tmplEGd7O/socket', 'ipc://tmpuuyWOg/socket', 'ipc://tmpX9rGwI/socket', 'ipc://tmplSWrea/socket', 'ipc://tmpCJqeWB/socket', 'ipc://tmplCn3D3/socket', 'ipc://tmpJHhTlv/socket'], 'worker -> sink': 'ipc://tmpJshcaq/socket', 'ventilator <-> sink': 'ipc://tmpZS0OHV/socket', 'server_current_time': '2020-12-04 18:34:33.814721', 'device_map': [], 'num_concurrent_socket': 8, 'statistic': {'num_data_request': 1, 'num_total_seq': 1, 'num_sys_request': 2, 'num_total_request': 3, 'num_total_client': 1, 'num_active_client': 1, 'avg_request_per_client': 3.0, 'min_request_per_client': 3, 'max_request_per_client': 3, 'num_min_request_per_client': 1, 'num_max_request_per_client': 1, 'avg_size_per_request': 1.0, 'min_size_per_request': 1, 'max_size_per_request': 1, 'num_min_size_per_request': 1, 'num_max_size_per_request': 1, 'avg_last_two_interval': 14.996989809999988, 'min_last_two_interval': 14.996989809999988, 'max_last_two_interval': 14.996989809999988, 'num_min_last_two_interval': 1, 'num_max_last_two_interval': 1, 'avg_request_per_second': 0.06668004797424082, 'min_request_per_second': 0.06668004797424082, 'max_request_per_second': 0.06668004797424082, 'num_min_request_per_second': 1, 'num_max_request_per_second': 1}, 'ckpt_name': 'bert_model.ckpt', 'config_name': 'bert_config.json', 'cors': '*', 'cpu': False, 'do_lower_case': True, 'fixed_embed_length': False, 'fp16': False, 'gpu_memory_fraction': 0.5, 'graph_tmp_dir': None, 'http_max_connect': 10, 'http_port': None, 'mask_cls_sep': False, 'max_batch_size': 256, 'max_seq_len': 50, 'model_dir': './uncased_L-12_H-768_A-12', 'no_position_embeddings': False, 'no_special_token': False, 'num_worker': 1, 'pooling_layer': [-2], 'pooling_strategy': 2, 'port': 5555, 'port_out': 5556, 'prefetch_size': 10, 'priority_batch_size': 16, 'show_tokens_to_client': False, 'tuned_model_dir': None, 'verbose': False, 'xla': False, 'tensorflow_version': ['1', '15', '2'], 'python_version': '3.6.9 (default, Oct  8 2020, 12:12:24) \n[GCC 8.4.0]', 'server_version': '1.10.0', 'pyzmq_version': '20.0.0', 'zmq_version': '4.3.3', 'server_start_time': '2020-12-04 18:34:11.527373'}


train_text = df_train[col_txt].tolist()
test_text = df_test[col_txt].tolist()


%%time
train_tokenize = bc.encode(train_text)

/usr/local/lib/python3.6/dist-packages/bert_serving/client/__init__.py:299: UserWarning: some of your sentences have more tokens than "max_seq_len=50" set on the server, as consequence you may get less-accurate or truncated embeddings.
here is what you can do:
- disable the length-check by create a new "BertClient(check_length=False)" when you do not want to display this warning
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)

CPU times: user 498 ms, sys: 497 ms, total: 994 ms
Wall time: 6min 37s


%%time
test_tokenize = bc.encode(test_text)

/usr/local/lib/python3.6/dist-packages/bert_serving/client/__init__.py:299: UserWarning: some of your sentences have more tokens than "max_seq_len=50" set on the server, as consequence you may get less-accurate or truncated embeddings.
here is what you can do:
- disable the length-check by create a new "BertClient(check_length=False)" when you do not want to display this warning
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)

CPU times: user 125 ms, sys: 82.4 ms, total: 207 ms
Wall time: 1min 41s


df_train.shape, np.array(train_tokenize).shape

((127656, 8), (127656, 768))


%%time
class_names = ['toxic', 'severe_toxic', 'obscene',
               'threat', 'insult', 'identity_hate']

df_probs = pd.DataFrame(columns=class_names)
params_xgb = dict(max_depth=5, random_state=SEED,tree_method=TREE_METHOD)

for class_name in class_names:
    train_target = df_train[class_name]
    clf = xgb.XGBClassifier(**params_xgb)
    clf.fit(train_tokenize, train_target)

    yprob = clf.predict_proba(test_tokenize)[:,1]
    df_probs[class_name] = yprob

CPU times: user 26.7 s, sys: 21.6 s, total: 48.3 s
Wall time: 48.3 s


df_probs.head()


threshold = 0.5
df_preds = df_probs.gt(threshold).astype(np.int8)
df_preds.head()


from sklearn import metrics as skmetrics


f1= skmetrics.f1_score(df_test[class_names], df_preds,average='micro')
acc = skmetrics.accuracy_score(df_test[class_names], df_preds)

print(f'F1-score (micro)  : {f1:.4f}')
print(f'Accuracy (overall): {acc:.4f}')

F1-score (micro)  : 0.6201
Accuracy (overall): 0.9080


from sklearn.metrics import multilabel_confusion_matrix

mcm = multilabel_confusion_matrix(df_test[class_names], df_preds)
mcm

array([[[28415,   408],
        [ 1315,  1777]],

       [[31529,    73],
        [  236,    77]],

       [[30022,   226],
        [  753,   914]],

       [[31805,    11],
        [   85,    14]],

       [[30091,   239],
        [  805,   780]],

       [[31624,    22],
        [  234,    35]]])


clf_report = skmetrics.classification_report(
    df_test[class_names], df_preds,
    target_names=class_names)

print(clf_report)

               precision    recall  f1-score   support

        toxic       0.81      0.57      0.67      3092
 severe_toxic       0.51      0.25      0.33       313
      obscene       0.80      0.55      0.65      1667
       threat       0.56      0.14      0.23        99
       insult       0.77      0.49      0.60      1585
identity_hate       0.61      0.13      0.21       269

    micro avg       0.79      0.51      0.62      7025
    macro avg       0.68      0.36      0.45      7025
 weighted avg       0.78      0.51      0.61      7025
  samples avg       0.05      0.05      0.05      7025

/usr/local/lib/python3.6/dist-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.6/dist-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))


coo = df_test[class_names].T.dot(df_preds)
df_coo = pd.DataFrame(coo, columns=class_names,index=class_names)
df_coo.style.background_gradient()


df_coo2 = df_coo.copy()
df_coo2['Total'] = df_coo2.sum(axis=1)
df_coo2.loc[len(df_coo2),:] = df_coo2.sum(axis=0)
df_coo2.index = df_coo.index.tolist() + ['Total']
df_coo2 = df_coo2.astype(int)

# horizontal is true, vertical is predicted
df_coo2


time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))

# Time taken to run whole notebook: 1 hr 10 min 31 secs

	id	comment_text
0	8d603d50affa1126	"\nYes, aside, thank you for trying to answer ...
1	8fb3576937b9e0d0	March 2010 (UTC)\n\nThanks! and understood abo...
2	379440e04fb68e27	"\n\n The Outfield \n\nHahaha - compassion is ...
3	6be4446aac8ae028	Opposition is a source of strength. I believe ...
4	1a2ff7ed958506a3	Please discontinue making those unsupported ch...

	id	comment_text
0	70bbc3e96dd459b1	Hammed it is, cheers!
1	0b2e86f819b4b9a4	Not a problem, sorry for the inconvenience and...
2	fb7a63a8e287b2d1	Sources for Gambia at the 2000 Summer Olympics...
3	72beff75685cb8dc	Added some criticism \n\nI added some much nee...
4	f7c526a05d03f697	Who cares? It's just a song, it's a great song...

	toxic	severe_toxic	obscene	threat	insult	identity_hate
0	0.079790	0.003754	0.030732	0.000305	0.022670	0.001376
1	0.003160	0.000180	0.003161	0.000247	0.001889	0.000138
2	0.002050	0.000072	0.001411	0.000097	0.001221	0.000133
3	0.022417	0.000994	0.007981	0.000203	0.006146	0.000409
4	0.081312	0.001272	0.041599	0.000475	0.015619	0.003845

Description¶

Load the Libraries¶

Useful Functions¶

GPU Testing¶

Load Training Data¶

Data Processing: Training Data¶

Shuffle and create ohe column¶

Tokenize using Bert Client¶

Modelling: Xgboost¶

Model Evaluation¶

Confusion Matrix¶

Classification Report¶

Co-occurence Matrix¶

Time Taken¶

	0	1	2	3
0	close	formatter	pending_response	server_config
1	context	identity	port	server_status
2	encode	ip	port_out	status
3	encode_async	length_limit	receiver	timeout
4	fetch	output_fmt	request_id	token_info_available
5	fetch_all	pending_request	sender

	toxic	severe_toxic	obscene	threat	insult	identity_hate
toxic	1777	150	1075	23	975	54
severe_toxic	291	77	268	10	253	19
obscene	1195	145	914	15	800	43
threat	77	12	50	14	51	1
insult	1142	135	813	13	780	45
identity_hate	199	25	143	3	141	35