In this project, we use the data from kaggle competition Toxic Comment Classification Challenge by Jigsaw and only use the training data. Then we have break this raw training data into train and test data and evaluate the model performances in test data.
The dataset is taken from wikipedia edit text and is classified as one of the following:
This is a multi-label (not-multiclass) classification. One text row has six labels and exactly one label is 1 and other labels are 0.
import os
import sys
import time
time_start_notebook = time.time()
%%capture
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
## install modules
!pip install -U sklearn
!pip install watermark
!pip install tqdm
!pip install scikit-plot
import numpy as np
import pandas as pd
from tqdm import tqdm
# visualization
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# mixed
import os
import time
from pprint import pprint
import joblib
# random state
SEED=100
np.random.seed(SEED)
# machine learning
import sklearn
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
# deep learning
import tensorflow
import keras
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
# model eval
import scikitplot as skplt
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2020-12-01 CPython 3.7.7 IPython 7.19.0 compiler : Clang 4.0.1 (tags/RELEASE_401/final) system : Darwin release : 19.6.0 machine : x86_64 processor : i386 CPU cores : 4 interpreter: 64bit seaborn 0.10.1 tensorflow 2.3.0 watermark 2.0.2 matplotlib 3.2.1 pandas 1.1.1 scikitplot 0.3.7 sklearn 0.23.2 keras 2.4.3 joblib 0.17.0 numpy 1.19.4
def show_methods(obj, ncols=4):
lst = [i for i in dir(obj) if i[0]!='_' ]
df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
return df
# data
dat_dir = os.path.join('..','data')
path_data_raw = os.path.join(dat_dir, 'raw', 'jigsaw_toxic.csv.zip')
path_data_train = os.path.join(dat_dir, 'raw', 'train.csv.zip')
path_data_test = os.path.join(dat_dir, 'raw', 'test.csv.zip')
path_data_sample = os.path.join(dat_dir, 'raw', 'sample.csv')
compression = 'zip'
if ENV_COLAB:
dat_dir = os.path.join('..','data')
r = '?raw=true'
path_data_raw = os.path.join(dat_dir, 'raw', 'jigsaw_toxic.csv.zip')
path_data_train = os.path.join(dat_dir, 'raw', 'train.csv.zip')
path_data_test = os.path.join(dat_dir, 'raw', 'test.csv.zip')
path_data_sample = os.path.join(dat_dir, 'raw', 'sample.csv')
compression = 'zip'
df_train = pd.read_csv(path_data_train,compression=compression)
df_test = pd.read_csv(path_data_test,compression=compression)
print(df_train.shape)
print(df_train.columns)
display(df_train.head(2).append(df_train.tail(2)))
(127656, 8) Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], dtype='object')
id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate | |
---|---|---|---|---|---|---|---|---|
0 | 8d603d50affa1126 | "\nYes, aside, thank you for trying to answer ... | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 8fb3576937b9e0d0 | March 2010 (UTC)\n\nThanks! and understood abo... | 0 | 0 | 0 | 0 | 0 | 0 |
127654 | 95df37d4a69b607d | I am assuming that there is no point trying to... | 0 | 0 | 0 | 0 | 0 | 0 |
127655 | 668ba87c1b6a3f31 | "\nPlus, take a look! Have I made any outing ... | 0 | 0 | 0 | 0 | 0 | 0 |
dir_embed_file = os.path.join(os.path.expanduser('~'),'Datasets','NLP')
path_embed_file = os.path.join(dir_embed_file,'crawl-300d-2M.vec')
print(path_embed_file)
/Users/poudel/Datasets/NLP/crawl-300d-2M.vec
maincol = 'comment_text'
targets = ["toxic", "severe_toxic", "obscene", "threat",
"insult", "identity_hate"]
Xtrain = df_train[maincol].to_numpy()
ytrain = df_train[targets].to_numpy()
Xtest = df_test[maincol].to_numpy()
ytest = df_test[targets].to_numpy()
max_features = 30_000
maxlen = 100
embed_size = 300
%%time
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(Xtrain) + list(Xtest))
Xtrain = tokenizer.texts_to_sequences(Xtrain)
Xtest = tokenizer.texts_to_sequences(Xtest)
Xtrain = sequence.pad_sequences(Xtrain, maxlen=maxlen)
Xtest = sequence.pad_sequences(Xtest, maxlen=maxlen)
CPU times: user 23.6 s, sys: 189 ms, total: 23.8 s Wall time: 25.4 s
def get_coefs(word, *arr):
return word, np.asarray(arr, dtype='float32')
%%time
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' '))
for o in open(path_embed_file))
word_index = tokenizer.word_index
num_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((num_words, embed_size))
for word, i in tqdm(word_index.items()):
if i >= max_features:
continue
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
# Wall time: 1min 46s
100%|██████████| 210337/210337 [00:00<00:00, 436136.92it/s]
CPU times: user 2min 54s, sys: 8.18 s, total: 3min 2s Wall time: 3min 38s
from keras.callbacks import Callback
class RocAucEvaluation(Callback):
def __init__(self, validation_data=(), interval=1):
super(Callback, self).__init__()
self.interval = interval
self.X_val, self.y_val = validation_data
def on_epoch_end(self, epoch, logs={}):
if epoch % self.interval == 0:
y_pred = self.model.predict(self.X_val, verbose=0)
score = roc_auc_score(self.y_val, y_pred)
print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))
def get_model(maxlen, max_features, embed_size,embedding_matrix):
inp = Input(shape=(maxlen, ))
x = Embedding(max_features, embed_size,
weights=[embedding_matrix])(inp)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(GRU(80, return_sequences=True))(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
conc = concatenate([avg_pool, max_pool])
outp = Dense(6, activation="sigmoid")(conc)
model = Model(inputs=inp, outputs=outp)
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
return model
model = get_model(maxlen, max_features, embed_size,embedding_matrix)
model.summary()
Model: "functional_1" __________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ================================================================================================== input_1 (InputLayer) [(None, 100)] 0 __________________________________________________________________________________________________ embedding (Embedding) (None, 100, 300) 9000000 input_1[0][0] __________________________________________________________________________________________________ spatial_dropout1d (SpatialDropo (None, 100, 300) 0 embedding[0][0] __________________________________________________________________________________________________ bidirectional (Bidirectional) (None, 100, 160) 183360 spatial_dropout1d[0][0] __________________________________________________________________________________________________ global_average_pooling1d (Globa (None, 160) 0 bidirectional[0][0] __________________________________________________________________________________________________ global_max_pooling1d (GlobalMax (None, 160) 0 bidirectional[0][0] __________________________________________________________________________________________________ concatenate (Concatenate) (None, 320) 0 global_average_pooling1d[0][0] global_max_pooling1d[0][0] __________________________________________________________________________________________________ dense (Dense) (None, 6) 1926 concatenate[0][0] ================================================================================================== Total params: 9,185,286 Trainable params: 9,185,286 Non-trainable params: 0 __________________________________________________________________________________________________
%%time
batch_size = 32
epochs = 2
interval=1
X_tra, X_val, y_tra, y_val = train_test_split(Xtrain, ytrain,
test_size=0.1, random_state=SEED)
validation_data=(X_val, y_val) # names must be X_val and y_val
# as used in class RocAucEvaluation
# callbacks
cb_rocauc = RocAucEvaluation(
validation_data=(Xtest, ytest),
interval=interval)
path_keras_model = '../outputs/keras_model.h5'
DO_TRAIN = False
if DO_TRAIN:
history = model.fit(X_tra, y_tra,
batch_size=batch_size,
epochs=epochs,
validation_data=validation_data,
callbacks=[cb_rocauc],
verbose=2
)
model.save(path_keras_model)
# CPU times: user 1h 2min 2s, sys: 23min 57s, total: 1h 26min
# Wall time: 31min 44s
CPU times: user 47 ms, sys: 34.9 ms, total: 81.9 ms Wall time: 129 ms
# !du -sh $path_keras_model
# 120M ../outputs/keras_model.h5
%%time
path_yprobs_keras = '../outputs/yprobs_keras.npz'
if DO_TRAIN:
model = keras.models.load_model(path_keras_model)
yprobs = model.predict(Xtest,batch_size=1024,verbose=2)
np.savez_compressed(path_yprobs_keras, yprobs=yprobs)
# Wall time: 1min 33s
CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs Wall time: 8.82 µs
yprobs = np.load(path_yprobs_keras)['yprobs']
yprobs[:5]
array([[1.6023219e-03, 1.8933415e-04, 8.7299943e-04, 5.9655915e-05, 6.2906742e-04, 9.4687384e-05], [2.7573109e-04, 6.9994370e-05, 3.5297871e-04, 7.7578179e-06, 4.9650669e-04, 1.8649160e-05], [4.0832162e-04, 4.5213445e-05, 4.0152669e-04, 1.5648015e-05, 2.9772520e-04, 3.9751460e-05], [1.1048913e-03, 1.0009683e-04, 2.7105212e-04, 2.1384854e-05, 1.6108155e-04, 4.2530606e-05], [4.1490495e-03, 6.4332758e-05, 1.1196733e-03, 7.6643073e-06, 2.8944016e-04, 2.1070426e-05]], dtype=float32)
labels = ['toxic', 'severe_toxic', 'obscene', 'threat',
'insult', 'identity_hate']
ypreds = (yprobs==yprobs.max(axis=1,keepdims=True)).astype(np.int8)
ypreds[:5]
array([[1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0], [1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0]], dtype=int8)
ytest.shape, yprobs.shape, ypreds.shape
((31915, 6), (31915, 6), (31915, 6))
from sklearn.metrics import multilabel_confusion_matrix
mcm = multilabel_confusion_matrix(ytest, ypreds)
mcm
array([[[ 3212, 25611], [ 66, 3026]], [[31602, 0], [ 313, 0]], [[27486, 2762], [ 1620, 47]], [[31809, 7], [ 98, 1]], [[30001, 329], [ 1584, 1]], [[31514, 132], [ 269, 0]]])
ytest.shape[0], mcm.sum(axis=1).sum(axis=1)
(31915, array([31915, 31915, 31915, 31915, 31915, 31915]))
for i in range(6):
skplt.metrics.plot_confusion_matrix(ytest[:,i],ypreds[:,i],title=labels[i])
r = sklearn.metrics.classification_report(ytest,ypreds)
print(r)
precision recall f1-score support 0 0.11 0.98 0.19 3092 1 0.00 0.00 0.00 313 2 0.02 0.03 0.02 1667 3 0.12 0.01 0.02 99 4 0.00 0.00 0.00 1585 5 0.00 0.00 0.00 269 micro avg 0.10 0.44 0.16 7025 macro avg 0.04 0.17 0.04 7025 weighted avg 0.05 0.44 0.09 7025 samples avg 0.10 0.06 0.07 7025
/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
# check the evaluation for toxic (0th label)
df_toxic = pd.DataFrame({'ytest': ytest[:,0], 'ypred': ypreds[:,0]})
df_toxic.query('ytest == 1').head(4)
ytest | ypred | |
---|---|---|
13 | 1 | 1 |
22 | 1 | 1 |
31 | 1 | 1 |
32 | 1 | 1 |
pd.crosstab(df_toxic['ytest'],df_toxic['ypred'])
ypred | 0 | 1 |
---|---|---|
ytest | ||
0 | 3212 | 25611 |
1 | 66 | 3026 |
pre0 = sklearn.metrics.precision_score(ytest[:,0],ypreds[:,0])
pre1 = sklearn.metrics.precision_score(ytest[:,1],ypreds[:,1])
pre2 = sklearn.metrics.precision_score(ytest[:,2],ypreds[:,2])
pre0, pre1, pre2
/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
(0.10566749310332786, 0.0, 0.01673193307226771)
coo = ytest.T.dot(ypreds)
coo
array([[3026, 0, 58, 2, 4, 2], [ 311, 0, 1, 1, 0, 0], [1618, 0, 47, 1, 1, 0], [ 98, 0, 0, 1, 0, 0], [1570, 0, 14, 0, 1, 0], [ 267, 0, 1, 0, 1, 0]])
df_coo = pd.DataFrame(coo, columns=labels,index=labels)
df_coo.to_csv('../outputs/keras_confusion_matrix.csv',index=False)
df_coo.style.background_gradient()
# only diagonal should be dark color
toxic | severe_toxic | obscene | threat | insult | identity_hate | |
---|---|---|---|---|---|---|
toxic | 3026 | 0 | 58 | 2 | 4 | 2 |
severe_toxic | 311 | 0 | 1 | 1 | 0 | 0 |
obscene | 1618 | 0 | 47 | 1 | 1 | 0 |
threat | 98 | 0 | 0 | 1 | 0 | 0 |
insult | 1570 | 0 | 14 | 0 | 1 | 0 |
identity_hate | 267 | 0 | 1 | 0 | 1 | 0 |
df_coo2 = df_coo.copy()
df_coo2['Total'] = df_coo2.sum(axis=1)
df_coo2.loc[len(df_coo2),:] = df_coo2.sum(axis=0)
df_coo2.index = df_coo.index.tolist() + ['Total']
df_coo2 = df_coo2.astype(int)
# horizontal is true, vertical is predicted
# df_coo2
toxic | severe_toxic | obscene | threat | insult | identity_hate | Total | |
---|---|---|---|---|---|---|---|
toxic | 3026 | 0 | 58 | 2 | 4 | 2 | 3092 |
severe_toxic | 311 | 0 | 1 | 1 | 0 | 0 | 313 |
obscene | 1618 | 0 | 47 | 1 | 1 | 0 | 1667 |
threat | 98 | 0 | 0 | 1 | 0 | 0 | 99 |
insult | 1570 | 0 | 14 | 0 | 1 | 0 | 1585 |
identity_hate | 267 | 0 | 1 | 0 | 1 | 0 | 269 |
Total | 6890 | 0 | 121 | 5 | 7 | 2 | 7025 |
from util_ds import highlight_rcd
highlight_rcd(df_coo2)
toxic | severe_toxic | obscene | threat | insult | identity_hate | Total | |
---|---|---|---|---|---|---|---|
toxic | 3026 | 0 | 58 | 2 | 4 | 2 | 3092 |
severe_toxic | 311 | 0 | 1 | 1 | 0 | 0 | 313 |
obscene | 1618 | 0 | 47 | 1 | 1 | 0 | 1667 |
threat | 98 | 0 | 0 | 1 | 0 | 0 | 99 |
insult | 1570 | 0 | 14 | 0 | 1 | 0 | 1585 |
identity_hate | 267 | 0 | 1 | 0 | 1 | 0 | 269 |
Total | 6890 | 0 | 121 | 5 | 7 | 2 | 7025 |
from util_multilabel import plot_confusion_matrix
plot_confusion_matrix(coo, target_names=labels,
cmap='Reds',normalize=False)
labels = ['toxic','severe_toxic','obscene',
'threat','insult','identity_hate']
from util_plotly import plot_coo_matrix
plot_coo_matrix(labels,coo)
/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/plotly/graph_objs/_deprecations.py:410: DeprecationWarning: plotly.graph_objs.Margin is deprecated. Please replace it with one of the following more specific types - plotly.graph_objs.layout.Margin
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 4 min 17 secs