In this project we use the data from the Kaggle competition Toxic Comment Classification Challenge by Jigsaw, and we use only its training data. We split this raw training data into our own train and test sets and evaluate model performance on the held-out test set.
The dataset is taken from Wikipedia talk-page edits, and each comment is annotated with six labels: toxic, severe_toxic, obscene, threat, insult, and identity_hate.
This is a multi-label (not multiclass) classification problem: each text row carries all six binary labels, and any number of them, from zero to all six, can be 1 at the same time.
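For concreteness, a minimal sketch of what one multi-label target row can look like (the values here are illustrative, not taken from the dataset):
import numpy as np
labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y_row = np.array([1, 0, 1, 0, 1, 0])  # one comment flagged toxic, obscene, and insulting at once
print(dict(zip(labels, y_row.tolist())))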
Keras Modelling Resources
import os
import sys
import time
time_start_notebook = time.time()
%%capture
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install -U scikit-learn   # the PyPI name is scikit-learn; 'sklearn' is a deprecated stub
    !pip install watermark
    !pip install tqdm
    !pip install scikit-plot
    ## mount google drive
    from google.colab import drive
    drive.mount('/content/drive')
    ## load the data dir
    ## dat_dir is train/test data from github
    colab_dat_dir = 'drive/MyDrive/Colab Notebooks/data/'
    sys.path.append(colab_dat_dir)
    ## Image dir
    colab_img_dir = 'drive/MyDrive/Colab Notebooks/images/'
    if not os.path.isdir(colab_img_dir): os.makedirs(colab_img_dir)
    sys.path.append(colab_img_dir)
    ## Output dir
    colab_out_dir = 'drive/MyDrive/Colab Notebooks/outputs/'
    if not os.path.isdir(colab_out_dir): os.makedirs(colab_out_dir)
    sys.path.append(colab_out_dir)
# data science
import numpy as np
import pandas as pd
from tqdm import tqdm
# visualization
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# mixed
import os
import time
from pprint import pprint
import joblib
# random state
SEED=100
np.random.seed(SEED)
# machine learning
import sklearn
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
# deep learning
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, load_model
from keras.optimizers import Adam
from keras.layers import Input, Dense, GRU, Dropout
from keras.layers import Bidirectional, Embedding, SpatialDropout1D, concatenate, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers.convolutional import Conv1D
from keras.callbacks import Callback
from keras.callbacks import EarlyStopping, ModelCheckpoint
# nlp
import nltk
import gensim.models as gsm
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from difflib import SequenceMatcher
from nltk.corpus import words as nltk_words
# model eval
import scikitplot as skplt
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2020-12-01
CPython 3.7.7
IPython 7.19.0
compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit
joblib 0.17.0
keras 2.4.3
nltk 3.4.4
sklearn 0.23.2
numpy 1.19.4
matplotlib 3.2.1
seaborn 0.10.1
pandas 1.1.1
watermark 2.0.2
scikitplot 0.3.7
def show_methods(obj, ncols=4):
    lst = [i for i in dir(obj) if i[0] != '_']
    df = pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
    return df
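A quick usage example (the exact rows returned depend on the object inspected):
show_methods(np.random).head()  # public attributes of np.random, laid out over 4 columns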
# data
dat_dir = os.path.join('..','data')
path_data_raw = os.path.join(dat_dir, 'raw', 'jigsaw_toxic.csv.zip')
path_data_train = os.path.join(dat_dir, 'raw', 'train.csv.zip')
path_data_test = os.path.join(dat_dir, 'raw', 'test.csv.zip')
path_data_sample = os.path.join(dat_dir, 'raw', 'sample.csv')
compression = 'zip'
if ENV_COLAB:
    dat_dir = "https://github.com/bhishanpdl/Datasets/blob/master/Projects/Jigsaw_Toxic_Comment_Classification/"
    r = '?raw=true'
    path_data_train = dat_dir + 'train.csv.zip' + r
    path_data_test = dat_dir + 'test.csv.zip' + r
    compression = 'zip'
df_train = pd.read_csv(path_data_train,compression=compression)
df_test = pd.read_csv(path_data_test,compression=compression)
print(f"""
df_train : {df_train.shape}
df_test : {df_test.shape}
Features: {df_train.columns.tolist()}
""")
display(df_train.head(2).append(df_train.tail(2)))
df_train : (127656, 8) df_test : (31915, 8) Features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
| | id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate |
|---|---|---|---|---|---|---|---|---|
| 0 | 8d603d50affa1126 | "\nYes, aside, thank you for trying to answer ... | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 8fb3576937b9e0d0 | March 2010 (UTC)\n\nThanks! and understood abo... | 0 | 0 | 0 | 0 | 0 | 0 |
| 127654 | 95df37d4a69b607d | I am assuming that there is no point trying to... | 0 | 0 | 0 | 0 | 0 | 0 |
| 127655 | 668ba87c1b6a3f31 | "\nPlus, take a look! Have I made any outing ... | 0 | 0 | 0 | 0 | 0 | 0 |
# debug
# df_train = df_train.sample(n=50_000,random_state=SEED)
dir_embed_file = os.path.join(os.path.expanduser('~'),'Datasets','NLP')
if ENV_COLAB:
    dir_embed_file = colab_dat_dir + 'NLP'
print(dir_embed_file)
path_embed_file = os.path.join(dir_embed_file,'crawl-300d-2M.vec')
print(path_embed_file)
/Users/poudel/Datasets/NLP/crawl-300d-2M.vec
!ls "$dir_embed_file"
crawl-300d-2M.vec crawl-300d-2M.vec.zip
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')
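Each line of the FastText .vec file is a word followed by its space-separated vector components, so unpacking a split line yields a (word, vector) pair. A toy check with a made-up 3-dimensional line (the real vectors are 300-d):
word, vec = get_coefs(*"hello 0.1 0.2 0.3".split())
print(word, vec.dtype, vec.shape)  # hello float32 (3,)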
def standardize_text(df, text_field):
    """
    Use a few regular expressions to clean up our data.
    """
    # regex=True is passed explicitly; newer pandas versions default str.replace to literal matching
    df[text_field] = df[text_field].str.replace(r"http\S+", "", regex=True)
    df[text_field] = df[text_field].str.replace(r"http", "", regex=True)
    df[text_field] = df[text_field].str.replace(r"@\S+", "", regex=True)
    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ", regex=True)
    df[text_field] = df[text_field].str.replace(r"@", "at", regex=True)
    df[text_field] = df[text_field].str.lower()
    return df
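A quick before/after on a toy frame (illustrative input, not from the dataset):
toy = pd.DataFrame({'comment_text': ["Check http://example.com NOW!!! @someuser"]})
print(standardize_text(toy, 'comment_text')['comment_text'][0])
# 'check  now!!! '  -- URL and @-mention stripped, text lowercased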
def similarity(s1, s2):
    """
    Find the similarity between two strings.
    Return a measure of the sequences' similarity (float in [0,1]).
    """
    return SequenceMatcher(None, s1, s2).ratio()
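For example, two near-identical spellings score close to 1 (the ratio is 2*matches/total characters):
print(similarity("color", "colour"))  # 0.909...: 2*5 matching chars / 11 total chars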
def is_english_word(dictionary, word):
    """
    Check if a word is an English word.
    """
    try:
        x = dictionary[word]
        return True
    except KeyError:
        return False
def normalize_bad_word(word_list, dictionary, bad_words, threshold):
    """
    Return a list of normalized words.

    Note: `threshold` is unused while the similarity() check below stays commented out.
    """
    res = []
    for word in word_list:
        found = False
        normalized_bad_word = ""
        for badword in bad_words:
            # if similarity(badword, word) > threshold:
            if badword in word:
                found = True
                normalized_bad_word = badword
                break
        if found:
            res.append(normalized_bad_word)
        elif is_english_word(dictionary, word):
            res.append(word)
    return res
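A worked example, assuming 'nice' is in the FastText vocabulary and 'asdfqwerty' is not (both assumptions, since the result depends on the loaded dictionary; a stand-in dict makes the behavior explicit):
toy_dict = {'nice': None}  # stand-in for the FastText vocabulary
print(normalize_bad_word(['fuckers', 'nice', 'asdfqwerty'], toy_dict, ['fuck'], 0.5))
# ['fuck', 'nice'] -- 'fuckers' collapses to 'fuck'; the out-of-vocabulary token is dropped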
%%time
# TRAIN
processing = """
1. fill nans by _NA_
2. standardize text
(r"http\S+", "")
(r"http", "")
(r"@\S+", "")
(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
(r"@", "at")
lower()
3. tokenize (RegexpTokenizer(r'[a-zA-Z]+'))
4. remove stopwords
5. normalize bad words
6. save clean file
7. combine tokens by " "
"""
def create_tokens(dfx, col_txt,
                  tokenizer,
                  stop_words,
                  dictionary,
                  bad_words,
                  print_=True
                  ):
    dfx = dfx.copy()
    # fill nans
    dfx[col_txt] = dfx[col_txt].fillna('_NA_')
    # standardize
    dfx = standardize_text(dfx, col_txt)
    # create tokens column
    dfx["tokens"] = dfx[col_txt].apply(tokenizer.tokenize)
    # remove stopwords
    dfx["tokens"] = dfx["tokens"].apply(
        lambda vec: [word for word in vec if word not in stop_words])
    # normalize bad words
    dfx["tokens"] = dfx["tokens"].apply(
        lambda vec: normalize_bad_word(vec, dictionary, bad_words, 0.5))
    # print info
    if print_:
        all_words = [word for tokens in dfx["tokens"] for word in tokens]
        sent_lengths = [len(tokens) for tokens in dfx["tokens"]]
        VOCAB = sorted(list(set(all_words)))
        print(f"""
        Total words : {len(all_words):,}
        Vocab size : {len(VOCAB):,}
        Len longest sent: {max(sent_lengths)}
        Mean sent length: {np.mean(sent_lengths):.2f}
        """)
    # join words by space
    dfx["tokens"] = dfx["tokens"].apply(lambda vec: ' '.join(vec))
    return dfx
CPU times: user 3 µs, sys: 0 ns, total: 3 µs Wall time: 4.05 µs
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to [nltk_data] /Users/poudel/nltk_data... [nltk_data] Package stopwords is already up-to-date!
True
%%time
print('Preparing Dictionary...')
# Read the FastText word vectors (space delimited strings) into a dictionary from word->vector
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(path_embed_file, encoding="utf8"))
print("embeddings_index size: ", len(embeddings_index))
dictionary = dict.fromkeys(embeddings_index, None)
print("Dictionary size: ", len(dictionary))
# Wall time: 1min 49s
Preparing Dictionary... embeddings_index size: 2000000 Dictionary size: 2000000 CPU times: user 1min 49s, sys: 3.89 s, total: 1min 53s Wall time: 1min 59s
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
bad_words = ['sex', 'suck', 'anal', 'penis', 'shit', 'fuck', 'damn',
             'bitch', 'crap', 'piss', 'dick', 'darn', 'cock', 'pussy', 'ass',
             'asshole', 'fag', 'bastard', 'slut', 'douche', 'bastard', 'darn',
             'bloody', 'bugger', 'bollocks', 'arsehole', 'nigger', 'nigga',
             'moron', 'gay', 'antisemitism', 'anti', 'nazi', 'poop']
# note: 'bastard' and 'darn' each appear twice, so only 32 entries are unique
print("num bad words: ", len(bad_words))
num bad words: 34
# This will create new column "tokens"
df_train = create_tokens(df_train,
                         col_txt='comment_text',
                         tokenizer=tokenizer,
                         stop_words=stop_words,
                         dictionary=dictionary,
                         bad_words=bad_words,
                         print_=True
                         )
df_test = create_tokens(df_test,
                        col_txt='comment_text',
                        tokenizer=tokenizer,
                        stop_words=stop_words,
                        dictionary=dictionary,
                        bad_words=bad_words,
                        print_=True
                        )
df_train.head()
(train)
Total words : 4,279,556
Vocab size : 86,638
Len longest sent: 1250
Mean sent length: 33.52

(test)
Total words : 1,044,479
Vocab size : 46,933
Len longest sent: 1250
Mean sent length: 32.73
| | id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate | tokens |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 8d603d50affa1126 | "\nyes, aside, thank you for trying to answer ... | 0 | 0 | 0 | 0 | 0 | 0 | yes aside thank trying answer block related co... |
| 1 | 8fb3576937b9e0d0 | march 2010 (utc)\n\nthanks! and understood abo... | 0 | 0 | 0 | 0 | 0 | 0 | march utc thanks understood tags advice better... |
| 2 | 379440e04fb68e27 | "\n\n the outfield \n\nhahaha compassion is ... | 0 | 0 | 0 | 0 | 0 | 0 | outfield hahaha ass vested considering er kind... |
| 3 | 6be4446aac8ae028 | opposition is a source of strength i believe ... | 0 | 0 | 0 | 0 | 0 | 0 | opposition source strength believe al said |
| 4 | 1a2ff7ed958506a3 | please discontinue making those unsupported ch... | 0 | 0 | 0 | 0 | 0 | 0 | please discontinue making unsupported changes ... |
max_features = 80_000  # how many unique words to use (i.e. number of rows in the embedding matrix)
maxlen = 150           # max number of words in a comment to use
embed_size = 300       # how big each word vector is
%%time
from keras.preprocessing.text import Tokenizer
# Turn each comment into a list of word indexes
# of equal length (with truncation or padding as needed)
list_classes = ["toxic", "severe_toxic", "obscene",
"threat", "insult", "identity_hate"]
list_sentences_train = list(df_train["tokens"].to_numpy())
list_sentences_test = list(df_test["tokens"].to_numpy())
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train)
+ list(list_sentences_test))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
Xtrain_text = pad_sequences(list_tokenized_train, maxlen=maxlen)
Xtest_text = pad_sequences(list_tokenized_test, maxlen=maxlen)
CPU times: user 8.45 s, sys: 74.9 ms, total: 8.53 s Wall time: 9.12 s
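To see what texts_to_sequences + pad_sequences do, here is a toy run (word indices depend on the fitted corpus, so the exact numbers are illustrative):
toy_tok = Tokenizer(num_words=10)
toy_tok.fit_on_texts(["good movie", "bad movie"])
seqs = toy_tok.texts_to_sequences(["good movie", "bad"])
print(pad_sequences(seqs, maxlen=4))
# [[0 0 2 1]
#  [0 0 0 3]]  -- sequences are left-padded with 0, which is why token 0 is reserved for padding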
%%time
path_missing_words = '../outputs/word_not_found.csv'
# BUILD EMBEDDING MATRIX
print('Preparing embedding matrix...')
word_index = tokenizer.word_index
print("word_index size: ", len(word_index))
words_not_found = []
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        embedding_matrix[i-1] = embedding_vector
    else:
        words_not_found.append(word)

if len(words_not_found) > 0:
    print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
    print("sample words not found: ", np.random.choice(words_not_found, 10))
    df_missing_words = pd.DataFrame(words_not_found)
    df_missing_words.to_csv(path_missing_words, header=None, index=False)
# prints
print(f'Number of words: {nb_words}')
Preparing embedding matrix... word_index size: 94298 Number of words: 80000 CPU times: user 278 ms, sys: 378 ms, total: 656 ms Wall time: 708 ms
dict(list(word_index.items())[0: 3])
# word_index numbering starts at 1, not 0, which is why the cell
# above used embedding_matrix[i-1] = embedding_vector
{'article': 1, 'page': 2, 'wikipedia': 3}
dict(list(word_index.items())[-3:])
{'piezo': 94296, 'abouta': 94297, 'andhave': 94298}
len(word_index), embedding_matrix.shape
(94298, (80000, 300))
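One caveat: the Embedding layer looks up row t for integer token t, and pad_sequences pads with 0, so with the i-1 shift above the padding token shares a row with the most frequent word and every lookup is offset by one. The more common convention, shown here only as an alternative sketch (not what this notebook ran), reserves row 0 for padding:
# alternative sketch: keep row 0 for the padding token so row i matches token i
nb_words_alt = min(max_features, len(word_index)) + 1
embedding_matrix_alt = np.zeros((nb_words_alt, embed_size))
for word, i in word_index.items():
    if i >= nb_words_alt:
        continue
    vec = embeddings_index.get(word)
    if vec is not None:
        embedding_matrix_alt[i] = vec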
from keras.callbacks import Callback

class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(RocAucEvaluation, self).__init__()  # was super(Callback, self), which skips Callback's own __init__
        self.interval = interval
        self.X_val, self.y_val = validation_data
        # we must have variable names X_val and y_val

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))
def get_model(maxlen, max_features, embed_size, embedding_matrix,
              lr=0.0, lr_d=0.0, units=0, dr=0.0):
    """Get the tensorflow model.

    References:
    - http://konukoii.com/blog/2018/02/19/twitter-sentiment-analysis-using-combined-lstm-cnn-models/
    - For text, CNN -> LSTM (or GRU) doesn't seem to work well, but LSTM -> CNN works really well.
    """
    inp = Input(shape=(maxlen,))
    # Ref: https://keras.io/api/layers/core_layers/embedding/
    # embedding needs input_dim = maximum integer index + 1.
    x = Embedding(max_features, embed_size,
                  weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(units, return_sequences=True, dropout=dr, recurrent_dropout=dr))(x)
    x = Conv1D(filters=64, kernel_size=2, padding='valid', kernel_initializer="he_uniform")(x)
    x = Dropout(dr)(x)
    # x = MaxPooling1D(pool_size=2)(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(6, activation="sigmoid")(conc)
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=lr, decay=lr_d),
                  metrics=['accuracy'])
    return model
model = get_model(maxlen, nb_words, embed_size, embedding_matrix,
                  lr=1e-3, lr_d=0, units=128, dr=0.5)
model.summary()
Model: "functional_1" __________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ================================================================================================== input_1 (InputLayer) [(None, 150)] 0 __________________________________________________________________________________________________ embedding (Embedding) (None, 150, 300) 24000000 input_1[0][0] __________________________________________________________________________________________________ spatial_dropout1d (SpatialDropo (None, 150, 300) 0 embedding[0][0] __________________________________________________________________________________________________ bidirectional (Bidirectional) (None, 150, 256) 330240 spatial_dropout1d[0][0] __________________________________________________________________________________________________ conv1d (Conv1D) (None, 149, 64) 32832 bidirectional[0][0] __________________________________________________________________________________________________ dropout (Dropout) (None, 149, 64) 0 conv1d[0][0] __________________________________________________________________________________________________ global_average_pooling1d (Globa (None, 64) 0 dropout[0][0] __________________________________________________________________________________________________ global_max_pooling1d (GlobalMax (None, 64) 0 dropout[0][0] __________________________________________________________________________________________________ concatenate (Concatenate) (None, 128) 0 global_average_pooling1d[0][0] global_max_pooling1d[0][0] __________________________________________________________________________________________________ dense (Dense) (None, 6) 774 concatenate[0][0] ================================================================================================== Total params: 24,363,846 Trainable params: 24,363,846 Non-trainable params: 0 __________________________________________________________________________________________________
Xtrain = Xtrain_text
Xtest = Xtest_text
list_classes = ["toxic", "severe_toxic", "obscene",
"threat", "insult", "identity_hate"]
ytrain = df_train[list_classes].to_numpy()
ytest = df_test[list_classes].to_numpy()
print(f"""
df_train :{df_train.shape} df_test:{df_test.shape}
Xtrain :{Xtrain.shape} Xtest :{Xtest.shape}
ytrain :{ytrain.shape} ytest :{ytest.shape}
""")
df_train :(127656, 9) df_test:(31915, 9) Xtrain :(127656, 150) Xtest :(31915, 150) ytrain :(127656, 6) ytest :(31915, 6)
%%time
batch_size = 128
epochs = 2
interval=1
path_model_checkpoint = '../outputs/keras_gru_fasttext_badwords.h5'
X_tra, X_val, y_tra, y_val = train_test_split(Xtrain, ytrain,
                                              test_size=0.1, random_state=SEED)
validation_data = (X_val, y_val)  # names must be X_val and y_val
                                  # as used in class RocAucEvaluation
# callbacks
cb_rocauc = RocAucEvaluation(
    validation_data=validation_data,
    interval=interval)
cb_early = EarlyStopping(
    monitor='val_accuracy',   # tf.keras logs 'val_accuracy', not the old 'val_acc'
    patience=5,
    mode='max',
    verbose=1)
cb_check = ModelCheckpoint(
    path_model_checkpoint,
    monitor='val_accuracy',
    save_best_only=True,
    mode='max',
    verbose=0)
callbacks = [cb_rocauc, cb_early, cb_check]

path_keras_model = '../outputs/keras_fasttext_model.h5'
DO_TRAIN = False
if DO_TRAIN:
    history = model.fit(X_tra, y_tra,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_data=validation_data,
                        callbacks=callbacks,
                        verbose=2,
                        shuffle=True
                        )
    model.save(path_keras_model)
# Wall time: 20min 18s
CPU times: user 40.9 ms, sys: 5.86 ms, total: 46.8 ms Wall time: 75.2 ms
!du -sh $path_keras_model
280M ../outputs/keras_fasttext_model.h5
%%time
path_yprobs_keras = '../outputs/yprobs_keras_fasttext.npz'
if DO_TRAIN:
    model = keras.models.load_model(path_keras_model)
    yprobs = model.predict(Xtest_text, batch_size=1024, verbose=2)
    np.savez_compressed(path_yprobs_keras, yprobs=yprobs)
# Wall time: 1min 33s
CPU times: user 3 µs, sys: 1 µs, total: 4 µs Wall time: 6.2 µs
yprobs = np.load(path_yprobs_keras)['yprobs']
yprobs[:5]
array([[0.01838818, 0.00341764, 0.02119815, 0.00906962, 0.01449135, 0.00419179],
       [0.01054636, 0.00153401, 0.01218945, 0.00453353, 0.00753298, 0.00132418],
       [0.00564146, 0.00059021, 0.00540674, 0.00278851, 0.00426516, 0.00168946],
       [0.00870907, 0.00088844, 0.00853044, 0.00390193, 0.0061962 , 0.0016318 ],
       [0.01302436, 0.00158405, 0.00876519, 0.00363001, 0.00854704, 0.00241932]],
      dtype=float32)
labels = ['toxic','severe_toxic','obscene',
'threat','insult','identity_hate']
ypreds = (yprobs==yprobs.max(axis=1,keepdims=True)).astype(np.int8)
ypreds[:5]
array([[0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0]], dtype=int8)
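Note that this argmax-style rule forces exactly one positive label per row, which is a multiclass decision, not a multi-label one, and it explains much of the poor precision reported below. The usual multi-label rule thresholds each probability independently; a sketch (not what was run for the results shown here):
ypreds_thresh = (yprobs >= 0.5).astype(np.int8)  # per-label cutoff; a row may get 0 to 6 positive labels
print(ypreds_thresh[:5])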
ytest.shape, yprobs.shape, ypreds.shape
((31915, 6), (31915, 6), (31915, 6))
from sklearn.metrics import multilabel_confusion_matrix
mcm = multilabel_confusion_matrix(ytest, ypreds)
mcm
array([[[ 6919, 21904],
        [ 1211,  1881]],

       [[31602,     0],
        [  313,     0]],

       [[23886,  6362],
        [  707,   960]],

       [[31804,    12],
        [   99,     0]],

       [[30234,    96],
        [ 1585,     0]],

       [[30950,   696],
        [  264,     5]]])
ytest.shape[0], mcm.sum(axis=1).sum(axis=1)
(31915, array([31915, 31915, 31915, 31915, 31915, 31915]))
for i in range(6):
    skplt.metrics.plot_confusion_matrix(ytest[:,i], ypreds[:,i], title=labels[i])
r = sklearn.metrics.classification_report(ytest,ypreds)
print(r)
              precision    recall  f1-score   support

           0       0.08      0.61      0.14      3092
           1       0.00      0.00      0.00       313
           2       0.13      0.58      0.21      1667
           3       0.00      0.00      0.00        99
           4       0.00      0.00      0.00      1585
           5       0.01      0.02      0.01       269

   micro avg       0.09      0.41      0.15      7025
   macro avg       0.04      0.20      0.06      7025
weighted avg       0.07      0.41      0.11      7025
 samples avg       0.09      0.05      0.06      7025
/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
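The UndefinedMetricWarning comes from labels that never receive a positive prediction (e.g. severe_toxic above); sklearn 0.23 lets us silence it explicitly via the zero_division parameter:
print(sklearn.metrics.classification_report(ytest, ypreds, zero_division=0))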
# check the evaluation for toxic (0th label)
df_toxic = pd.DataFrame({'ytest': ytest[:,0], 'ypred': ypreds[:,0]})
df_toxic.query('ytest == 1').head(4)
| | ytest | ypred |
|---|---|---|
| 13 | 1 | 0 |
| 22 | 1 | 1 |
| 31 | 1 | 1 |
| 32 | 1 | 0 |
pd.crosstab(df_toxic['ytest'],df_toxic['ypred'])
| ytest \ ypred | 0 | 1 |
|---|---|---|
| 0 | 6919 | 21904 |
| 1 | 1211 | 1881 |
pre0 = sklearn.metrics.precision_score(ytest[:,0],ypreds[:,0])
pre1 = sklearn.metrics.precision_score(ytest[:,1],ypreds[:,1])
pre2 = sklearn.metrics.precision_score(ytest[:,2],ypreds[:,2])
pre0, pre1, pre2
/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
(0.07908345595963842, 0.0, 0.13111171810980607)
coo = ytest.T.dot(ypreds)
coo
array([[1881,    0, 1193,    2,    2,   14],
       [ 138,    0,  174,    0,    0,    1],
       [ 703,    0,  960,    0,    0,    4],
       [  76,    0,   21,    0,    0,    2],
       [ 894,    0,  688,    0,    0,    3],
       [ 208,    0,   56,    0,    0,    5]])
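Here ytest.T.dot(ypreds) counts, for each (true label, predicted label) pair, how many rows have both bits set, i.e. a label co-occurrence matrix. A tiny worked example:
yt = np.array([[1, 0],
               [1, 1]])  # 2 rows, 2 true labels
yp = np.array([[0, 1],
               [1, 1]])  # predictions for the same rows
print(yt.T.dot(yp))
# [[1 2]
#  [1 1]]  -- e.g. entry (0, 1) = 2: true label 0 and predicted label 1 co-occur in both rows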
df_coo = pd.DataFrame(coo, columns=labels,index=labels)
df_coo.to_csv('../outputs/keras_confusion_matrix.csv',index=False)
df_coo.style.background_gradient()
| | toxic | severe_toxic | obscene | threat | insult | identity_hate |
|---|---|---|---|---|---|---|
| toxic | 1881 | 0 | 1193 | 2 | 2 | 14 |
| severe_toxic | 138 | 0 | 174 | 0 | 0 | 1 |
| obscene | 703 | 0 | 960 | 0 | 0 | 4 |
| threat | 76 | 0 | 21 | 0 | 0 | 2 |
| insult | 894 | 0 | 688 | 0 | 0 | 3 |
| identity_hate | 208 | 0 | 56 | 0 | 0 | 5 |
df_coo2 = df_coo.copy()
df_coo2['Total'] = df_coo2.sum(axis=1)
df_coo2.loc[len(df_coo2),:] = df_coo2.sum(axis=0)
df_coo2.index = df_coo.index.tolist() + ['Total']
df_coo2 = df_coo2.astype(int)
# rows are true labels, columns are predicted labels
df_coo2
| | toxic | severe_toxic | obscene | threat | insult | identity_hate | Total |
|---|---|---|---|---|---|---|---|
| toxic | 1881 | 0 | 1193 | 2 | 2 | 14 | 3092 |
| severe_toxic | 138 | 0 | 174 | 0 | 0 | 1 | 313 |
| obscene | 703 | 0 | 960 | 0 | 0 | 4 | 1667 |
| threat | 76 | 0 | 21 | 0 | 0 | 2 | 99 |
| insult | 894 | 0 | 688 | 0 | 0 | 3 | 1585 |
| identity_hate | 208 | 0 | 56 | 0 | 0 | 5 | 269 |
| Total | 3900 | 0 | 3092 | 2 | 2 | 29 | 7025 |
from util_ds import highlight_rcd
highlight_rcd(df_coo2)
| | toxic | severe_toxic | obscene | threat | insult | identity_hate | Total |
|---|---|---|---|---|---|---|---|
| toxic | 1881 | 0 | 1193 | 2 | 2 | 14 | 3092 |
| severe_toxic | 138 | 0 | 174 | 0 | 0 | 1 | 313 |
| obscene | 703 | 0 | 960 | 0 | 0 | 4 | 1667 |
| threat | 76 | 0 | 21 | 0 | 0 | 2 | 99 |
| insult | 894 | 0 | 688 | 0 | 0 | 3 | 1585 |
| identity_hate | 208 | 0 | 56 | 0 | 0 | 5 | 269 |
| Total | 3900 | 0 | 3092 | 2 | 2 | 29 | 7025 |
from util_multilabel import plot_confusion_matrix
plot_confusion_matrix(coo, target_names=labels,
cmap='Reds',normalize=False)
from util_plotly import plot_coo_matrix
plot_coo_matrix(labels,coo)
/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/plotly/graph_objs/_deprecations.py:410: DeprecationWarning: plotly.graph_objs.Margin is deprecated. Please replace it with one of the following more specific types - plotly.graph_objs.layout.Margin
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 2 min 41 secs