In this project we use the training data from the Kaggle competition Toxic Comment Classification Challenge by Jigsaw. We split this raw training data into our own train and test sets and evaluate model performance on the held-out test set.
The dataset consists of comments from Wikipedia talk-page edits, each annotated with the following labels: toxic, severe_toxic, obscene, threat, insult, identity_hate.
This is a multi-label (not multi-class) classification problem: each text row carries six binary labels, and any number of them (including none) can be 1 at the same time.
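To make the multi-label structure concrete, here is a minimal toy sketch (illustrative data only, not the actual dataset) showing that a single comment may carry zero, one, or several labels:
import pandas as pd

# Toy frame (not the real data): each row is a comment, each column a binary label.
toy = pd.DataFrame({'toxic':         [1, 0, 1],
                    'severe_toxic':  [0, 0, 1],
                    'obscene':       [1, 0, 1],
                    'threat':        [0, 0, 0],
                    'insult':        [0, 0, 1],
                    'identity_hate': [0, 0, 0]})
print(toy.sum(axis=1).tolist())  # [2, 0, 4] -> number of active labels per comment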
Keras Modelling Resources
import os
import sys
import time
time_start_notebook = time.time()
%%capture
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install -U scikit-learn
    !pip install watermark
    !pip install tqdm
    !pip install scikit-plot
    ## mount google drive
    from google.colab import drive
    drive.mount('/content/drive')
    ## load the data dir
    ## dat_dir is train/test data from github
    colab_dat_dir = 'drive/MyDrive/Colab Notebooks/data/'
    sys.path.append(colab_dat_dir)
    ## Image dir
    colab_img_dir = 'drive/MyDrive/Colab Notebooks/images/'
    if not os.path.isdir(colab_img_dir): os.makedirs(colab_img_dir)
    sys.path.append(colab_img_dir)
    ## Output dir
    colab_out_dir = 'drive/MyDrive/Colab Notebooks/outputs/'
    if not os.path.isdir(colab_out_dir): os.makedirs(colab_out_dir)
    sys.path.append(colab_out_dir)
# data science
import numpy as np
import pandas as pd
from tqdm import tqdm
# visualization
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# mixed
import os
import time
from pprint import pprint
import joblib
# random state
SEED=100
np.random.seed(SEED)
# machine learning
import sklearn
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
# deep learning
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, load_model
from keras.optimizers import Adam
from keras.layers import Input, Dense, GRU, Dropout
from keras.layers import Bidirectional, Embedding, SpatialDropout1D, concatenate, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers.convolutional import Conv1D
from keras.callbacks import Callback
from keras.callbacks import EarlyStopping, ModelCheckpoint
# nlp
import nltk
import gensim.models as gsm
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from difflib import SequenceMatcher
from nltk.corpus import words as nltk_words
# model eval
import scikitplot as skplt
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2020-12-01

CPython 3.7.7
IPython 7.19.0

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

joblib     0.17.0
keras      2.4.3
nltk       3.4.4
sklearn    0.23.2
numpy      1.19.4
matplotlib 3.2.1
seaborn    0.10.1
pandas     1.1.1
watermark  2.0.2
scikitplot 0.3.7
def show_methods(obj, ncols=4):
    lst = [i for i in dir(obj) if i[0]!='_' ]
    df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
    return df
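For example, the helper can be used like this (illustrative call; the exact listing depends on the installed library versions):
# List the public attributes of a pandas DataFrame in 4 columns.
show_methods(pd.DataFrame, ncols=4).head()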
# data
dat_dir = os.path.join('..','data')
path_data_raw = os.path.join(dat_dir, 'raw', 'jigsaw_toxic.csv.zip')
path_data_train = os.path.join(dat_dir, 'raw', 'train.csv.zip')
path_data_test = os.path.join(dat_dir, 'raw', 'test.csv.zip')
path_data_sample = os.path.join(dat_dir, 'raw', 'sample.csv')
compression = 'zip'
if ENV_COLAB:
    dat_dir = "https://github.com/bhishanpdl/Datasets/blob/master/Projects/Jigsaw_Toxic_Comment_Classification/"
    r = '?raw=true'
    path_data_train = dat_dir + 'train.csv.zip' + r
    path_data_test = dat_dir + 'test.csv.zip' + r
    compression = 'zip'
df_train = pd.read_csv(path_data_train,compression=compression)
df_test = pd.read_csv(path_data_test,compression=compression)
print(f"""
df_train  : {df_train.shape}
df_test   : {df_test.shape}
Features: {df_train.columns.tolist()}
""")
display(df_train.head(2).append(df_train.tail(2)))
df_train  : (127656, 8)
df_test   : (31915, 8)
Features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
| | id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate |
|---|---|---|---|---|---|---|---|---|
| 0 | 8d603d50affa1126 | "\nYes, aside, thank you for trying to answer ... | 0 | 0 | 0 | 0 | 0 | 0 | 
| 1 | 8fb3576937b9e0d0 | March 2010 (UTC)\n\nThanks! and understood abo... | 0 | 0 | 0 | 0 | 0 | 0 | 
| 127654 | 95df37d4a69b607d | I am assuming that there is no point trying to... | 0 | 0 | 0 | 0 | 0 | 0 | 
| 127655 | 668ba87c1b6a3f31 | "\nPlus, take a look! Have I made any outing ... | 0 | 0 | 0 | 0 | 0 | 0 | 
# debug
# df_train = df_train.sample(n=50_000,random_state=SEED)
dir_embed_file = os.path.join(os.path.expanduser('~'),'Datasets','NLP')
if ENV_COLAB:
    dir_embed_file = colab_dat_dir + 'NLP'
    print(dir_embed_file)
path_embed_file = os.path.join(dir_embed_file,'crawl-300d-2M.vec')
print(path_embed_file)
/Users/poudel/Datasets/NLP/crawl-300d-2M.vec
!ls "$dir_embed_file"
crawl-300d-2M.vec crawl-300d-2M.vec.zip
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')
def standardize_text(df, text_field):
    """
    Use a few regular expressions to clean up our text data.
    """
    df[text_field] = df[text_field].str.replace(r"http\S+", "", regex=True)
    df[text_field] = df[text_field].str.replace(r"http", "", regex=True)
    df[text_field] = df[text_field].str.replace(r"@\S+", "", regex=True)
    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ", regex=True)
    df[text_field] = df[text_field].str.replace(r"@", "at", regex=True)
    df[text_field] = df[text_field].str.lower()
    return df
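A quick illustrative run on a throwaway frame (toy text, not the competition data):
# URLs and @-mentions are removed, characters outside the whitelist become
# spaces, and everything is lower-cased.
toy = pd.DataFrame({'comment_text': ['Check http://example.com NOW!!',
                                     '@user this is GREAT :)']})
standardize_text(toy, 'comment_text')['comment_text'].tolist()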
def similarity(s1, s2):
    """
    Find Similarity between two strings.
    Return a measure of the sequences' similarity (float in [0,1]).
    """
    return SequenceMatcher(None, s1, s2).ratio()
def is_english_word(dictionary, word):
    """
    Check whether a word exists in the embedding vocabulary
    (used here as a proxy for being a valid English word).
    """
    return word in dictionary
def normalize_bad_word(word_list, dictionary, bad_words, threshold):
    """
    Return a list of normalized words.

    Any token that contains a known bad word is replaced by that bad word;
    remaining tokens are kept only if they appear in the embedding vocabulary.
    NOTE: `threshold` is currently unused because the similarity-based match
    is commented out in favour of a simple substring check.
    """
    res = []
    for word in word_list:
        found = False
        normalized_bad_word = ""
        for badword in bad_words:
            # if similarity(badword, word) > threshold:
            if badword in word:
                found = True
                normalized_bad_word = badword
                break
        if found:
            res.append(normalized_bad_word)
        elif is_english_word(dictionary, word):
            res.append(word)
    return res
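An illustrative call with a toy vocabulary standing in for the FastText dictionary: tokens containing a known bad word are mapped back to it, and out-of-vocabulary tokens that match nothing are dropped.
# Toy vocabulary and bad-word list for illustration only.
toy_dictionary = {'you': None, 'really': None}
toy_bad_words = ['suck', 'crap']
normalize_bad_word(['you', 'really', 'suckkk', 'zzzzz'],
                   toy_dictionary, toy_bad_words, 0.5)
# -> ['you', 'really', 'suck']   ('zzzzz' matches nothing and is dropped)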
%%time
# TRAIN
processing = """
1. fill nans by _NA_
2. standardize text
(r"http\S+", "")
(r"http", "")
(r"@\S+", "")
(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
(r"@", "at")
lower()
3. tokenize (RegexpTokenizer(r'[a-zA-Z]+'))
4. remove stopwords
5. normalize bad words
6. save clean file
7. combine tokens by " "
"""
def create_tokens(dfx,col_txt,
                  tokenizer,
                  stop_words,
                  dictionary,
                  bad_words,
                  print_=True
                  ):
    dfx = dfx.copy()
    # fill nans
    dfx[col_txt] = dfx[col_txt].fillna('_NA_')
    
    # standardize
    dfx = standardize_text(dfx, col_txt)
    
    # create tokens column
    dfx["tokens"] = dfx[col_txt].apply(tokenizer.tokenize)
    # remove stopwords
    dfx["tokens"] = dfx["tokens"].apply(
        lambda vec: [word for word in vec if word not in stop_words])
    # normalize bad words    
    dfx["tokens"] = dfx["tokens"].apply(
        lambda vec: normalize_bad_word(vec, dictionary, bad_words, 0.5))
    # print info
    if print_:
        all_words = [word for tokens in dfx["tokens"] for word in tokens]
        sent_lengths = [len(tokens) for tokens in dfx["tokens"]]
        VOCAB = sorted(list(set(all_words)))
        
        print(f"""
        Total words     : {len(all_words):,}
        Vocab size      : {len(VOCAB):,}
        Len longest sent: {max(sent_lengths)}
        Mean sent length: {np.mean(sent_lengths):.2f}
        """)
    # join words by space
    dfx["tokens"] = dfx["tokens"].apply(lambda vec :' '.join(vec))
    
    return dfx
CPU times: user 3 µs, sys: 0 ns, total: 3 µs Wall time: 4.05 µs
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/poudel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
True
%%time
print('Preparing Dictionary...')
# Read the FastText word vectors (space delimited strings) into a dictionary from word->vector
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(path_embed_file, encoding="utf8"))
print("embeddings_index size: ", len(embeddings_index))
dictionary = dict.fromkeys(embeddings_index, None)
print("Dictionary size: ", len(dictionary))
# Wall time: 1min 49s
Preparing Dictionary...
embeddings_index size:  2000000
Dictionary size:  2000000
CPU times: user 1min 49s, sys: 3.89 s, total: 1min 53s
Wall time: 1min 59s
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
bad_words = ['sex', 'suck', 'anal', 'penis', 'shit', 'fuck', 'damn',
    'bitch', 'crap', 'piss', 'dick', 'darn', 'cock', 'pussy','ass',
    'asshole', 'fag', 'bastard', 'slut', 'douche','bastard', 'darn',
    'bloody', 'bugger', 'bollocks', 'arsehole','nigger', 'nigga',
    'moron', 'gay', 'antisemitism', 'anti','nazi', 'poop']
print("num bad words: ", len(bad_words))
num bad words: 34
# This will create new column "tokens"
df_train = create_tokens(df_train,
                         col_txt='comment_text',
                         tokenizer=tokenizer,
                         stop_words=stop_words,
                         dictionary=dictionary,
                         bad_words=bad_words,
                         print_=True
                         )
df_test = create_tokens(df_test,
                         col_txt='comment_text',
                         tokenizer=tokenizer,
                         stop_words=stop_words,
                         dictionary=dictionary,
                         bad_words=bad_words,
                         print_=True
                         )
df_train.head()
        Total words     : 4,279,556
        Vocab size      : 86,638
        Len longest sent: 1250
        Mean sent length: 33.52
        
        Total words     : 1,044,479
        Vocab size      : 46,933
        Len longest sent: 1250
        Mean sent length: 32.73
        
| | id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate | tokens |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 8d603d50affa1126 | "\nyes, aside, thank you for trying to answer ... | 0 | 0 | 0 | 0 | 0 | 0 | yes aside thank trying answer block related co... | 
| 1 | 8fb3576937b9e0d0 | march 2010 (utc)\n\nthanks! and understood abo... | 0 | 0 | 0 | 0 | 0 | 0 | march utc thanks understood tags advice better... | 
| 2 | 379440e04fb68e27 | "\n\n the outfield \n\nhahaha compassion is ... | 0 | 0 | 0 | 0 | 0 | 0 | outfield hahaha ass vested considering er kind... | 
| 3 | 6be4446aac8ae028 | opposition is a source of strength i believe ... | 0 | 0 | 0 | 0 | 0 | 0 | opposition source strength believe al said | 
| 4 | 1a2ff7ed958506a3 | please discontinue making those unsupported ch... | 0 | 0 | 0 | 0 | 0 | 0 | please discontinue making unsupported changes ... | 
max_features = 80_000  # how many unique words to use (i.e. number of rows in the embedding matrix)
maxlen = 150           # max number of words from a comment to use
embed_size = 300       # dimensionality of each word vector
%%time
from keras.preprocessing.text import Tokenizer
# Turn each comment into a list of word indexes
# of equal length (with truncation or padding as needed)
list_classes = ["toxic", "severe_toxic", "obscene",
                "threat", "insult", "identity_hate"]
list_sentences_train = list(df_train["tokens"].to_numpy())
list_sentences_test  = list(df_test["tokens"].to_numpy())
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train)
                       + list(list_sentences_test))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
Xtrain_text = pad_sequences(list_tokenized_train, maxlen=maxlen)
Xtest_text = pad_sequences(list_tokenized_test, maxlen=maxlen)
CPU times: user 8.45 s, sys: 74.9 ms, total: 8.53 s Wall time: 9.12 s
%%time
path_missing_words = '../outputs/word_not_found.csv'
# BUILD EMBEDDING MATRIX    
print('Preparing embedding matrix...')
word_index = tokenizer.word_index
print("word_index size: ", len(word_index))
words_not_found = []
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():        
    if i >= max_features: 
        continue
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0 :
        embedding_matrix[i-1] = embedding_vector
    else:
        words_not_found.append(word)
if(len(words_not_found)>0):        
    print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
    print("sample words not found: ", np.random.choice(words_not_found, 10))
    df_missing_words = pd.DataFrame(words_not_found)
    df_missing_words.to_csv(path_missing_words, header=None, index=False)
    
    
# prints
print(f'Number of words: {nb_words}')
Preparing embedding matrix...
word_index size:  94298
Number of words: 80000
CPU times: user 278 ms, sys: 378 ms, total: 656 ms
Wall time: 708 ms
dict(list(word_index.items())[0: 3]) 
# number starts with 1 not 0, so we use,
# embedding_matrix[i-1] = embedding_vector
{'article': 1, 'page': 2, 'wikipedia': 3}
dict(list(word_index.items())[-3:])
{'piezo': 94296, 'abouta': 94297, 'andhave': 94298}
len(word_index), embedding_matrix.shape
(94298, (80000, 300))
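A small self-contained illustration (toy sentences, not the notebook's data) of why the word index starts at 1: Keras reserves index 0 for the padding added by pad_sequences.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

toy_tok = Tokenizer(num_words=10)
toy_tok.fit_on_texts(['the cat sat', 'the dog'])
print(toy_tok.word_index)   # {'the': 1, 'cat': 2, 'sat': 3, 'dog': 4}
print(pad_sequences(toy_tok.texts_to_sequences(['the dog sat']), maxlen=5))
# [[0 0 1 4 3]] -- leading zeros are padding; real words keep indices >= 1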
from keras.callbacks import Callback

class RocAucEvaluation(Callback):
    """Print the validation ROC AUC at the end of every `interval` epochs."""
    def __init__(self, validation_data=(), interval=1):
        super().__init__()
        self.interval = interval
        # validation_data is an (X_val, y_val) tuple
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch + 1, score))
def get_model(maxlen, max_features, embed_size, embedding_matrix,
              lr=0.0, lr_d=0.0, units=0, dr=0.0):
    """Get the tensorflow model.
    References:
    - http://konukoii.com/blog/2018/02/19/twitter-sentiment-analysis-using-combined-lstm-cnn-models/
    - For text, CNN -> LSTM (or GRU) doesn't seem to work well, but LSTM -> CNN works really well.
    """
    inp = Input(shape=(maxlen,))
    # Ref: https://keras.io/api/layers/core_layers/embedding/
    # embedding needs input_dim =  maximum integer index + 1.
    x = Embedding(max_features, embed_size,
                  weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(units, return_sequences=True, dropout=dr, recurrent_dropout=dr))(x)
    x = Conv1D(filters=64, kernel_size=2, padding='valid', kernel_initializer="he_uniform")(x)
    x = Dropout(dr)(x)
    # x = MaxPooling1D(pool_size=2)(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(6, activation="sigmoid")(conc)
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                    optimizer=Adam(lr=lr, decay=lr_d),
                    metrics=['accuracy']) 
    return model
model = get_model(maxlen,nb_words,embed_size,embedding_matrix,
                  lr=1e-3,lr_d=0,units=128, dr=0.5)
model.summary()
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_1 (InputLayer)            [(None, 150)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 150, 300)     24000000    input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d (SpatialDropo (None, 150, 300)     0           embedding[0][0]                  
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 150, 256)     330240      spatial_dropout1d[0][0]          
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 149, 64)      32832       bidirectional[0][0]              
__________________________________________________________________________________________________
dropout (Dropout)               (None, 149, 64)      0           conv1d[0][0]                     
__________________________________________________________________________________________________
global_average_pooling1d (Globa (None, 64)           0           dropout[0][0]                    
__________________________________________________________________________________________________
global_max_pooling1d (GlobalMax (None, 64)           0           dropout[0][0]                    
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 128)          0           global_average_pooling1d[0][0]   
                                                                 global_max_pooling1d[0][0]       
__________________________________________________________________________________________________
dense (Dense)                   (None, 6)            774         concatenate[0][0]                
==================================================================================================
Total params: 24,363,846
Trainable params: 24,363,846
Non-trainable params: 0
__________________________________________________________________________________________________
Xtrain = Xtrain_text
Xtest  = Xtest_text
list_classes = ["toxic", "severe_toxic", "obscene",
                "threat", "insult", "identity_hate"]
ytrain = df_train[list_classes].to_numpy()
ytest  = df_test[list_classes].to_numpy()
print(f"""
df_train :{df_train.shape}   df_test:{df_test.shape}
Xtrain   :{Xtrain.shape} Xtest  :{Xtest.shape}
ytrain   :{ytrain.shape}   ytest  :{ytest.shape}
 
""")
df_train :(127656, 9)   df_test:(31915, 9)
Xtrain   :(127656, 150) Xtest  :(31915, 150)
ytrain   :(127656, 6)   ytest  :(31915, 6)
%%time
batch_size = 128
epochs = 2
interval=1
path_model_checkpoint = '../outputs/keras_gru_fasttext_badwords.h5'
X_tra, X_val, y_tra, y_val = train_test_split(Xtrain, ytrain,
                                        test_size=0.1, random_state=SEED)
validation_data = (X_val, y_val)  # passed as an (X, y) tuple to RocAucEvaluation
# callbacks
cb_rocauc = RocAucEvaluation(
    validation_data=validation_data,
    interval=interval)
cb_early = EarlyStopping(
            monitor='val_accuracy',  # keras 2.4 logs 'val_accuracy', not 'val_acc'
            patience=5,
            mode='max',
            verbose=1)
cb_check = ModelCheckpoint(
            path_model_checkpoint,
            monitor='val_accuracy',
            save_best_only=True,
            mode='max',
            verbose=0)
callbacks = [cb_rocauc, cb_early, cb_check]
path_keras_model = '../outputs/keras_fasttext_model.h5'
DO_TRAIN = False
if DO_TRAIN:
    history = model.fit(X_tra, y_tra,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_data=validation_data,
                        callbacks=callbacks,
                        verbose=2,
                        shuffle=True
                       )
    model.save(path_keras_model)
# Wall time: 20min 18s
CPU times: user 40.9 ms, sys: 5.86 ms, total: 46.8 ms Wall time: 75.2 ms
!du -sh $path_keras_model
280M ../outputs/keras_fasttext_model.h5
%%time
path_yprobs_keras = '../outputs/yprobs_keras_fasttext.npz'
if DO_TRAIN:
    model = keras.models.load_model(path_keras_model)
    yprobs = model.predict(Xtest_text,batch_size=1024,verbose=2)
    np.savez_compressed(path_yprobs_keras, yprobs=yprobs)
# Wall time: 1min 33s
CPU times: user 3 µs, sys: 1 µs, total: 4 µs Wall time: 6.2 µs
yprobs = np.load(path_yprobs_keras)['yprobs']
yprobs[:5]
array([[0.01838818, 0.00341764, 0.02119815, 0.00906962, 0.01449135,
        0.00419179],
       [0.01054636, 0.00153401, 0.01218945, 0.00453353, 0.00753298,
        0.00132418],
       [0.00564146, 0.00059021, 0.00540674, 0.00278851, 0.00426516,
        0.00168946],
       [0.00870907, 0.00088844, 0.00853044, 0.00390193, 0.0061962 ,
        0.0016318 ],
       [0.01302436, 0.00158405, 0.00876519, 0.00363001, 0.00854704,
        0.00241932]], dtype=float32)
labels = ['toxic','severe_toxic','obscene',
          'threat','insult','identity_hate']
# One-hot the single highest-probability label per comment
# (note: this forces exactly one positive prediction per row).
ypreds = (yprobs==yprobs.max(axis=1,keepdims=True)).astype(np.int8)
ypreds[:5]
array([[0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0]], dtype=int8)
ytest.shape, yprobs.shape, ypreds.shape
((31915, 6), (31915, 6), (31915, 6))
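The one-hot step above keeps only the single highest-probability label per comment, which effectively turns the multi-label problem into a single-label one and likely contributes to the low precision reported below. A hedged alternative sketch (not what this notebook uses): threshold each probability independently (0.5 is an arbitrary, untuned cut-off) and report per-label ROC AUC from the raw probabilities, which is the competition's metric.
# Alternative evaluation sketch: independent 0.5 threshold per label, plus
# per-label ROC AUC computed from the raw probabilities.
ypreds_thresh = (yprobs >= 0.5).astype(np.int8)

for j, name in enumerate(labels):
    auc = roc_auc_score(ytest[:, j], yprobs[:, j])
    print(f"{name:15s} AUC = {auc:.4f}   predicted positives = {ypreds_thresh[:, j].sum()}")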
from sklearn.metrics import multilabel_confusion_matrix
mcm = multilabel_confusion_matrix(ytest, ypreds)
mcm
array([[[ 6919, 21904],
        [ 1211,  1881]],
       [[31602,     0],
        [  313,     0]],
       [[23886,  6362],
        [  707,   960]],
       [[31804,    12],
        [   99,     0]],
       [[30234,    96],
        [ 1585,     0]],
       [[30950,   696],
        [  264,     5]]])
ytest.shape[0], mcm.sum(axis=1).sum(axis=1)
(31915, array([31915, 31915, 31915, 31915, 31915, 31915]))
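Each 2x2 block of mcm is the binary confusion matrix for one label, in sklearn's [[TN, FP], [FN, TP]] layout. A small sketch to attach the label names:
# Unpack each per-label 2x2 block into a row of counts.
df_mcm = pd.DataFrame([[tn, fp, fn, tp] for (tn, fp), (fn, tp) in mcm],
                      columns=['TN', 'FP', 'FN', 'TP'], index=labels)
df_mcm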
for i in range(6):
    skplt.metrics.plot_confusion_matrix(ytest[:,i],ypreds[:,i],title=labels[i])
r = sklearn.metrics.classification_report(ytest,ypreds)
print(r)
              precision    recall  f1-score   support
           0       0.08      0.61      0.14      3092
           1       0.00      0.00      0.00       313
           2       0.13      0.58      0.21      1667
           3       0.00      0.00      0.00        99
           4       0.00      0.00      0.00      1585
           5       0.01      0.02      0.01       269
   micro avg       0.09      0.41      0.15      7025
   macro avg       0.04      0.20      0.06      7025
weighted avg       0.07      0.41      0.11      7025
 samples avg       0.09      0.05      0.06      7025
/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
# check the evaluation for toxic (0th label)
df_toxic = pd.DataFrame({'ytest': ytest[:,0], 'ypred': ypreds[:,0]})
df_toxic.query('ytest == 1').head(4)
| | ytest | ypred |
|---|---|---|
| 13 | 1 | 0 | 
| 22 | 1 | 1 | 
| 31 | 1 | 1 | 
| 32 | 1 | 0 | 
pd.crosstab(df_toxic['ytest'],df_toxic['ypred'])
| ytest \ ypred | 0 | 1 |
|---|---|---|
| 0 | 6919 | 21904 | 
| 1 | 1211 | 1881 | 
pre0 = sklearn.metrics.precision_score(ytest[:,0],ypreds[:,0])
pre1 = sklearn.metrics.precision_score(ytest[:,1],ypreds[:,1])
pre2 = sklearn.metrics.precision_score(ytest[:,2],ypreds[:,2])
pre0, pre1, pre2
/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
(0.07908345595963842, 0.0, 0.13111171810980607)
coo = ytest.T.dot(ypreds)
coo
array([[1881,    0, 1193,    2,    2,   14],
       [ 138,    0,  174,    0,    0,    1],
       [ 703,    0,  960,    0,    0,    4],
       [  76,    0,   21,    0,    0,    2],
       [ 894,    0,  688,    0,    0,    3],
       [ 208,    0,   56,    0,    0,    5]])
df_coo = pd.DataFrame(coo, columns=labels,index=labels)
df_coo.to_csv('../outputs/keras_confusion_matrix.csv',index=False)
df_coo.style.background_gradient()
| | toxic | severe_toxic | obscene | threat | insult | identity_hate |
|---|---|---|---|---|---|---|
| toxic | 1881 | 0 | 1193 | 2 | 2 | 14 | 
| severe_toxic | 138 | 0 | 174 | 0 | 0 | 1 | 
| obscene | 703 | 0 | 960 | 0 | 0 | 4 | 
| threat | 76 | 0 | 21 | 0 | 0 | 2 | 
| insult | 894 | 0 | 688 | 0 | 0 | 3 | 
| identity_hate | 208 | 0 | 56 | 0 | 0 | 5 | 
df_coo2 = df_coo.copy()
df_coo2['Total'] = df_coo2.sum(axis=1)
df_coo2.loc[len(df_coo2),:] = df_coo2.sum(axis=0)
df_coo2.index = df_coo.index.tolist() + ['Total']
df_coo2 = df_coo2.astype(int)
# rows (index) are true labels, columns are predicted labels
df_coo2
| | toxic | severe_toxic | obscene | threat | insult | identity_hate | Total |
|---|---|---|---|---|---|---|---|
| toxic | 1881 | 0 | 1193 | 2 | 2 | 14 | 3092 | 
| severe_toxic | 138 | 0 | 174 | 0 | 0 | 1 | 313 | 
| obscene | 703 | 0 | 960 | 0 | 0 | 4 | 1667 | 
| threat | 76 | 0 | 21 | 0 | 0 | 2 | 99 | 
| insult | 894 | 0 | 688 | 0 | 0 | 3 | 1585 | 
| identity_hate | 208 | 0 | 56 | 0 | 0 | 5 | 269 | 
| Total | 3900 | 0 | 3092 | 2 | 2 | 29 | 7025 | 
from util_ds import highlight_rcd
highlight_rcd(df_coo2)
| | toxic | severe_toxic | obscene | threat | insult | identity_hate | Total |
|---|---|---|---|---|---|---|---|
| toxic | 1881 | 0 | 1193 | 2 | 2 | 14 | 3092 | 
| severe_toxic | 138 | 0 | 174 | 0 | 0 | 1 | 313 | 
| obscene | 703 | 0 | 960 | 0 | 0 | 4 | 1667 | 
| threat | 76 | 0 | 21 | 0 | 0 | 2 | 99 | 
| insult | 894 | 0 | 688 | 0 | 0 | 3 | 1585 | 
| identity_hate | 208 | 0 | 56 | 0 | 0 | 5 | 269 | 
| Total | 3900 | 0 | 3092 | 2 | 2 | 29 | 7025 | 
from util_multilabel import plot_confusion_matrix
plot_confusion_matrix(coo, target_names=labels,
                      cmap='Reds',normalize=False)
from util_plotly import plot_coo_matrix
plot_coo_matrix(labels,coo)
/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/plotly/graph_objs/_deprecations.py:410: DeprecationWarning: plotly.graph_objs.Margin is deprecated. Please replace it with one of the following more specific types - plotly.graph_objs.layout.Margin
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 2 min 41 secs