import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (12,8)})
plt.style.use('ggplot') # better than sns styles.
matplotlib.rcParams['figure.figsize'] = 12,8

import os
import time

# random state
SEED=100
np.random.seed(SEED)

# Jupyter notebook settings for pandas
#pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
from pandas.api.types import CategoricalDtype
np.set_printoptions(precision=3)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 500)

import IPython
from IPython.display import display, HTML, Image, Markdown

print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])

[('numpy', '1.17.4'), ('pandas', '0.25.3'), ('seaborn', '0.9.0'), ('matplotlib', '3.1.2')]


import re
import string # string.punctuation
import string
from string import digits


import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')


from nltk.corpus import stopwords
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Environment flag: stays None unless the `bhishan` module can be imported
# (presumably a helper package that only exists on the author's laptop).
ENV_BHISHAN = None

try:
    import bhishan
except ImportError:
    # Module not available: leave the flag unset. Only ImportError is
    # silenced so that unrelated failures are not hidden.
    pass
else:
    ENV_BHISHAN = True
    print("Environment: Bhishan's Laptop")


import sys
# True when the `google.colab` module has already been loaded into this
# interpreter, which is used here as the marker for running on Google Colab.
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    # Optional: mount Google Drive and add the data directory to sys.path.
    # from google.colab import drive
    # drive.mount('/content/drive')
    # dat_dir = 'drive/My Drive/Colab Notebooks/data/' 
    # sys.path.append(dat_dir)
    
    # Extra packages to install in the Colab runtime (only lime is enabled).
    #!pip install pyldavis
    #!pip install hyperopt
    #!pip install catboost
    !pip install lime

    # Upgrade tqdm and restart the runtime to use df.progress_apply()
    # !pip install -U tqdm
    
    # Announce the detected environment.
    print('Environment: Google Colaboratory.')

Collecting lime
  Downloading https://files.pythonhosted.org/packages/e5/72/4be533df5151fcb48942515e95e88281ec439396c48d67d3ae41f27586f0/lime-0.1.1.36.tar.gz (275kB)
     |████████████████████████████████| 276kB 8.8MB/s 
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from lime) (1.17.4)
Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from lime) (1.3.3)
Requirement already satisfied: scikit-learn>=0.18 in /usr/local/lib/python3.6/dist-packages (from lime) (0.21.3)
Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from lime) (3.1.2)
Requirement already satisfied: scikit-image>=0.12 in /usr/local/lib/python3.6/dist-packages (from lime) (0.15.0)
Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn>=0.18->lime) (0.14.0)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->lime) (2.4.5)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->lime) (0.10.0)
Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->lime) (2.6.1)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->lime) (1.1.0)
Requirement already satisfied: imageio>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from scikit-image>=0.12->lime) (2.4.1)
Requirement already satisfied: PyWavelets>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image>=0.12->lime) (1.1.1)
Requirement already satisfied: networkx>=2.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image>=0.12->lime) (2.4)
Requirement already satisfied: pillow>=4.3.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image>=0.12->lime) (4.3.0)
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from cycler>=0.10->matplotlib->lime) (1.12.0)
Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from kiwisolver>=1.0.1->matplotlib->lime) (42.0.1)
Requirement already satisfied: decorator>=4.3.0 in /usr/local/lib/python3.6/dist-packages (from networkx>=2.0->scikit-image>=0.12->lime) (4.4.1)
Requirement already satisfied: olefile in /usr/local/lib/python3.6/dist-packages (from pillow>=4.3.0->scikit-image>=0.12->lime) (0.46)
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... done
  Created wheel for lime: filename=lime-0.1.1.36-cp36-none-any.whl size=284191 sha256=4c30f9573311d20acf614a69ea6ba74996e1c14d200ff0e4baf405311f8f586b
  Stored in directory: /root/.cache/pip/wheels/a9/2f/25/4b2127822af5761dab9a27be52e175105772aebbcbc484fb95
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.1.1.36
Environment: Google Colaboratory.


import scipy


# pipeline
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, ClassifierMixin


# text features
from sklearn.feature_extraction.text import TfidfVectorizer


# classifiers
from sklearn.linear_model import LogisticRegression


# model selection
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score


def show_method_attributes(obj, ncols=7,start=None, inside=None):
    """Tabulate the public attribute names of an object.

    Prints the type of ``obj`` and returns its attribute names (those not
    starting with an underscore) laid out in a DataFrame of ``ncols`` columns.

    Parameters
    ----------
    obj : any object
        Object whose ``dir()`` listing is shown.
    ncols : int
        Number of columns used to lay out the names.
    start : str, list or tuple, optional
        Keep only names starting with this prefix (or with any of the
        prefixes when a list/tuple is given).
    inside : str, list or tuple, optional
        Keep only names containing this substring (or any of the substrings
        when a list/tuple is given).

    Returns
    -------
    pandas.DataFrame
        Names split into ``ncols`` columns; unused cells are filled with ''.

    Example:
    ========
    show_method_attributes(list)
    """

    print(f'Object Type: {type(obj)}\n')

    # Commonly imported module aliases that should not be listed.
    excluded = {'os', 'np', 'pd', 'sys', 'time', 'psycopg2'}
    lst = [elem for elem in dir(obj)
           if not elem.startswith('_') and elem not in excluded]

    # Normalise a single string to a one-element tuple so both forms share
    # the same filter below.
    if isinstance(start, str):
        start = (start,)
    if isinstance(start, (tuple, list)):
        # str.startswith accepts a tuple, so each name is kept at most once
        # even when several prefixes match it.
        prefixes = tuple(start)
        lst = [elem for elem in lst if elem.startswith(prefixes)]

    if isinstance(inside, str):
        inside = (inside,)
    if isinstance(inside, (tuple, list)):
        # any() keeps a name once no matter how many substrings it contains.
        lst = [elem for elem in lst
               if any(part in elem for part in inside)]

    return pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')


# Candidate locations of the train/test CSV files: a local relative path and
# the raw file served from the project's GitHub repository.
ifile1_train = '../data/raw/train.csv'
ifile2_train = 'https://github.com/bhishanpdl/Project_Toxic_Comments/blob/master/data/raw/train.csv?raw=true'

ifile1_test = '../data/raw/test.csv'
ifile2_test = 'https://github.com/bhishanpdl/Project_Toxic_Comments/blob/master/data/raw/test.csv?raw=true'

# Pick the data source once so that df_raw_train / df_raw_test are always
# defined, even when neither environment flag is set (previously that case
# ended in a NameError on the .copy() lines below).
#   * Colab             -> remote files (takes precedence, as before)
#   * laptop / local    -> local files when flagged or when both files exist
#   * anything else     -> remote files
# `os` is imported at the top of the notebook.
if ENV_COLAB:
    train_path, test_path = ifile2_train, ifile2_test
elif ENV_BHISHAN or (os.path.exists(ifile1_train)
                     and os.path.exists(ifile1_test)):
    train_path, test_path = ifile1_train, ifile1_test
else:
    train_path, test_path = ifile2_train, ifile2_test

df_raw_train = pd.read_csv(train_path)
df_raw_test = pd.read_csv(test_path)


# Work on copies so the raw frames stay untouched.
df_train = df_raw_train.copy()
df_test = df_raw_test.copy()

# Print (rows, columns) of the training frame, then display its first rows.
print(df_train.shape)
df_train.head()

(159571, 8)


# Print (rows, columns) of the test frame, then display its first rows.
print(df_test.shape)
df_test.head()

(153164, 2)


# A comment is flagged "clean" when the label columns from 'toxic' through
# 'identity_hate' sum to zero for that row.
label_totals = df_train.loc[:, 'toxic':'identity_hate'].sum(axis=1)
df_train['clean'] = label_totals.eq(0)

# Number of clean comments.
print(df_train['clean'].sum())

df_train.head(2)

143346


# Column-wise totals for every column from position 2 onward, i.e. the label
# columns plus the derived 'clean' flag (positions 0 and 1 are id and text).
ser_sum = df_train.iloc[:,2:].sum(axis=0)
ser_sum

toxic             15294.0
severe_toxic       1595.0
obscene            8449.0
threat              478.0
insult             7877.0
identity_hate      1405.0
clean            143346.0
dtype: float64


# Bar chart of the per-class totals, sorted ascending, one colour per bar.
bar_colors = sns.color_palette('husl', len(ser_sum))
ax = ser_sum.sort_values().plot.bar(color=bar_colors)

# Write each bar's value at its top-left corner.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.text(bar.get_x(), bar_height, f'{bar_height:,.0f}',
            fontsize=14, color='blue')


# Training subset: comment text as features, 'toxic' flag as target.
# NOTE: .loc slices by label and includes the end label, so with the default
# integer index this selects rows 0..5000 (5001 rows), not 5000.
Xtrain = df_train.loc[:5000, 'comment_text'].to_numpy()
ytrain = df_train.loc[:5000, 'toxic'].to_numpy()

# Peek at the first few targets.
ytrain[:5]

array([0, 0, 0, 0, 0])


# Default parameters shared by the cross-validation and search cells below.
scoring='roc_auc'    # metric name passed as `scoring`
cv=3                 # value passed as `cv` (number of folds)
n_jobs=-1            # value passed as `n_jobs`; -1 requests all available workers
max_features = 2500  # vocabulary cap passed to TfidfVectorizer


# Baseline model: TF-IDF features fed straight into logistic regression.
tfidf = TfidfVectorizer(max_features=max_features)
lr = LogisticRegression(solver='lbfgs')
p = Pipeline(steps=[('tfidf', tfidf), ('lr', lr)])

# Cross-validated score of the whole pipeline on the training subset.
cvs = cross_val_score(p, Xtrain, ytrain,
                      scoring=scoring, cv=cv, n_jobs=n_jobs)

cvs.mean().round(4)

0.9227


import scipy

class NBFeaturer(BaseEstimator):
    """Rescale features by a smoothed log-ratio learned from binary labels.

    ``fit`` computes, for every feature column, ``log(pr(1) / pr(0))`` where
    ``pr(c)`` is the smoothed per-column sum over the rows labelled ``c``;
    ``transform`` multiplies the input element-wise by that ratio.

    Assumes ``x`` is a scipy sparse matrix (it must provide ``.multiply`` and
    boolean-mask row indexing) and ``y`` holds the labels 0 and 1.
    """

    def __init__(self, alpha):
        # Additive smoothing term used in both numerator and denominator.
        self.alpha = alpha

    def preprocess_x(self, x, r):
        """Return ``x`` multiplied element-wise by the ratio ``r``."""
        return x.multiply(r)

    def pr(self, x, y_i, y):
        """Smoothed column sums of ``x`` over the rows where ``y == y_i``."""
        class_mask = (y == y_i)
        column_sums = x[class_mask].sum(0)
        return (column_sums + self.alpha) / (class_mask.sum() + self.alpha)

    def fit(self, x, y=None):
        """Learn the log-ratio between class 1 and class 0 and store it."""
        log_ratio = np.log(self.pr(x, 1, y) / self.pr(x, 0, y))
        self._r = scipy.sparse.csr_matrix(log_ratio)
        return self

    def transform(self, x):
        """Scale ``x`` by the ratio learned in ``fit``."""
        return self.preprocess_x(x, self._r)


# Model 2: TF-IDF -> NBFeaturer scaling -> logistic regression.
tfidf = TfidfVectorizer(max_features=max_features)
nb = NBFeaturer(alpha=1)
lr = LogisticRegression()
p = Pipeline(steps=[('tfidf', tfidf), ('nb', nb), ('lr', lr)])

cvs = cross_val_score(p, Xtrain, ytrain,
                      scoring=scoring, cv=cv, n_jobs=n_jobs)
cvs.mean().round(4)

0.9246


class Lemmatizer(BaseEstimator):
    """Pipeline step that lower-cases and lemmatizes whitespace-split tokens."""

    def __init__(self):
        self.l = WordNetLemmatizer()

    def fit(self, x, y=None):
        """Nothing to learn; returns ``self`` so the step can be chained."""
        return self

    def transform(self, x):
        """Return an array with one lemmatized, space-joined string per document."""
        lemmatize = self.l.lemmatize
        return np.array([
            ' '.join(lemmatize(token.lower()) for token in doc.split())
            for doc in x
        ])


# Model 3: lemmatize -> TF-IDF -> NBFeaturer scaling -> logistic regression.
lm = Lemmatizer()
tfidf = TfidfVectorizer(max_features=max_features)
nb = NBFeaturer(alpha=1)
lr = LogisticRegression()

p = Pipeline(steps=[('lm', lm), ('tfidf', tfidf), ('nb', nb), ('lr', lr)])

cvs = cross_val_score(p, Xtrain, ytrain,
                      scoring=scoring, cv=cv, n_jobs=n_jobs)
cvs.mean().round(4)

0.9258


# Model 4: lemmatize -> (word TF-IDF + char TF-IDF) -> NBFeaturer -> logistic regression.
max_features = 2500
lm = Lemmatizer()
tfidf_w = TfidfVectorizer(analyzer='word', max_features=max_features)
tfidf_c = TfidfVectorizer(analyzer='char', max_features=max_features)
nb = NBFeaturer(alpha=1)
lr = LogisticRegression()

# Word-level and character-level features are concatenated side by side.
wc_tfidfs = FeatureUnion(transformer_list=[
    ('tfidf_w', tfidf_w),
    ('tfidf_c', tfidf_c),
])

p = Pipeline(steps=[
    ('lm', lm),
    ('wc_tfidfs', wc_tfidfs),
    ('nb', nb),
    ('lr', lr),
])

cvs = cross_val_score(p, Xtrain, ytrain,
                      scoring=scoring, cv=cv, n_jobs=n_jobs)
cvs.mean().round(4)

0.9369


# Exhaustive search over vocabulary sizes and regularisation strength.
# The char-level entry tunes `max_features`: the earlier `stop_words` key was
# given integers, which is not a valid stop-word setting and is ignored by a
# char analyzer, so that axis of the search had no effect.
param_grid = [{
    'wc_tfidfs__tfidf_w__max_features': [2500], 
    'wc_tfidfs__tfidf_c__max_features': [2500, 5000],
    'lr__C': [0.01, 0.1, 1.0, 2.0, 3.],
}]

grid = GridSearchCV(p, cv=cv, n_jobs=n_jobs, param_grid=param_grid, scoring=scoring, 
                            return_train_score=False, verbose=0)
grid.fit(Xtrain, ytrain)
grid.best_params_

/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)

{'lr__C': 0.1,
 'wc_tfidfs__tfidf_c__stop_words': 2500,
 'wc_tfidfs__tfidf_w__max_features': 2500}


# Randomised search (5 sampled candidates) over a wider grid.
# The char-level entry tunes `max_features`; integers passed as `stop_words`
# are invalid and have no effect on a char analyzer.
param_grid = [{
    'wc_tfidfs__tfidf_w__max_features': [2500, 5000, 10000], 
    'wc_tfidfs__tfidf_c__max_features': [2500, 5000, 10000],
    'lr__C': [1., 3., 4.],
}]

grid = RandomizedSearchCV(p, cv=cv, n_jobs=n_jobs, param_distributions=param_grid[0], n_iter=5, 
                          scoring=scoring, return_train_score=False, verbose=1)
grid.fit(Xtrain, ytrain)
grid.best_params_

Fitting 3 folds for each of 5 candidates, totalling 15 fits

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   32.8s finished
/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)

{'lr__C': 1.0,
 'wc_tfidfs__tfidf_c__stop_words': 5000,
 'wc_tfidfs__tfidf_w__max_features': 10000}


# Re-score the word+char pipeline with an explicit C for logistic regression.
max_features = 2500
lm = Lemmatizer()
tfidf_w = TfidfVectorizer(max_features=max_features, analyzer='word')
# `stop_words=2500` was dropped: an integer is not a valid stop-word setting,
# the char analyzer ignores stop words anyway, and newer scikit-learn
# releases are expected to reject the value.
tfidf_c = TfidfVectorizer(max_features=max_features, analyzer='char')
lr = LogisticRegression(C=1.0)
nb = NBFeaturer(1)

p = Pipeline([
    ('lm', lm),
    ('wc_tfidfs', 
         FeatureUnion([
            ('tfidf_w', tfidf_w), 
            ('tfidf_c', tfidf_c), 
         ])
    ),
    ('nb', nb),
    ('lr', lr)
])

cvs = cross_val_score(estimator=p, X=Xtrain, y=ytrain, scoring=scoring, cv=cv, n_jobs=n_jobs)
cvs.mean().round(4)

0.9369


# Display the shapes of the train and test frames as a tuple.
df_train.shape, df_test.shape

((159571, 9), (153164, 2))


# Held-out evaluation subset taken from the labelled training frame.
# .loc slices are inclusive on both ends and the training subset already uses
# labels up to and including 5000, so start at 5001 to keep the two subsets
# disjoint (starting at 5000 leaked one training row into the evaluation).
Xtest = df_train.loc[5001:10_000, 'comment_text'].to_numpy()
ytest = df_train.loc[5001:10_000, 'toxic'].to_numpy()


# Final model: word+char TF-IDF pipeline fitted on the training subset.
max_features = 2500
lm = Lemmatizer()
tfidf_w = TfidfVectorizer(max_features=max_features, analyzer='word')
# `stop_words=2500` was dropped: an integer is not a valid stop-word setting,
# the char analyzer ignores stop words anyway, and newer scikit-learn
# releases are expected to reject the value.
tfidf_c = TfidfVectorizer(max_features=max_features, analyzer='char')
lr = LogisticRegression(C=3.0,solver='lbfgs')
nb = NBFeaturer(1)

p = Pipeline([
    ('lm', lm),
    ('wc_tfidfs', 
         FeatureUnion([
            ('tfidf_w', tfidf_w), 
            ('tfidf_c', tfidf_c), 
         ])
    ),
    ('nb', nb),
    ('lr', lr)
])

# Trailing semicolon suppresses the notebook's repr of the fitted pipeline.
p.fit(Xtrain,ytrain);


# Uncomment to list the public attributes of the fitted pipeline:
# show_method_attributes(p)


# Hard class predictions of the fitted pipeline on the held-out subset.
ypreds = p.predict(Xtest)


# Compare the first true label with the first prediction.
ytest[0], ypreds[0]

(1, 1)


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Label/scorer pairs, printed in this order for the held-out predictions.
metric_table = (
    ('Accuracy: ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F1-score: ', f1_score),
)
for metric_label, metric_fn in metric_table:
    print(metric_label, metric_fn(ytest, ypreds))

Accuracy:  0.9516096780643871
Precision:  0.9154411764705882
Recall:  0.532051282051282
F1-score:  0.672972972972973


from sklearn.metrics import classification_report

# Per-class precision / recall / F1 table for the held-out predictions.
print(classification_report(ytest, ypreds))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      4533
           1       0.92      0.53      0.67       468

    accuracy                           0.95      5001
   macro avg       0.93      0.76      0.82      5001
weighted avg       0.95      0.95      0.95      5001


from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels against predictions for the held-out subset.
confusion_matrix(ytest, ypreds)

array([[4510,   23],
       [ 219,  249]])


from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

def plot_roc_curve(clf_names, all_roc_auc_scores, y,ofile=None,
                   all_cross_val_preds=None):
    """Plot Receiver Operating Characteristic (ROC) curves.

    One curve is drawn per entry of ``all_cross_val_preds``; the legend label
    of each curve combines the classifier name with its ROC AUC score.

    Parameters
    ----------
    clf_names : sequence of str
        Display name of each classifier.
    all_roc_auc_scores : sequence of float
        ROC AUC score of each classifier, in the same order as ``clf_names``.
    y : array-like
        True binary labels the predictions are compared against.
    ofile : str, optional
        When given, the figure is also saved to this path (dpi=300).
    all_cross_val_preds : sequence of array-like, optional
        Scores/predictions of each classifier, in the same order as
        ``clf_names``. When omitted, the module-level variable named
        ``all_cross_val_preds`` is used, which is what earlier versions of
        this function relied on implicitly.

    Raises
    ------
    NameError
        If ``all_cross_val_preds`` is neither passed nor defined at module
        level (same exception type as before, with a clearer message).
    IndexError
        If there are more prediction arrays than names/scores.

    NOTE:
    In cross-validation we do not need Xtrain and Xtest,
    one split of X is taken as Xtest and remaining splits are taken as Xtrain.

    Example:
    ---------
    clf_names = ["Logisitic Regression","Support Vector Classifier",
                 "KNN", "Decision Tree Classifier","Random Forest Classifier"]

    cvpred_lr = cross_val_predict(clf_best_lr, X, y, cv=5,
                             method="decision_function")
    cvpred_svc = cross_val_predict(clf_best_svc, X, y, cv=5,
                                 method="decision_function")

    cvpred_knn = cross_val_predict(clf_best_knn,X,y,cv=5)
    cvpred_dtc = cross_val_predict(clf_best_dtc,X,y,cv=5)
    cvpred_rfc = cross_val_predict(clf_best_rfc, X, y, cv=5)

    all_cross_val_preds = [cvpred_lr, cvpred_svc,
                             cvpred_knn, cvpred_dtc, cvpred_rfc]

    all_roc_auc_scores = [roc_auc_score(y, cvpred)
                        for cvpred in all_cross_val_preds ]

    plot_roc_curve(clf_names, all_roc_auc_scores, y,
                   all_cross_val_preds=all_cross_val_preds)

    """
    from sklearn.metrics import roc_curve

    if all_cross_val_preds is None:
        # Backward compatibility: older call sites define the predictions as
        # a module-level variable instead of passing them in.
        try:
            all_cross_val_preds = globals()['all_cross_val_preds']
        except KeyError:
            raise NameError(
                "all_cross_val_preds was not passed to plot_roc_curve and "
                "is not defined at module level"
            ) from None

    # Materialise once so the length check and the loop see the same items.
    all_cross_val_preds = list(all_cross_val_preds)

    labels = ['{}: {:.4f}'.format(clf,pred)
              for clf,pred in zip(clf_names,all_roc_auc_scores)]

    if len(labels) < len(all_cross_val_preds):
        raise IndexError(
            'more prediction arrays than classifier names/scores')

    plt.figure(figsize=(12,8))
    for label, pred_x in zip(labels, all_cross_val_preds):
        # roc_curve returns (fpr, tpr, thresholds); thresholds are not plotted.
        fpr, tpr, _ = roc_curve(y, pred_x)
        plt.plot(fpr, tpr, label=label)

    # Diagonal reference line with an annotation pointing at it.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([-0.01, 1, 0, 1])
    plt.xlabel('False Positive Rate (FPR)', fontsize=16)
    plt.ylabel('True Positive Rate (TPR)', fontsize=16)
    plt.annotate('Minimum ROC Score of 50%',
                 xy=(0.5, 0.5), xytext=(0.6, 0.3),
                arrowprops=dict(facecolor='#6E726D', shrink=0.05),
                )
    plt.title('ROC Curves',fontsize=20)
    plt.legend()
    plt.tight_layout()
    
    # save the figure
    if ofile:
        plt.savefig(ofile,dpi=300)
        
    # show the figure
    plt.show()
    plt.close()


# Legend name for the single model being plotted.
clf_names = ["Logisitic Regression"]

# Out-of-fold decision scores for the training subset (5-fold CV).
cvpred = cross_val_predict(p, Xtrain, ytrain, cv=5,
                         method="decision_function")


# plot_roc_curve reads the module-level name `all_cross_val_preds`, so this
# variable must exist under exactly this name before the call below.
all_cross_val_preds = [cvpred]

# One ROC AUC score per prediction array.
all_roc_auc_scores = [roc_auc_score(ytrain, cvpred)
                    for cvpred in all_cross_val_preds ]

plot_roc_curve(clf_names, all_roc_auc_scores,ytrain)


from sklearn.pipeline import make_pipeline

from lime import lime_text
from lime.lime_text import LimeTextExplainer


# Fit the pipeline on the training subset (semicolon hides the repr output).
p.fit(Xtrain,ytrain);

# Display names for labels 0 and 1.
class_names = ['Non-Toxic', 'Toxic']

explainer = LimeTextExplainer(class_names=class_names)

# Explain the pipeline's predicted probabilities for one training comment.
idx = 0
exp = explainer.explain_instance(Xtrain[idx], p.predict_proba, top_labels=2)

# Render the explanation for both labels inline in the notebook.
exp.show_in_notebook(text=Xtrain[idx], labels=(0,1))

/usr/local/lib/python3.6/dist-packages/lime/lime_text.py:116: FutureWarning: split() requires a non-empty pattern match.
  self.as_list = [s for s in splitter.split(self.raw) if s]


# Display the first row of the training frame.
df_train.head(1)

	id	comment_text
0	0000997932d777bf	Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27
1	000103f0d9cfb60f	D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)
2	000113f07ec002fd	Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.
3	0001b41b1c6bb37e	"\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of ""types of accidents"" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess t...
4	0001d958c54c6e35	You, sir, are my hero. Any chance you remember what page that's on?

	id	comment_text
0	00001cee341fdb12	Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,
1	0000247867823ef7	== From RfC == \n\n The title is fine as it is, IMO.
2	00013b17ad220c46	" \n\n == Sources == \n\n * Zawe Ashton on Lapland — / "
3	00017563c3f7919a	:If you have a look back at the source, the information I updated was the correct form. I can only guess the source hadn't updated. I shall update the information once again but thank you for your message.
4	00017695ad8997eb	I don't anonymously edit articles at all.

Table of Contents

Introduction

Imports

Useful Scripts

Load the Data

Class distribution

Train Target Split

Text Modelling: Logistic Regression

Baseline Logistic Regression

Model 2: NBFeaturer

Model 3: NBFeaturer + Lemmatizer

Model 4: NBFeaturer + Lemmatizer + two tfidfs

Grid Search

Using Best Parameters

Testing the Model

Model Evaluation for Classification

Model Explanation

	id	comment_text	toxic	severe_toxic	obscene	threat	insult	identity_hate	clean
0	0000997932d777bf	Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27	0	0	0	0	0	0	True
1	000103f0d9cfb60f	D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)	0	0	0	0	0	0	True