You are provided with a large number of Wikipedia comments which have been labeled by human raters for toxic behavior. The types of toxicity are:

- toxic
- severe_toxic
- obscene
- threat
- insult
- identity_hate

You must create a model which predicts a probability of each type of toxicity for each comment.
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (12,8)})
plt.style.use('ggplot') # better than sns styles.
matplotlib.rcParams['figure.figsize'] = 12,8
import os
import time
# random state
SEED=100
np.random.seed(SEED)
# Jupyter notebook settings for pandas
#pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
from pandas.api.types import CategoricalDtype
np.set_printoptions(precision=3)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 500)
import IPython
from IPython.display import display, HTML, Image, Markdown
print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
[('numpy', '1.17.4'), ('pandas', '0.25.3'), ('seaborn', '0.9.0'), ('matplotlib', '3.1.2')]
import re
import string # string.punctuation
from string import digits
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
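The NLTK pieces imported above (word_tokenize, WordNetLemmatizer, stop_words, string.punctuation, digits) are not applied anywhere in this section. A minimal cleaning helper that ties them together might look like the sketch below; the function name clean_comment is illustrative and not part of the original notebook.

# Illustrative sketch (not in the original notebook): lowercase, strip punctuation and digits,
# tokenize, drop English stop words, and lemmatize a raw comment.
lemmatizer = WordNetLemmatizer()

def clean_comment(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation + digits))
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words]
    return ' '.join(tokens)

# clean_comment("He matches this background colour I'm seemingly stuck with. Thanks.")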
ENV_BHISHAN = None
try:
    import bhishan
    ENV_BHISHAN = True
    print("Environment: Bhishan's Laptop")
except ImportError:
    pass
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    # load google drive
    # from google.colab import drive
    # drive.mount('/content/drive')
    # dat_dir = 'drive/My Drive/Colab Notebooks/data/'
    # sys.path.append(dat_dir)

    # pip install
    # !pip install pyldavis
    # !pip install hyperopt
    # !pip install catboost
    # !pip install shap
    # !pip install eli5
    # !pip install lime
    # !pip install category_encoders  # TargetEncoder
    # !pip install loguru

    # nlp
    !pip install textacy
    !python -m spacy download en_core_web_sm  # nlp = spacy.load("en_core_web_sm")

    # update modules
    # !pip install -U pandas
    # !pip install -U scikit-learn
    # !pip install -U tqdm  # tqdm needs a restart of the runtime.

    # print
    print('Environment: Google Colaboratory.')
Requirement already satisfied: textacy in /usr/local/lib/python3.6/dist-packages (0.9.1)
(further "Requirement already satisfied" lines for textacy's dependencies omitted)
Requirement already satisfied: en_core_web_sm==2.1.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#egg=en_core_web_sm==2.1.0 in /usr/local/lib/python3.6/dist-packages (2.1.0)
✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
Environment: Google Colaboratory.
import scipy
# pipeline
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, ClassifierMixin
# text features
from sklearn.feature_extraction.text import TfidfVectorizer
# classifiers
from sklearn.linear_model import LogisticRegression
# model selection
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
def show_method_attributes(obj, ncols=7, start=None, inside=None):
    """Show the public attributes and methods of a given object.

    Example:
    ========
    show_method_attributes(list)
    """
    print(f'Object Type: {type(obj)}\n')
    lst = [elem for elem in dir(obj) if elem[0] != '_']
    lst = [elem for elem in lst
           if elem not in 'os np pd sys time psycopg2'.split()]

    if isinstance(start, str):
        lst = [elem for elem in lst if elem.startswith(start)]

    if isinstance(start, (tuple, list)):
        lst = [elem for elem in lst for start_elem in start
               if elem.startswith(start_elem)]

    if isinstance(inside, str):
        lst = [elem for elem in lst if inside in elem]

    if isinstance(inside, (tuple, list)):
        lst = [elem for elem in lst for inside_elem in inside
               if inside_elem in elem]

    return pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
ifile1_train = '../data/raw/train.csv'
ifile2_train = 'https://github.com/bhishanpdl/Project_Toxic_Comments/blob/master/data/raw/train.csv?raw=true'
ifile1_test = '../data/raw/test.csv'
ifile2_test = 'https://github.com/bhishanpdl/Project_Toxic_Comments/blob/master/data/raw/test.csv?raw=true'
if ENV_BHISHAN:
    df_raw_train = pd.read_csv(ifile1_train)
    df_raw_test = pd.read_csv(ifile1_test)

if ENV_COLAB:
    df_raw_train = pd.read_csv(ifile2_train)
    df_raw_test = pd.read_csv(ifile2_test)
df_train = df_raw_train.copy()
df_test = df_raw_test.copy()
print(df_train.shape)
df_train.head()
(159571, 8)
|   | id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate |
|---|---|---|---|---|---|---|---|---|
| 0 | 0000997932d777bf | Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 000103f0d9cfb60f | D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC) | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 000113f07ec002fd | Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info. | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0001b41b1c6bb37e | "\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of ""types of accidents"" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess t... | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0001d958c54c6e35 | You, sir, are my hero. Any chance you remember what page that's on? | 0 | 0 | 0 | 0 | 0 | 0 |
print(df_test.shape)
df_test.head()
(153164, 2)
|   | id | comment_text |
|---|---|---|
| 0 | 00001cee341fdb12 | Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time., |
| 1 | 0000247867823ef7 | == From RfC == \n\n The title is fine as it is, IMO. |
| 2 | 00013b17ad220c46 | " \n\n == Sources == \n\n * Zawe Ashton on Lapland — / " |
| 3 | 00017563c3f7919a | :If you have a look back at the source, the information I updated was the correct form. I can only guess the source hadn't updated. I shall update the information once again but thank you for your message. |
| 4 | 00017695ad8997eb | I don't anonymously edit articles at all. |
df_train['clean'] = df_train.loc[:,'toxic':'identity_hate'].sum(axis=1) == 0
print(df_train['clean'].sum())
df_train.head(2)
143346
|   | id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate | clean |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0000997932d777bf | Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27 | 0 | 0 | 0 | 0 | 0 | 0 | True |
| 1 | 000103f0d9cfb60f | D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC) | 0 | 0 | 0 | 0 | 0 | 0 | True |
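The counts above show a heavily imbalanced dataset: 143,346 of 159,571 comments (roughly 90%) carry none of the six labels. A quick way to see this as a fraction:

# Fraction of comments with no toxicity label at all.
print(df_train['clean'].mean())   # ~0.898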
ser_sum = df_train.iloc[:,2:].sum(axis=0)
ser_sum
toxic             15294.0
severe_toxic       1595.0
obscene            8449.0
threat              478.0
insult             7877.0
identity_hate      1405.0
clean            143346.0
dtype: float64
ax = ser_sum.sort_values().plot.bar(color=sns.color_palette('husl', len(ser_sum)))
for p in ax.patches:
    x, y = p.get_x(), p.get_height()
    ax.text(x, y, f'{y:,.0f}', fontsize=14, color='blue')
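For comparison with the spaCy textcat model built below, the scikit-learn imports loaded earlier (TfidfVectorizer, LogisticRegression, cross_val_score) could be wired into a simple per-label baseline. The following is an illustrative sketch, not the notebook's model; the hyperparameters are arbitrary choices.

# Illustrative baseline: one shared TF-IDF representation, one logistic regression per label,
# each scored with cross-validated ROC AUC.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
tfidf = TfidfVectorizer(max_features=20000, sublinear_tf=True, stop_words='english')
X_tfidf = tfidf.fit_transform(df_train['comment_text'])

for label in label_cols:
    clf = LogisticRegression(C=1.0, solver='liblinear', random_state=SEED)
    auc = cross_val_score(clf, X_tfidf, df_train[label], cv=3, scoring='roc_auc').mean()
    print(f'{label:15s} cv roc_auc = {auc:.3f}')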
df_train['text'] = df_train['comment_text'].apply(lambda x: x.replace('\n', ' '))
cats = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
lst_train_prepared = []
def format_text_spacy(row):
    # build the (text, {'cats': {...}}) tuple that spaCy's textcat expects for training
    return (row.text, {'cats': {cat: row[cat] for cat in cats}})

for i in range(len(df_train)):
    row = df_train.iloc[i]
    lst_train_prepared.append(format_text_spacy(row))
# lst_train_prepared[0:3]
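Each prepared item is a (text, {'cats': {...}}) tuple, the training format spaCy's textcat component consumes. The first item, for example, pairs the "Explanation ..." comment shown above with all six labels set to 0:

# lst_train_prepared[0] has the form:
# ('Explanation Why the edits made under my username Hardcore Metallica Fan were reverted? ...',
#  {'cats': {'toxic': 0, 'severe_toxic': 0, 'obscene': 0, 'threat': 0, 'insult': 0, 'identity_hate': 0}})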
import random
import spacy
import time
from spacy.util import minibatch, compounding
time_start = time.time()
# nlp = spacy.load('en_core_web_lg')
nlp = spacy.load('en')
# create the textcat component and add it to the pipeline
textcat = nlp.create_pipe('textcat')
nlp.add_pipe(textcat, last=True)

# add the six toxicity labels
for cat in cats:
    textcat.add_label(cat)

# disable all other pipes so that only textcat is trained
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for i in range(1):  # single epoch on the first 10,000 comments
        losses = {}
        batches = minibatch(lst_train_prepared[0:10000], size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
Time taken: 1 min 43 secs
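With the textcat trained (only a single pass over the first 10,000 comments here), per-label probabilities for any new comment are exposed on doc.cats, which matches the task statement at the top. A minimal sketch, using the first test comment as input:

# Score an unseen comment; doc.cats maps each of the six labels to a probability.
sample = df_test['comment_text'].iloc[0]
doc = nlp(sample)
print(doc.cats)   # {'toxic': ..., 'severe_toxic': ..., ...} (values depend on the training run)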
from spacy import displacy
mytext = df_train['text'].iloc[0]  # first training comment (matches the noun chunks printed below)
doc = nlp(mytext)
displacy.render(doc, style='ent', jupyter=True)
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.label_, chunk.root.text)
Explanation NP Explanation
the edits NP edits
my username Hardcore Metallica Fan NP Fan
They NP They
vandalisms NP vandalisms
some GAs NP GAs
I NP I
New York Dolls FAC NP FAC
the template NP template
the talk page NP page
I NP I
retired now.89.205.38.27 NP now.89.205.38.27
# for token in doc:
# print("{0}/{1} <--{2}-- {3}/{4}".format(
# token.text, token.tag_, token.dep_, token.head.text, token.head.tag_))
from spacy import displacy
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})
import textacy
pattern = r'(<VERB>?<ADV>*<VERB>+)'
doc = textacy.make_spacy_doc(mytext,lang='en_core_web_sm')
verb_phrases = textacy.extract.pos_regex_matches(doc, pattern)
# print all verb phrases
for chunk in verb_phrases:
    print(chunk.text)

# extract noun phrases to show which nouns are involved
for chunk in doc.noun_chunks:
    print(chunk)
made
were reverted
were
voted
don't remove
'm retired
Explanation
the edits
my username Hardcore Metallica Fan
They
vandalisms
some GAs
I
New York Dolls FAC
the template
the talk page
I
retired now.89.205.38.27
/usr/local/lib/python3.6/dist-packages/textacy/extract.py:327: DeprecationWarning: `pos_regex_matches()` has been deprecated! for similar but more powerful and performant functionality, use `textacy.extract.matches()` instead. action="once",
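The warning notes that pos_regex_matches() is deprecated. An equivalent approach with spaCy's own Matcher (used in the next cell for proper nouns) is sketched below; it mirrors the <VERB>?<ADV>*<VERB>+ pattern, though the greedy quantifier matching may not reproduce the textacy output token-for-token.

# Verb-phrase matching without the deprecated textacy helper:
# optional verb, any number of adverbs, then one or more verbs.
from spacy.matcher import Matcher
vp_matcher = Matcher(nlp.vocab)
vp_pattern = [{'POS': 'VERB', 'OP': '?'}, {'POS': 'ADV', 'OP': '*'}, {'POS': 'VERB', 'OP': '+'}]
vp_matcher.add('VERB_PHRASE', None, vp_pattern)

doc_vp = nlp(mytext)
for match_id, start, end in vp_matcher(doc_vp):
    print(doc_vp[start:end].text)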
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

def extract_full_name(nlp_doc):
    pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]
    matcher.add('FULL_NAME', None, pattern)
    matches = matcher(nlp_doc)
    for match_id, start, end in matches:
        span = nlp_doc[start:end]
        return span.text
extract_full_name(doc)
'Hardcore Metallica'
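extract_full_name returns only the first PROPN-PROPN span it finds. A small variant (illustrative, not in the original notebook) that collects every such pair:

def extract_proper_noun_pairs(nlp_doc):
    # Same PROPN-PROPN pattern, but return every matching span instead of only the first.
    pair_matcher = Matcher(nlp.vocab)
    pair_matcher.add('PROPN_PAIR', None, [{'POS': 'PROPN'}, {'POS': 'PROPN'}])
    return [nlp_doc[start:end].text for _, start, end in pair_matcher(nlp_doc)]

# extract_proper_noun_pairs(doc)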
nlp = spacy.load("en_core_web_sm")
doc1 = nlp("I like cats")
doc2 = nlp("I like dogs")
# Compare 2 documents
doc1.similarity(doc2)
# Compare 2 tokens
doc1[2].similarity(doc2[2])
# Compare tokens and spans
doc1[0].similarity(doc2[1:3])
/usr/lib/python3.6/runpy.py:193: ModelsWarning: [W007] The model you're using has no word vectors loaded, so the result of the Doc.similarity method will be based on the tagger, parser and NER, which may not give useful similarity judgements. This may happen if you're using one of the small models, e.g. `en_core_web_sm`, which don't ship with word vectors and only use context-sensitive tensors. You can always add your own word vectors, or use one of the larger models instead if available. "__main__", mod_spec)
(the same W007 warning is emitted twice more, once for each Token.similarity call)
0.05594692
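The W007 warnings are expected: en_core_web_sm ships without word vectors, so these similarity scores come from context-sensitive tensors rather than true vectors. A hedged sketch of the fix, assuming a vectors-enabled model such as en_core_web_md were downloaded (it is not installed in this notebook, hence commented out):

# !python -m spacy download en_core_web_md   # not run in this notebook
# nlp_md = spacy.load('en_core_web_md')
# nlp_md("I like cats").similarity(nlp_md("I like dogs"))   # similarity backed by real word vectors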
# The token's vector and its L2 norm
doc = nlp("I like cats")
doc[2].vector        # vector as a numpy array
doc[2].vector_norm   # L2 norm of the token's vector
23.784721
nlp.pipe_names   # note: this is the freshly loaded en_core_web_sm, so 'textcat' is not in this pipeline
# ['tagger', 'parser', 'ner']
nlp.pipeline
# [('tagger', <spacy.pipeline.Tagger>),
# ('parser', <spacy.pipeline.DependencyParser>),
# ('ner', <spacy.pipeline.EntityRecognizer>)]
[('tagger', <spacy.pipeline.pipes.Tagger at 0x7fc9e8770e80>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7fc9e88502e8>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7fc9e8850348>)]