You are provided with a large number of Wikipedia comments which have been labeled by human raters for toxic behavior. The types of toxicity are:

- toxic
- severe_toxic
- obscene
- threat
- insult
- identity_hate

You must create a model that predicts the probability of each type of toxicity for each comment.
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# plot settings (one sns.set call is enough; ggplot then overrides the style)
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (12, 8)})
plt.style.use('ggplot') # better than sns styles.
matplotlib.rcParams['figure.figsize'] = 12, 8
import os
import time
# random state
SEED=100
np.random.seed(SEED)
# Jupyter notebook settings for pandas
#pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
from pandas.api.types import CategoricalDtype
np.set_printoptions(precision=3)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 500)
import IPython
from IPython.display import display, HTML, Image, Markdown
print([(x.__name__, x.__version__) for x in [np, pd, sns, matplotlib]])
[('numpy', '1.17.4'), ('pandas', '0.25.3'), ('seaborn', '0.9.0'), ('matplotlib', '3.1.2')]
import re
import string # string.punctuation
from string import digits
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
ENV_BHISHAN = None
try:
    import bhishan
    ENV_BHISHAN = True
    print("Environment: Bhishan's Laptop")
except ImportError:
    pass
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    # load google drive
    # from google.colab import drive
    # drive.mount('/content/drive')
    # dat_dir = 'drive/My Drive/Colab Notebooks/data/'
    # sys.path.append(dat_dir)

    # pip install
    #!pip install pyldavis
    #!pip install hyperopt
    #!pip install catboost

    # upgrade tqdm and restart the runtime to use df.progress_apply()
    !pip install -U tqdm

    print('Environment: Google Colaboratory.')
Collecting tqdm
  Downloading https://files.pythonhosted.org/packages/7f/32/5144caf0478b1f26bd9d97f510a47336cf4ac0f96c6bc3b5af20d4173920/tqdm-4.40.2-py2.py3-none-any.whl (55kB)
Installing collected packages: tqdm
  Found existing installation: tqdm 4.28.1
    Uninstalling tqdm-4.28.1:
      Successfully uninstalled tqdm-4.28.1
Successfully installed tqdm-4.40.2
Environment: Google Colaboratory.
from tqdm.notebook import trange, tqdm
# I need to upgrade tqdm and restart google colab to use df.progress_apply()
def show_method_attributes(obj, ncols=7, start=None, inside=None):
    """Show the public attributes of a given object in a compact table.

    Example:
    ========
    show_method_attributes(list)
    """
    print(f'Object Type: {type(obj)}\n')

    # keep only public attributes, and drop common module aliases
    lst = [elem for elem in dir(obj) if elem[0] != '_']
    lst = [elem for elem in lst
           if elem not in 'os np pd sys time psycopg2'.split()]

    # optional filters: attribute name starts with / contains a substring
    if isinstance(start, str):
        lst = [elem for elem in lst if elem.startswith(start)]
    if isinstance(start, (tuple, list)):
        lst = [elem for elem in lst for start_elem in start
               if elem.startswith(start_elem)]
    if isinstance(inside, str):
        lst = [elem for elem in lst if inside in elem]
    if isinstance(inside, (tuple, list)):
        lst = [elem for elem in lst for inside_elem in inside
               if inside_elem in elem]

    return pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
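As an illustrative call (the arguments here are just an example), the start filter narrows a long attribute listing:

# illustrative usage: list only the pd.DataFrame attributes that start with 'to'
# (to_csv, to_dict, ...) in a compact table
show_method_attributes(pd.DataFrame, ncols=4, start='to')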
ifile1 = '../data/raw/train.csv'
ifile2 = 'https://github.com/bhishanpdl/Project_Toxic_Comments/blob/master/data/raw/train.csv?raw=true'

if ENV_BHISHAN:
    df_raw = pd.read_csv(ifile1)
if ENV_COLAB:
    df_raw = pd.read_csv(ifile2)
df = df_raw.copy()
print(df.isnull().sum().sum())
print(df.shape)
display(df.head(2))
0
(159571, 8)
| | id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate |
|---|---|---|---|---|---|---|---|---|
| 0 | 0000997932d777bf | Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 000103f0d9cfb60f | D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC) | 0 | 0 | 0 | 0 | 0 | 0 |
# a comment is "clean" when none of the six toxicity labels is set
df['clean'] = df.loc[:, 'toxic':'identity_hate'].sum(axis=1) == 0
print(df['clean'].sum())
df.head(2)
143346
| | id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate | clean |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0000997932d777bf | Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27 | 0 | 0 | 0 | 0 | 0 | 0 | True |
| 1 | 000103f0d9cfb60f | D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC) | 0 | 0 | 0 | 0 | 0 | 0 | True |
ser_sum = df.iloc[:,2:].sum(axis=0)
ser_sum
toxic             15294.0
severe_toxic       1595.0
obscene            8449.0
threat              478.0
insult             7877.0
identity_hate      1405.0
clean            143346.0
dtype: float64
ax = ser_sum.sort_values().plot.bar(color=sns.color_palette('husl', len(ser_sum)))

# annotate each bar with its count
for p in ax.patches:
    x, y = p.get_x(), p.get_height()
    ax.text(x, y, f'{y:,.0f}', fontsize=14, color='blue')
tqdm.pandas()
ser_comments = df['comment_text'].progress_apply(nltk.word_tokenize)
ser_comments.head()
0    [Explanation, Why, the, edits, made, under, my, username, Hardcore, Metallica, Fan, were, reverted, ?, They, were, n't, vandalisms, ,, just, closure, on, some, GAs, after, I, voted, at, New, York, Dolls, FAC, ., And, please, do, n't, remove, the, template, from, the, talk, page, since, I, 'm, retired, now.89.205.38.27]
1    [D'aww, !, He, matches, this, background, colour, I, 'm, seemingly, stuck, with, ., Thanks, ., (, talk, ), 21:51, ,, January, 11, ,, 2016, (, UTC, )]
2    [Hey, man, ,, I, 'm, really, not, trying, to, edit, war, ., It, 's, just, that, this, guy, is, constantly, removing, relevant, information, and, talking, to, me, through, edits, instead, of, my, talk, page, ., He, seems, to, care, more, about, the, formatting, than, the, actual, info, .]
3    [``, More, I, ca, n't, make, any, real, suggestions, on, improvement, -, I, wondered, if, the, section, statistics, should, be, later, on, ,, or, a, subsection, of, ``, '', types, of, accidents, '', '', -I, think, the, references, may, need, tidying, so, that, they, are, all, in, the, exact, same, format, ie, date, format, etc, ., I, can, do, that, later, on, ,, if, no-one, else, does, first, -, if, you, have, any, preferences, for, formatting, style, on, references, or, want, to, do, it, yo...
4    [You, ,, sir, ,, are, my, hero, ., Any, chance, you, remember, what, page, that, 's, on, ?]
Name: comment_text, dtype: object
ser_comments.shape, df.shape
((159571,), (159571, 9))
# ser_comments[0]
df[df['comment_text']==''].shape
(0, 9)
df[df.comment_text.duplicated()].shape
(0, 9)
toxic = df[df.toxic == 1]['comment_text'].to_numpy()
severe_toxic = df[df.severe_toxic == 1]['comment_text'].to_numpy()
obscene = df[df.obscene == 1]['comment_text'].to_numpy()
threat = df[df.threat == 1]['comment_text'].to_numpy()
insult = df[df.insult == 1]['comment_text'].to_numpy()
identity_hate = df[df.identity_hate == 1]['comment_text'].to_numpy()
from wordcloud import WordCloud, STOPWORDS
def plot_wordcloud(text_arr):
    """Plot a word cloud from an array (or list) of strings."""
    text_str = ' '.join(text_arr)

    # initialize wordcloud
    wc = WordCloud(width=1600, height=800, background_color="white",
                   max_words=50, stopwords=STOPWORDS)

    # get wordcloud
    wc.generate(text_str)

    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.show()
plot_wordcloud(toxic)
# contraction-expansion patterns; note they are case-sensitive
# (lowercase contractions only), and raw strings avoid invalid-escape warnings
replacement_patterns = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
    (r'(\w+)\'d', r'\g<1> would'),
]
class RegexpReplacer(object):
    """Expand contractions using the regex patterns above."""

    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl)
                         for (regex, repl) in patterns]

    def replace(self, text):
        s = text
        for (pattern, repl) in self.patterns:
            s = re.sub(pattern, repl, s)
        return s
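A quick sanity check of the replacer on a made-up sentence (the input is illustrative, not from the dataset):

replacer_check = RegexpReplacer()
print(replacer_check.replace("I can't believe you won't help, it's sad"))
# I cannot believe you will not help, it is sad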
from tqdm.notebook import trange

lemmatizer = nltk.stem.WordNetLemmatizer()
# use sets for O(1) membership tests; avoid shadowing the nltk stopwords module
stop_words = set(nltk.corpus.stopwords.words('english'))
punct = set(string.punctuation)
tokenizer = nltk.tokenize.TweetTokenizer()
replacer = RegexpReplacer()
def process_text_arr(arr):
    """Expand contractions, tokenize, drop stopwords/punctuation, and lemmatize."""
    arr_processed = []
    for i in trange(arr.shape[0]):
        # expand contractions, then tokenize
        lst = tokenizer.tokenize(replacer.replace(arr[i]))

        # remove stopwords and punctuation
        lst = [word for word in lst
               if word.lower() not in stop_words
               and word.lower() not in punct]

        # lemmatize as verbs ('v'), e.g. 'hated' -> 'hate'
        lst = [lemmatizer.lemmatize(word, 'v') for word in lst]

        arr_processed.extend(lst)
    return arr_processed
toxic1 = process_text_arr(toxic)
fd = nltk.FreqDist(word for word in toxic1)
show_method_attributes(fd)
Object Type: <class 'nltk.probability.FreqDist'>
| | 0 | 1 | 2 | 3 | 4 | 5 | 6 |
|---|---|---|---|---|---|---|---|
| 0 | B | copy | get | max | pop | setdefault | unicode_repr |
| 1 | N | elements | hapaxes | most_common | popitem | subtract | update |
| 2 | Nr | freq | items | pformat | pprint | tabulate | values |
| 3 | clear | fromkeys | keys | plot | r_Nr | | |
fd.plot(40)
ax = pd.DataFrame(fd.most_common(20)).plot.bar(x=0, y=1, color=sns.color_palette('husl', 20))

# annotate each bar with its count
for p in ax.patches:
    x, y = p.get_x(), p.get_height()
    ax.text(x, y, f'{y:,.0f}', fontsize=14, color='blue', rotation=45)

plt.title('Frequency distribution of toxic words')
plt.xlabel('word')
plt.ylabel('freq');
plot_wordcloud(toxic1)
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(strip_accents='unicode',
                     analyzer='word',
                     ngram_range=(1, 1),
                     use_idf=True,
                     smooth_idf=True,
                     sublinear_tf=True,
                     stop_words='english')
def txt_arr_to_tfidf(txt_arr):
    """Return the 10 terms with the highest mean TF-IDF weight."""
    tvec_weights = tf.fit_transform(txt_arr)
    weights = np.asarray(tvec_weights.mean(axis=0)).ravel().tolist()
    df_weights = pd.DataFrame({'term': tf.get_feature_names(), 'weight': weights})
    return df_weights.sort_values(by='weight', ascending=False).head(10)
df_toxic_tfidf = txt_arr_to_tfidf(toxic1)
df_toxic_tfidf
| | term | weight |
|---|---|---|
| 11084 | fuck | 0.026908 |
| 25721 | suck | 0.009767 |
| 24072 | shit | 0.008288 |
| 16023 | like | 0.008268 |
| 29315 | wikipedia | 0.007847 |
| 18544 | nigger | 0.007502 |
| 2556 | ass | 0.006646 |
| 10014 | faggot | 0.006106 |
| 12622 | hate | 0.005971 |
| 19623 | page | 0.005795 |
df_toxic_tfidf.plot.barh(x='term', y='weight');
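With TF-IDF features in hand, the task from the problem statement (a probability of each toxicity type per comment) can be sketched with a one-vs-rest logistic regression. This is a minimal sketch, assuming scikit-learn and fitting on the full data for brevity; a real model would use a train/validation split and per-label evaluation:

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# reuse the TfidfVectorizer defined above on the raw comment text
X = tf.fit_transform(df['comment_text'])
y = df[label_cols].values

# one binary logistic regression per label (illustrative hyperparameters)
clf = OneVsRestClassifier(LogisticRegression(solver='liblinear', random_state=SEED))
clf.fit(X, y)

# shape (n_comments, 6): probability of each type of toxicity for each comment
probs = clf.predict_proba(X)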