You are provided with a large number of Wikipedia comments which have been labeled by human raters for toxic behavior. The types of toxicity are:
toxic
severe_toxic
obscene
threat
insult
identity_hate
You must create a model which predicts the probability of each type of toxicity for each comment.
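This is a multi-label problem: each comment may carry any subset of the six labels, so the model must output one probability per label. As a rough illustration of that output shape only (not the approach developed in this notebook; train_texts, train_labels, and test_texts are placeholder names), a minimal baseline could look like the sketch below.

# Hypothetical baseline sketch: TF-IDF features plus one logistic regression per label,
# so predict_proba yields an independent probability for each of the six labels.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline

baseline = make_pipeline(
    TfidfVectorizer(max_features=50000, ngram_range=(1, 2)),
    OneVsRestClassifier(LogisticRegression(max_iter=1000)),
)
# baseline.fit(train_texts, train_labels)      # train_labels: n_comments x 6 matrix of 0/1
# probs = baseline.predict_proba(test_texts)   # shape: (n_comments, 6)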
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (12,8)})
plt.style.use('ggplot') # better than sns styles.
matplotlib.rcParams['figure.figsize'] = 12,8
import os
import time
# random state
SEED=100
np.random.seed(SEED)
# Jupyter notebook settings for pandas
#pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
from pandas.api.types import CategoricalDtype
np.set_printoptions(precision=3)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 500)
import IPython
from IPython.display import display, HTML, Image, Markdown
print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
[('numpy', '1.17.4'), ('pandas', '0.25.3'), ('seaborn', '0.9.0'), ('matplotlib', '3.1.1')]
import re
import string # string.punctuation
from string import digits
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
ENV_BHISHAN = None
try:
    import bhishan
    ENV_BHISHAN = True
    print("Environment: Bhishan's Laptop")
except ImportError:
    pass
import sys

ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    # mount google drive if needed
    # from google.colab import drive
    # drive.mount('/content/drive')
    # dat_dir = 'drive/My Drive/Colab Notebooks/data/'
    # sys.path.append(dat_dir)

    # extra installs
    # !pip install pyldavis
    # !pip install hyperopt
    # !pip install catboost

    print('Environment: Google Colaboratory.')
Environment: Google Colaboratory.
ifile1 = '../data/raw/train.csv'
ifile2 = 'https://github.com/bhishanpdl/Project_Toxic_Comments/blob/master/data/raw/train.csv?raw=true'

if ENV_BHISHAN:
    df_raw = pd.read_csv(ifile1)
elif ENV_COLAB:
    df_raw = pd.read_csv(ifile2)
else:
    df_raw = pd.read_csv(ifile2)  # default to the GitHub copy when neither environment flag is set
df = df_raw.copy()
print(df.isnull().sum().sum())
print(df.shape)
display(df.head(2))
0
(159571, 8)
| | id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate |
|---|---|---|---|---|---|---|---|---|
0 | 0000997932d777bf | Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 000103f0d9cfb60f | D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC) | 0 | 0 | 0 | 0 | 0 | 0 |
df['clean'] = df.loc[:, 'toxic':'identity_hate'].sum(axis=1) == 0
print(df['clean'].sum())
df.head(2)
143346
| | id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate | clean |
|---|---|---|---|---|---|---|---|---|---|
0 | 0000997932d777bf | Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27 | 0 | 0 | 0 | 0 | 0 | 0 | True |
1 | 000103f0d9cfb60f | D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC) | 0 | 0 | 0 | 0 | 0 | 0 | True |
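Roughly 90% of the comments (143,346 of 159,571) carry none of the six labels. Next, add some hand-crafted text statistics as candidate features:

# hand-crafted text statistics: length, casing, punctuation, word/sentence counts, smilies, stopwords, IP addresses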
df['total_length'] = df['comment_text'].apply(len)
df["mean_word_len"] = df["comment_text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
df['capitals'] = df['comment_text'].apply(
lambda comment: sum(1 for c in comment if c.isupper()))
df['caps_vs_length'] = df.apply(
    lambda row: float(row['capitals']) / float(row['total_length']),
    axis=1)
df['num_exclamation_marks'] = df['comment_text'].apply(lambda comment: comment.count('!'))
df['num_question_marks'] = df['comment_text'].apply(lambda comment: comment.count('?'))
df['num_punctuation'] = df['comment_text'].apply(
lambda comment: sum(comment.count(w) for w in '.,;:'))
df['num_symbols'] = df['comment_text'].apply(
lambda comment: sum(comment.count(w) for w in '*&$%'))
df['num_words'] = df['comment_text'].apply(lambda comment: len(comment.split()))
df['num_sent']=df["comment_text"].apply(lambda x: len(re.findall("\n",str(x)))+1)
df['num_unique_words'] = df['comment_text'].apply(
lambda comment: len(set(w for w in comment.split())))
df['words_vs_unique'] = df['num_unique_words'] / df['num_words']
df["num_words_title"] = df["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
df['num_smilies'] = df['comment_text'].apply(
lambda comment: sum(comment.count(w) for w in (':-)', ':)', ';-)', ';)')))
from nltk.corpus import stopwords
eng_stopwords = set(stopwords.words("english"))
df["num_stopwords"] = df["comment_text"].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))
# ip address
df['ip'] = df['comment_text'].apply(lambda x: re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", str(x)))
df['num_ip']=df["ip"].apply(lambda x: len(x))
df.head(2).T
| | 0 | 1 |
|---|---|---|
id | 0000997932d777bf | 000103f0d9cfb60f |
comment_text | Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27 | D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC) |
toxic | 0 | 0 |
severe_toxic | 0 | 0 |
obscene | 0 | 0 |
threat | 0 | 0 |
insult | 0 | 0 |
identity_hate | 0 | 0 |
total_length | 264 | 112 |
capitals | 17 | 8 |
caps_vs_length | 0.0643939 | 0.0714286 |
num_exclamation_marks | 0 | 1 |
num_question_marks | 1 | 0 |
num_punctuation | 6 | 5 |
num_symbols | 0 | 0 |
num_words | 43 | 17 |
num_unique_words | 41 | 17 |
words_vs_unique | 0.953488 | 1 |
num_smilies | 0 | 0 |
# check correlation to find usefulness of these added columns
features = ('total_length', 'capitals', 'caps_vs_length',
'num_exclamation_marks','num_question_marks', 'num_punctuation',
'num_words', 'num_unique_words',
'words_vs_unique', 'num_smilies', 'num_symbols')
columns = ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
rows = [{c:df[f].corr(df[c]) for c in columns} for f in features]
df_corr = pd.DataFrame(rows, index=features)
ax = sns.heatmap(df_corr, vmin=-0.2, vmax=0.2, center=0.0,annot=True)
# If a feature's correlation varies across the six labels (along the x-axis of
# the heatmap), the feature carries signal; if it is roughly the same for every
# label, it adds little.
#
# Useful features found:
#
# - proportion of capitals (caps_vs_length)
# - number of unique words
# - number of exclamation marks
# - number of punctuation marks
df_corr.std(axis=1).sort_values(ascending=False)
caps_vs_length            0.061509
capitals                  0.037464
words_vs_unique           0.032472
num_unique_words          0.030480
total_length              0.025462
num_words                 0.024337
num_punctuation           0.021978
num_exclamation_marks     0.017933
num_question_marks        0.012020
num_symbols               0.003104
num_smilies               0.001966
dtype: float64
An example of the kind of messy comment the cleaning steps below have to handle: `You are annoying!!! goJumpOff4Cliff pleaseeeeeeee`
import string
df['comment_text'] = df['comment_text'].str.translate(str.maketrans('', '', string.punctuation))
df.iloc[1,1]
'Daww He matches this background colour Im seemingly stuck with Thanks talk 2151 January 11 2016 UTC'
df['comment_text'] = df['comment_text'].str.translate(str.maketrans('', '', '\n'))
df['comment_text'] = df['comment_text'].str.translate(str.maketrans('', '', string.digits))
df.iloc[1,1]
'Daww He matches this background colour Im seemingly stuck with Thanks talk January UTC'
Split camelCase words: for instance, convert `whyAreYou` to `why Are You`.
import re
df['comment_text'] = df['comment_text'].apply(lambda x: re.sub(r'([a-z])([A-Z])',r'\1 \2',x))
df.iloc[1,1]
'Daww He matches this background colour Im seemingly stuck with Thanks talk January UTC'
df['comment_text'] = df['comment_text'].str.lower()
df.iloc[1,1]
'daww he matches this background colour im seemingly stuck with thanks talk january utc'
df['comment_text'] = df['comment_text'].str.split()
df.iloc[1,1]
['daww', 'he', 'matches', 'this', 'background', 'colour', 'im', 'seemingly', 'stuck', 'with', 'thanks', 'talk', 'january', 'utc']
stop = set(stopwords.words('english'))
df['comment_text'] = df['comment_text'].apply(lambda x: [item for item in x if item not in stop])
df.head(2)
| | id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate | total_length | capitals | caps_vs_length | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_words | num_unique_words | words_vs_unique | num_smilies |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0000997932d777bf | [explanation, edits, made, username, hardcore, metallica, fan, reverted, werent, vandalisms, closure, gas, voted, new, york, dolls, fac, please, dont, remove, template, talk, page, since, im, retired] | 0 | 0 | 0 | 0 | 0 | 0 | 264 | 17 | 0.064394 | 0 | 1 | 6 | 0 | 43 | 41 | 0.953488 | 0 |
1 | 000103f0d9cfb60f | [daww, matches, background, colour, im, seemingly, stuck, thanks, talk, january, utc] | 0 | 0 | 0 | 0 | 0 | 0 | 112 | 8 | 0.071429 | 1 | 0 | 5 | 0 | 17 | 17 | 1.000000 | 0 |
import nltk
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_lst(lst):
return [lemmatizer.lemmatize(word) for word in lst]
df['comment_text'] = df['comment_text'].apply(lemmatize_lst)
df.head(2)
| | id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate | total_length | capitals | caps_vs_length | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_words | num_unique_words | words_vs_unique | num_smilies |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0000997932d777bf | [explanation, edits, made, username, hardcore, metallica, fan, reverted, werent, vandalism, closure, gas, voted, new, york, doll, fac, please, dont, remove, template, talk, page, since, im, retired] | 0 | 0 | 0 | 0 | 0 | 0 | 264 | 17 | 0.064394 | 0 | 1 | 6 | 0 | 43 | 41 | 0.953488 | 0 |
1 | 000103f0d9cfb60f | [daww, match, background, colour, im, seemingly, stuck, thanks, talk, january, utc] | 0 | 0 | 0 | 0 | 0 | 0 | 112 | 8 | 0.071429 | 1 | 0 | 5 | 0 | 17 | 17 | 1.000000 | 0 |
# collapse a repeated substring within a token, e.g. haha ==> ha
# (comment_text now holds lists of tokens, so apply the regex token by token)
df['comment_text'] = df['comment_text'].apply(
    lambda words: [re.sub(r'(\w+)\1', r'\1', w) for w in words])
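To keep the preprocessing readable, the same steps can be collected into a single helper. This is an illustrative sketch only (the function name and structure are not from the original notebook); it reuses the imports above and can be sanity-checked on the messy example shown earlier.

# Hypothetical helper bundling the cleaning steps above (a sketch, not the notebook's own code)
def clean_comment(text):
    stop = set(stopwords.words('english'))
    lemmatizer = nltk.stem.WordNetLemmatizer()
    text = str(text)
    text = text.translate(str.maketrans('', '', string.punctuation))    # drop punctuation
    text = text.translate(str.maketrans('', '', '\n' + string.digits))  # drop newlines and digits
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)                    # split camelCase
    words = text.lower().split()                                        # lowercase and tokenize
    words = [w for w in words if w not in stop]                         # remove stopwords
    words = [lemmatizer.lemmatize(w) for w in words]                    # lemmatize
    return [re.sub(r'(\w+)\1', r'\1', w) for w in words]                # collapse repeats, haha ==> ha

clean_comment("You are annoying!!! goJumpOff4Cliff pleaseeeeeeee")  # returns a list of cleaned tokens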
df.columns
Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'total_length', 'capitals', 'caps_vs_length', 'num_exclamation_marks', 'num_question_marks', 'num_punctuation', 'num_symbols', 'num_words', 'num_unique_words', 'words_vs_unique', 'num_smilies'], dtype='object')
cols = [ 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
data1 = [df.query(f""" {x} == 1""").shape[0] for x in cols ]
ser1 = pd.Series(data1, index=cols)
ser1
toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64
sns.barplot(x=ser1.values, y=cols, order=ser1.sort_values().index)
(bar plot of comment counts per toxicity label)
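The labels are highly imbalanced: of 159,571 comments, 15,294 are toxic, while only 478 carry the threat label and 1,405 the identity_hate label.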
# Expand apostrophes
map_apos = {
"you're": 'you are',
"i'm": 'i am',
"he's": 'he is',
"she's": 'she is',
"it's": 'it is',
"they're": 'they are',
"can't": 'can not',
"couldn't": 'could not',
"don't": 'do not',
"don;t": 'do not',
"didn't": 'did not',
"doesn't": 'does not',
"isn't": 'is not',
"wasn't": 'was not',
"aren't": 'are not',
"weren't": 'were not',
"won't": 'will not',
"wouldn't": 'would not',
"hasn't": 'has not',
"haven't": 'have not',
"what's": 'what is',
"that's": 'that is',
}
s = df_raw['comment_text'].head(1)
s
0 Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27 Name: comment_text, dtype: object
s.str.split()
0 [Explanation, Why, the, edits, made, under, my, username, Hardcore, Metallica, Fan, were, reverted?, They, weren't, vandalisms,, just, closure, on, some, GAs, after, I, voted, at, New, York, Dolls, FAC., And, please, don't, remove, the, template, from, the, talk, page, since, I'm, retired, now.89.205.38.27] Name: comment_text, dtype: object
s2 = pd.Series(s.str.split()[0])
s3 = s2.map(map_apos).fillna(s2)
s3 = s3.str.cat(sep=' ')
s3
"Explanation Why the edits made under my username Hardcore Metallica Fan were reverted? They were not vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please do not remove the template from the talk page since I'm retired now.89.205.38.27"