You are provided with a large number of Wikipedia comments which have been labeled by human raters for toxic behavior. The types of toxicity are:
toxic
severe_toxic
obscene
threat
insult
identity_hate
You must create a model which predicts the probability of each type of toxicity for each comment.
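This is a multi-label problem: each comment may carry any subset of the six labels, so the model must output one probability per label. As a rough illustration of that output shape only (not the approach developed in this notebook; train_texts, train_labels, and test_texts are placeholder names), a minimal baseline could look like the sketch below.

# Hypothetical baseline sketch: TF-IDF features plus one logistic regression per label,
# so predict_proba yields an independent probability for each of the six labels.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline

baseline = make_pipeline(
    TfidfVectorizer(max_features=50000, ngram_range=(1, 2)),
    OneVsRestClassifier(LogisticRegression(max_iter=1000)),
)
# baseline.fit(train_texts, train_labels)      # train_labels: n_comments x 6 matrix of 0/1
# probs = baseline.predict_proba(test_texts)   # shape: (n_comments, 6)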
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (12,8)})
plt.style.use('ggplot') # better than sns styles.
matplotlib.rcParams['figure.figsize'] = 12,8
import os
import time
# random state
SEED=100
np.random.seed(SEED)
# Jupyter notebook settings for pandas
#pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
from pandas.api.types import CategoricalDtype
np.set_printoptions(precision=3)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 500)
import IPython
from IPython.display import display, HTML, Image, Markdown
print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
[('numpy', '1.17.4'), ('pandas', '0.25.3'), ('seaborn', '0.9.0'), ('matplotlib', '3.1.1')]
import re
import string # string.punctuation
from string import digits
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
ENV_BHISHAN = None
try:
    import bhishan
    ENV_BHISHAN = True
    print("Environment: Bhishan's Laptop")
except ImportError:
    pass
import sys

ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    # mount google drive if needed
    # from google.colab import drive
    # drive.mount('/content/drive')
    # dat_dir = 'drive/My Drive/Colab Notebooks/data/'
    # sys.path.append(dat_dir)

    # extra installs
    # !pip install pyldavis
    # !pip install hyperopt
    # !pip install catboost

    print('Environment: Google Colaboratory.')
Environment: Google Colaboratory.
ifile1 = '../data/raw/train.csv'
ifile2 = 'https://github.com/bhishanpdl/Project_Toxic_Comments/blob/master/data/raw/train.csv?raw=true'

if ENV_BHISHAN:
    df_raw = pd.read_csv(ifile1)
elif ENV_COLAB:
    df_raw = pd.read_csv(ifile2)
else:
    df_raw = pd.read_csv(ifile2)  # default to the GitHub copy when neither environment flag is set
df = df_raw.copy()
print(df.isnull().sum().sum())
print(df.shape)
display(df.head(2))
0
(159571, 8)
| | id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate |
|---|---|---|---|---|---|---|---|---|
0 | 0000997932d777bf | Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 000103f0d9cfb60f | D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC) | 0 | 0 | 0 | 0 | 0 | 0 |
df['clean'] = df.loc[:, 'toxic':'identity_hate'].sum(axis=1) == 0
print(df['clean'].sum())
df.head(2)
143346
| | id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate | clean |
|---|---|---|---|---|---|---|---|---|---|
0 | 0000997932d777bf | Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27 | 0 | 0 | 0 | 0 | 0 | 0 | True |
1 | 000103f0d9cfb60f | D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC) | 0 | 0 | 0 | 0 | 0 | 0 | True |
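Roughly 90% of the comments (143,346 of 159,571) carry none of the six labels. Next, add some hand-crafted text statistics as candidate features:

# hand-crafted text statistics: length, casing, punctuation, word/sentence counts, smilies, stopwords, IP addresses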
df['total_length'] = df['comment_text'].apply(len)
df["mean_word_len"] = df["comment_text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
df['capitals'] = df['comment_text'].apply(
lambda comment: sum(1 for c in comment if c.isupper()))
df['caps_vs_length'] = df.apply(
    lambda row: float(row['capitals']) / float(row['total_length']),
    axis=1)
df['num_exclamation_marks'] = df['comment_text'].apply(lambda comment: comment.count('!'))
df['num_question_marks'] = df['comment_text'].apply(lambda comment: comment.count('?'))
df['num_punctuation'] = df['comment_text'].apply(
lambda comment: sum(comment.count(w) for w in '.,;:'))
df['num_symbols'] = df['comment_text'].apply(
lambda comment: sum(comment.count(w) for w in '*&$%'))
df['num_words'] = df['comment_text'].apply(lambda comment: len(comment.split()))
df['num_sent']=df["comment_text"].apply(lambda x: len(re.findall("\n",str(x)))+1)
df['num_unique_words'] = df['comment_text'].apply(
lambda comment: len(set(w for w in comment.split())))
df['words_vs_unique'] = df['num_unique_words'] / df['num_words']
df["num_words_title"] = df["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
df['num_smilies'] = df['comment_text'].apply(
lambda comment: sum(comment.count(w) for w in (':-)', ':)', ';-)', ';)')))
from nltk.corpus import stopwords
eng_stopwords = set(stopwords.words("english"))
df["num_stopwords"] = df["comment_text"].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))
# ip address
df['ip'] = df['comment_text'].apply(lambda x: re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", str(x)))
df['num_ip']=df["ip"].apply(lambda x: len(x))
df.head(2).T
| | 0 | 1 |
|---|---|---|
id | 0000997932d777bf | 000103f0d9cfb60f |
comment_text | Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27 | D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC) |
toxic | 0 | 0 |
severe_toxic | 0 | 0 |
obscene | 0 | 0 |
threat | 0 | 0 |
insult | 0 | 0 |
identity_hate | 0 | 0 |
total_length | 264 | 112 |
capitals | 17 | 8 |
caps_vs_length | 0.0643939 | 0.0714286 |
num_exclamation_marks | 0 | 1 |
num_question_marks | 1 | 0 |
num_punctuation | 6 | 5 |
num_symbols | 0 | 0 |
num_words | 43 | 17 |
num_unique_words | 41 | 17 |
words_vs_unique | 0.953488 | 1 |
num_smilies | 0 | 0 |
# check correlation to find usefulness of these added columns
features = ('total_length', 'capitals', 'caps_vs_length',
'num_exclamation_marks','num_question_marks', 'num_punctuation',
'num_words', 'num_unique_words',
'words_vs_unique', 'num_smilies', 'num_symbols')
columns = ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
rows = [{c:df[f].corr(df[c]) for c in columns} for f in features]
df_corr = pd.DataFrame(rows, index=features)
ax = sns.heatmap(df_corr, vmin=-0.2, vmax=0.2, center=0.0,annot=True)
# If a feature's correlation varies across the six labels (along the x-axis of
# the heatmap), the feature carries signal; if it is roughly the same for every
# label, it adds little.
#
# Useful features found:
#
# - proportion of capitals (caps_vs_length)
# - number of unique words
# - number of exclamation marks
# - number of punctuation marks
df_corr.std(axis=1).sort_values(ascending=False)
caps_vs_length            0.061509
capitals                  0.037464
words_vs_unique           0.032472
num_unique_words          0.030480
total_length              0.025462
num_words                 0.024337
num_punctuation           0.021978
num_exclamation_marks     0.017933
num_question_marks        0.012020
num_symbols               0.003104
num_smilies               0.001966
dtype: float64
An example of the kind of messy comment the cleaning steps below have to handle: `You are annoying!!! goJumpOff4Cliff pleaseeeeeeee`
import string
df['comment_text'] = df['comment_text'].str.translate(str.maketrans('', '', string.punctuation))
df.iloc[1,1]
'Daww He matches this background colour Im seemingly stuck with Thanks talk 2151 January 11 2016 UTC'
df['comment_text'] = df['comment_text'].str.translate(str.maketrans('', '', '\n'))
df['comment_text'] = df['comment_text'].str.translate(str.maketrans('', '', string.digits))
df.iloc[1,1]
'Daww He matches this background colour Im seemingly stuck with Thanks talk January UTC'
Split camelCase words: for instance, convert `whyAreYou` to `why Are You`.
import re
df['comment_text'] = df['comment_text'].apply(lambda x: re.sub(r'([a-z])([A-Z])',r'\1 \2',x))
df.iloc[1,1]
'Daww He matches this background colour Im seemingly stuck with Thanks talk January UTC'
df['comment_text'] = df['comment_text'].str.lower()
df.iloc[1,1]
'daww he matches this background colour im seemingly stuck with thanks talk january utc'
df['comment_text'] = df['comment_text'].str.split()
df.iloc[1,1]
['daww', 'he', 'matches', 'this', 'background', 'colour', 'im', 'seemingly', 'stuck', 'with', 'thanks', 'talk', 'january', 'utc']
stop = set(stopwords.words('english'))
df['comment_text'] = df['comment_text'].apply(lambda x: [item for item in x if item not in stop])
df.head(2)
| | id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate | total_length | capitals | caps_vs_length | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_words | num_unique_words | words_vs_unique | num_smilies |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0000997932d777bf | [explanation, edits, made, username, hardcore, metallica, fan, reverted, werent, vandalisms, closure, gas, voted, new, york, dolls, fac, please, dont, remove, template, talk, page, since, im, retired] | 0 | 0 | 0 | 0 | 0 | 0 | 264 | 17 | 0.064394 | 0 | 1 | 6 | 0 | 43 | 41 | 0.953488 | 0 |
1 | 000103f0d9cfb60f | [daww, matches, background, colour, im, seemingly, stuck, thanks, talk, january, utc] | 0 | 0 | 0 | 0 | 0 | 0 | 112 | 8 | 0.071429 | 1 | 0 | 5 | 0 | 17 | 17 | 1.000000 | 0 |
import nltk
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_lst(lst):
return [lemmatizer.lemmatize(word) for word in lst]
df['comment_text'] = df['comment_text'].apply(lemmatize_lst)
df.head(2)
| | id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate | total_length | capitals | caps_vs_length | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_words | num_unique_words | words_vs_unique | num_smilies |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0000997932d777bf | [explanation, edits, made, username, hardcore, metallica, fan, reverted, werent, vandalism, closure, gas, voted, new, york, doll, fac, please, dont, remove, template, talk, page, since, im, retired] | 0 | 0 | 0 | 0 | 0 | 0 | 264 | 17 | 0.064394 | 0 | 1 | 6 | 0 | 43 | 41 | 0.953488 | 0 |
1 | 000103f0d9cfb60f | [daww, match, background, colour, im, seemingly, stuck, thanks, talk, january, utc] | 0 | 0 | 0 | 0 | 0 | 0 | 112 | 8 | 0.071429 | 1 | 0 | 5 | 0 | 17 | 17 | 1.000000 | 0 |
# collapse a repeated substring within a token, e.g. haha ==> ha
# (comment_text now holds lists of tokens, so apply the regex token by token)
df['comment_text'] = df['comment_text'].apply(
    lambda words: [re.sub(r'(\w+)\1', r'\1', w) for w in words])
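To keep the preprocessing readable, the same steps can be collected into a single helper. This is an illustrative sketch only (the function name and structure are not from the original notebook); it reuses the imports above and can be sanity-checked on the messy example shown earlier.

# Hypothetical helper bundling the cleaning steps above (a sketch, not the notebook's own code)
def clean_comment(text):
    stop = set(stopwords.words('english'))
    lemmatizer = nltk.stem.WordNetLemmatizer()
    text = str(text)
    text = text.translate(str.maketrans('', '', string.punctuation))    # drop punctuation
    text = text.translate(str.maketrans('', '', '\n' + string.digits))  # drop newlines and digits
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)                    # split camelCase
    words = text.lower().split()                                        # lowercase and tokenize
    words = [w for w in words if w not in stop]                         # remove stopwords
    words = [lemmatizer.lemmatize(w) for w in words]                    # lemmatize
    return [re.sub(r'(\w+)\1', r'\1', w) for w in words]                # collapse repeats, haha ==> ha

clean_comment("You are annoying!!! goJumpOff4Cliff pleaseeeeeeee")  # returns a list of cleaned tokens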
df.columns
Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'total_length', 'capitals', 'caps_vs_length', 'num_exclamation_marks', 'num_question_marks', 'num_punctuation', 'num_symbols', 'num_words', 'num_unique_words', 'words_vs_unique', 'num_smilies'], dtype='object')
cols = [ 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
data1 = [df.query(f""" {x} == 1""").shape[0] for x in cols ]
ser1 = pd.Series(data1, index=cols)
ser1
toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64
sns.barplot(x=ser1.values, y=cols, order=ser1.sort_values().index)
(bar plot of comment counts per toxicity label)
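The labels are highly imbalanced: of 159,571 comments, 15,294 are toxic, while only 478 carry the threat label and 1,405 the identity_hate label.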
# Expand apostrophes
map_apos = {
"you're": 'you are',
"i'm": 'i am',
"he's": 'he is',
"she's": 'she is',
"it's": 'it is',
"they're": 'they are',
"can't": 'can not',
"couldn't": 'could not',
"don't": 'do not',
"don;t": 'do not',
"didn't": 'did not',
"doesn't": 'does not',
"isn't": 'is not',
"wasn't": 'was not',
"aren't": 'are not',
"weren't": 'were not',
"won't": 'will not',
"wouldn't": 'would not',
"hasn't": 'has not',
"haven't": 'have not',
"what's": 'what is',
"that's": 'that is',
}
s = df_raw['comment_text'].head(1)
s
0 Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27 Name: comment_text, dtype: object
s.str.split()
0 [Explanation, Why, the, edits, made, under, my, username, Hardcore, Metallica, Fan, were, reverted?, They, weren't, vandalisms,, just, closure, on, some, GAs, after, I, voted, at, New, York, Dolls, FAC., And, please, don't, remove, the, template, from, the, talk, page, since, I'm, retired, now.89.205.38.27] Name: comment_text, dtype: object
s2 = pd.Series(s.str.split()[0])
s3 = s2.map(map_apos).fillna(s2)
s3 = s3.str.cat(sep=' ')
s3
"Explanation Why the edits made under my username Hardcore Metallica Fan were reverted? They were not vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please do not remove the template from the talk page since I'm retired now.89.205.38.27"