In this project, we use the data from the Kaggle competition "Toxic Comment Classification Challenge" by Jigsaw, and we use only its training data. We split this raw training data into train and test sets and evaluate model performance on the test set.
The dataset is taken from Wikipedia edit text, and each comment is labelled with zero or more of the following categories: toxic, severe_toxic, obscene, threat, insult, identity_hate.
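This is a multi-label (not multi-class) classification problem. Each text row has six binary labels, and any number of them may be 1 at the same time: a clean comment has all six labels equal to 0, while a single comment can also be, for example, both toxic and obscene.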
import os
import sys
import time
# Wall-clock timestamp taken when the notebook starts running
# (presumably used later to report the total run time; that use is not visible here).
time_start_notebook = time.time()
%%capture
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
## install modules
!pip install -U sklearn
!pip install watermark
!pip install tqdm
!pip install scikit-plot
import numpy as np
import pandas as pd
# visualization
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# modelling
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
# random state
# Seed NumPy's global random generator so that NumPy-based randomness
# is repeatable from one run of the notebook to the next.
SEED=100
np.random.seed(SEED)
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark Bhishan Poudel 2020-11-30 CPython 3.7.7 IPython 7.19.0 compiler : Clang 4.0.1 (tags/RELEASE_401/final) system : Darwin release : 19.6.0 machine : x86_64 processor : i386 CPU cores : 4 interpreter: 64bit sklearn 0.23.2 matplotlib 3.2.1 watermark 2.0.2 numpy 1.19.4 seaborn 0.10.1 pandas 1.1.1
# data
# Every raw input file lives under <dat_dir>/raw.
dat_dir = os.path.join('..', 'data')
(path_data_raw,
 path_data_train,
 path_data_test,
 path_data_sample) = (
    os.path.join(dat_dir, 'raw', file_name)
    for file_name in ('jigsaw_toxic.csv.zip', 'train.csv.zip',
                      'test.csv.zip', 'sample.csv')
)
compression = 'zip'

if ENV_COLAB:
    # NOTE(review): this branch currently assigns exactly the same values as
    # the local configuration above, and `r` is not used anywhere in view.
    # It looks like it was meant to point at remote copies of the files -
    # confirm the intended location.
    r = '?raw=true'
    dat_dir = os.path.join('..', 'data')
    (path_data_raw,
     path_data_train,
     path_data_test,
     path_data_sample) = (
        os.path.join(dat_dir, 'raw', file_name)
        for file_name in ('jigsaw_toxic.csv.zip', 'train.csv.zip',
                          'test.csv.zip', 'sample.csv')
    )
    compression = 'zip'
# Load the training split and show its size, its columns, and a preview
# made of the first two and the last two rows.
df_train = pd.read_csv(path_data_train, compression=compression)
print(df_train.shape)
print(df_train.columns)
# DataFrame.append is deprecated and was removed in pandas 2.0;
# pd.concat builds the same head+tail preview and keeps the original index.
display(pd.concat([df_train.head(2), df_train.tail(2)]))
(127656, 8) Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], dtype='object')
id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate | |
---|---|---|---|---|---|---|---|---|
0 | 8d603d50affa1126 | "\nYes, aside, thank you for trying to answer ... | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 8fb3576937b9e0d0 | March 2010 (UTC)\n\nThanks! and understood abo... | 0 | 0 | 0 | 0 | 0 | 0 |
127654 | 95df37d4a69b607d | I am assuming that there is no point trying to... | 0 | 0 | 0 | 0 | 0 | 0 |
127655 | 668ba87c1b6a3f31 | "\nPlus, take a look! Have I made any outing ... | 0 | 0 | 0 | 0 | 0 | 0 |
Reference: https://www.kaggle.com/loganathanspr/toxic-comments-insight-into-datasets
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff
init_notebook_mode(connected=True)
# Plotly definitions
# ------------------

# Background colour shared by the paper and the plotting area
paper_bgcolor = "rgb(240, 240, 240)"
plot_bgcolor = "rgb(240, 240, 240)"

# Three-colour palette (red, green, blue)
rgb_def = [
    'rgb(228,26,28)',
    'rgb(77,175,74)',
    'rgb(55,126,184)',
]

# Two-colour palette: a muted grey and a highlight blue
contra_2_cols = [
    "rgb(150,150,150)",
    "rgb(55,126,184)",
]

# Bar-chart axis templates
# Template 1: x axis hidden (no zero line, axis line, grid or tick labels);
# grey tick font of size 9 on both axes.
bchart_xaxis_temp1 = {
    "zeroline": False,
    "showline": False,
    "showgrid": False,
    "showticklabels": False,
    "tickfont": {"size": 9, "color": "grey"},
}
bchart_yaxis_temp1 = {
    "tickfont": {"size": 9, "color": "grey"},
}

# Template 2: identical to template 1 except for a tick font of size 10.
bchart_xaxis_temp2 = {
    "zeroline": False,
    "showline": False,
    "showgrid": False,
    "showticklabels": False,
    "tickfont": {"size": 10, "color": "grey"},
}
bchart_yaxis_temp2 = {
    "tickfont": {"size": 10, "color": "grey"},
}

# Heatmap axis template: axis fully hidden, including tick marks
heatmap_axis_temp1 = {
    "zeroline": False,
    "showline": False,
    "showgrid": False,
    "showticklabels": False,
    "ticks": '',
}
# Every column from the third one onwards is treated as a label column.
comment_types = list(df_train.columns[2:])


def _cooccurrence_row(frame, major_label, labels):
    """Return, for rows where `major_label` == 1, the column sum of each label."""
    flagged_rows = frame[frame[major_label] == 1]
    return [flagged_rows[other_label].sum() for other_label in labels]


# Entry [i, j] is the sum of label j over the rows whose label i equals 1.
cmnt_count_matrix = np.array(
    [_cooccurrence_row(df_train, major_label, comment_types)
     for major_label in comment_types]
)
cmnt_count_matrix
array([[12202, 1282, 6368, 353, 5865, 1056], [ 1282, 1282, 1216, 87, 1105, 252], [ 6368, 1216, 6782, 235, 4943, 830], [ 353, 87, 235, 379, 240, 78], [ 5865, 1105, 4943, 240, 6292, 943], [ 1056, 252, 830, 78, 943, 1136]])
# plotly.tools.make_subplots is deprecated; plotly.subplots is its replacement.
from plotly.subplots import make_subplots

# Co-occurrence bar charts: one horizontal bar chart per comment type on a
# 2 x 3 grid, each showing one row of cmnt_count_matrix sorted ascending.
fig_coords = [(1, 1), (1, 2), (1, 3), (2, 1), (2, 2), (2, 3)]
axes_names = [("x1", "y1"), ("x2", "y2"), ("x3", "y3"), ("x4", "y4"), ("x5", "y5"), ("x6", "y6")]
axes_lo_names = [("xaxis1", "yaxis1"), ("xaxis2", "yaxis2"), ("xaxis3", "yaxis3"), ("xaxis4", "yaxis4"), ("xaxis5", "yaxis5"), ("xaxis6", "yaxis6")]

fig = make_subplots(
    rows=2,
    cols=3,
    horizontal_spacing=0.15,
    vertical_spacing=0.25,
    subplot_titles=tuple(comment_types[:len(fig_coords)]),
)

# One subplot per comment type, limited to the number of grid cells.
for i, (grid_row, grid_col) in enumerate(fig_coords[:len(comment_types)]):
    inner_count = pd.Series(cmnt_count_matrix[i, :], index=comment_types).sort_values()
    trace = go.Bar(x=inner_count, y=list(inner_count.index), orientation='h')
    # add_trace with row/col replaces the deprecated append_trace.
    fig.add_trace(trace, row=grid_row, col=grid_col)

# Apply the same axis templates to all six subplots.
axis_layout = {}
for x_axis_key, y_axis_key in axes_lo_names:
    axis_layout[x_axis_key] = bchart_xaxis_temp2
    axis_layout[y_axis_key] = bchart_yaxis_temp2

fig["layout"].update(
    showlegend=False,
    title="<b>Co-occurrence of comment types</b>",
    # go.Margin is deprecated; go.layout.Margin is the supported type.
    margin=go.layout.Margin(
        l=100,
        r=100,
        t=100,
        b=25,
    ),
    autosize=False,
    width=900,
    height=500,
    **axis_layout
)
iplot(fig)
# As a heatmap
# Annotated heatmap of the co-occurrence counts; the colour scale runs from 1
# up to the largest count in the matrix.
fig = ff.create_annotated_heatmap(
    z=cmnt_count_matrix,
    x=comment_types,
    y=comment_types,
    colorscale='YlGnBu',
    zmin=1,
    zmax=cmnt_count_matrix.max()
)
# Put the x-axis labels below the heatmap.
fig["layout"]["xaxis"].update(side="bottom")
fig["layout"].update(
    title="<b>Co-occurrence of comment types</b>",
    xaxis=dict(
        title="Major comment category",
        tickfont=dict(
            color="grey"
        )
    ),
    yaxis=dict(
        title="Co-occurring comment category",
        tickfont=dict(
            color="grey"
        )
    ),
    # go.Margin is deprecated; go.layout.Margin is the supported type.
    margin=go.layout.Margin(
        l=150,
        r=150,
        t=150,
        b=75
    ),
    autosize=False,
    width=900,
    height=450,
)
iplot(fig)
/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/plotly/tools.py:465: DeprecationWarning: plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead /Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/plotly/graph_objs/_deprecations.py:410: DeprecationWarning: plotly.graph_objs.Margin is deprecated. Please replace it with one of the following more specific types - plotly.graph_objs.layout.Margin
Now comes the meaty part. What kind of vocabulary is used in different types of comments? We are especially interested in bad comments in general. Let's find the top 30 words for each comment type in the training data. We do this by computing the TF-IDF of the training data set and then, for each comment category, picking the words with the highest mean TF-IDF weight.
# Stop words: union of scikit-learn's built-in English list and NLTK's list.
# Sorted so the resulting list has a reproducible order.
stop_words_new = sorted(
    sklearn.feature_extraction.text.ENGLISH_STOP_WORDS.union(stopwords.words("english"))
)

# Bag-of-words counts; min_df=2 drops terms that occur in fewer than two comments.
count_vect = CountVectorizer(min_df=2, stop_words=stop_words_new)
train_counts = count_vect.fit_transform(df_train["comment_text"])

# Re-weight the raw counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_counts)

# Vocabulary as an array, aligned with the columns of train_tfidf.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on versions that predate it.
try:
    feature_names = count_vect.get_feature_names_out()
except AttributeError:
    feature_names = count_vect.get_feature_names()
features_array = np.asarray(feature_names)
# plotly.tools.make_subplots is deprecated; plotly.subplots is its replacement.
from plotly.subplots import make_subplots

# Top words per comment type: one horizontal bar chart per label on a 2 x 3
# grid, showing the words with the highest mean TF-IDF weight among the
# comments that carry that label.
fig_coords = [(1, 1), (1, 2), (1, 3), (2, 1), (2, 2), (2, 3)]
axes_names = [("x1", "y1"), ("x2", "y2"), ("x3", "y3"), ("x4", "y4"), ("x5", "y5"), ("x6", "y6")]
axes_lo_names = [("xaxis1", "yaxis1"), ("xaxis2", "yaxis2"), ("xaxis3", "yaxis3"), ("xaxis4", "yaxis4"), ("xaxis5", "yaxis5"), ("xaxis6", "yaxis6")]

fig = make_subplots(
    rows=2,
    cols=3,
    horizontal_spacing=0.01,
    vertical_spacing=0.05,
    subplot_titles=tuple(comment_types[:len(fig_coords)]),
)

num_top_words = 30
for cmnt_type, (grid_row, grid_col) in zip(comment_types, fig_coords):
    # Positional row numbers from a boolean mask, so the selection does not
    # depend on df_train's index labels matching the TF-IDF row order.
    row_positions = np.flatnonzero((df_train[cmnt_type] == 1).to_numpy())

    # Average on the sparse matrix: converting every selected row to a dense
    # array first needs (rows x vocabulary) memory for no benefit.
    mean_tfidf_cmnt_type = np.asarray(train_tfidf[row_positions].mean(axis=0)).ravel()

    # Indices of the highest mean weights, in descending order.
    top_words_ind = mean_tfidf_cmnt_type.argsort()[::-1][:num_top_words]
    top_words_vals = mean_tfidf_cmnt_type[top_words_ind]
    top_words = features_array[top_words_ind]

    # Reverse so the heaviest word ends up at the top of the horizontal chart.
    trace = go.Bar(
        x=top_words_vals[::-1],
        y=top_words[::-1],
        orientation='h',
        name=cmnt_type
    )
    # add_trace with row/col replaces the deprecated append_trace.
    fig.add_trace(trace, row=grid_row, col=grid_col)

# Apply the same axis templates to all six subplots.
axis_layout = {}
for x_axis_key, y_axis_key in axes_lo_names:
    axis_layout[x_axis_key] = bchart_xaxis_temp1
    axis_layout[y_axis_key] = bchart_yaxis_temp1

fig["layout"].update(
    showlegend=False,
    title=f"<b>Top {num_top_words} words for each comment type</b>",
    # go.Margin is deprecated; go.layout.Margin is the supported type.
    margin=go.layout.Margin(
        l=75,
        r=75,
        t=100,
        b=100,
    ),
    autosize=False,
    width=900,
    height=900,
    **axis_layout
)
iplot(fig)
/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/plotly/tools.py:465: DeprecationWarning: plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead /Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/plotly/graph_objs/_deprecations.py:410: DeprecationWarning: plotly.graph_objs.Margin is deprecated. Please replace it with one of the following more specific types - plotly.graph_objs.layout.Margin