import os
import sys
import time

# Wall-clock timestamp (seconds since the epoch) taken when the notebook starts.
time_start_notebook = time.time()


%%capture
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    ## install modules
    !pip install -U sklearn
    !pip install watermark
    !pip install tqdm
    !pip install scikit-plot


import numpy as np
import pandas as pd

# visualization
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# modelling
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from nltk.corpus import stopwords


# random state
# Fixed seed applied to NumPy's global random number generator.
SEED=100
np.random.seed(SEED)


# versions
# Load the watermark IPython extension and print environment details.
import watermark
%load_ext watermark
# Author and date, Python/IPython versions, and machine information.
%watermark -a "Bhishan Poudel" -d -v -m
print()
# Versions of the packages imported so far.
%watermark -iv

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Bhishan Poudel 2020-11-30 

CPython 3.7.7
IPython 7.19.0

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

sklearn    0.23.2
matplotlib 3.2.1
watermark  2.0.2
numpy      1.19.4
seaborn    0.10.1
pandas     1.1.1


# data
# Every input file lives in the "raw" folder beneath the data directory.
dat_dir = os.path.join('..', 'data')

(
    path_data_raw,
    path_data_train,
    path_data_test,
    path_data_sample,
) = (
    os.path.join(dat_dir, 'raw', file_name)
    for file_name in (
        'jigsaw_toxic.csv.zip',
        'train.csv.zip',
        'test.csv.zip',
        'sample.csv',
    )
)

# Value handed to pandas as the `compression=` argument when reading the CSVs.
compression = 'zip'


if ENV_COLAB:
    # NOTE(review): this branch assigns exactly the same locations as the
    # non-Colab defaults defined earlier in the file, and `r` is not read
    # anywhere in the visible code -- presumably remote URLs were intended
    # here; confirm before relying on it.
    r = '?raw=true'
    compression = 'zip'
    dat_dir = os.path.join('..', 'data')

    path_data_raw, path_data_train, path_data_test, path_data_sample = [
        os.path.join(dat_dir, 'raw', file_name)
        for file_name in (
            'jigsaw_toxic.csv.zip',
            'train.csv.zip',
            'test.csv.zip',
            'sample.csv',
        )
    ]


# Load the training data; `compression` tells pandas how to unpack the file.
df_train = pd.read_csv(path_data_train, compression=compression)
print(df_train.shape)
print(df_train.columns)

# Preview the first two and last two rows. pd.concat is used instead of
# DataFrame.append, which is deprecated and no longer exists in pandas 2.x.
display(pd.concat([df_train.head(2), df_train.tail(2)]))

(127656, 8)
Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')


from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff

# Initialise plotly's offline notebook mode before any iplot() call below.
init_notebook_mode(connected=True)



# Plotly definitions
# ------------------
# Shared colours and axis templates for the plotly figures in this notebook.

# Background colour for the figure paper and for the plotting area
paper_bgcolor = "rgb(240, 240, 240)"
plot_bgcolor = "rgb(240, 240, 240)"

# Red, green, blue -- in that order
# (the original note says plotly uses these by default; not verified here)
rgb_def = ['rgb(228,26,28)', 'rgb(77,175,74)', 'rgb(55,126,184)']

# Two contrasting colours: a neutral grey and a highlight blue
contra_2_cols = ["rgb(150,150,150)", "rgb(55,126,184)"]

# Bar chart axis templates
# Template 1: bare x axis (no lines, grid or tick labels), grey 9pt tick font
bchart_xaxis_temp1 = {
    "zeroline": False,
    "showline": False,
    "showgrid": False,
    "showticklabels": False,
    "tickfont": {"size": 9, "color": "grey"},
}

bchart_yaxis_temp1 = {
    "tickfont": {"size": 9, "color": "grey"},
}

# Template 2: identical to template 1 except for the 10pt tick font
bchart_xaxis_temp2 = {
    "zeroline": False,
    "showline": False,
    "showgrid": False,
    "showticklabels": False,
    "tickfont": {"size": 10, "color": "grey"},
}

bchart_yaxis_temp2 = {
    "tickfont": {"size": 10, "color": "grey"},
}

# Heatmap axis template: hides lines, grid, tick labels and tick marks
heatmap_axis_temp1 = {
    "zeroline": False,
    "showline": False,
    "showgrid": False,
    "showticklabels": False,
    "ticks": '',
}


# Label columns: everything from the third column of df_train onwards.
comment_types = list(df_train.columns[2:])

# Entry [i, j] is the sum of label j over the rows where label i equals 1,
# i.e. how often the two labels are set on the same comment.
count_rows = []
for major_type in comment_types:
    flagged = df_train[df_train[major_type] == 1]
    count_rows.append([flagged[other_type].sum() for other_type in comment_types])
cmnt_count_matrix = np.array(count_rows)


cmnt_count_matrix

array([[12202,  1282,  6368,   353,  5865,  1056],
       [ 1282,  1282,  1216,    87,  1105,   252],
       [ 6368,  1216,  6782,   235,  4943,   830],
       [  353,    87,   235,   379,   240,    78],
       [ 5865,  1105,  4943,   240,  6292,   943],
       [ 1056,   252,   830,    78,   943,  1136]])


# Co-occurrence bar charts: one horizontal bar chart per comment type showing
# how often every label is set on the comments carrying that type.

# plotly.tools.make_subplots is deprecated in favour of plotly.subplots.
from plotly.subplots import make_subplots

# (row, col) position of each subplot in the 2 x 3 grid.
fig_coords = [(1, 1), (1, 2), (1, 3), (2, 1), (2, 2), (2, 3)]
# Not used by this cell; kept so the module-level names stay available.
axes_names = [("x1", "y1"), ("x2", "y2"), ("x3", "y3"), ("x4", "y4"), ("x5", "y5"), ("x6", "y6")]
axes_lo_names = [("xaxis1", "yaxis1"), ("xaxis2", "yaxis2"), ("xaxis3", "yaxis3"), ("xaxis4", "yaxis4"), ("xaxis5", "yaxis5"), ("xaxis6", "yaxis6")]

fig = make_subplots(
    rows=2,
    cols=3,
    horizontal_spacing=0.15,
    vertical_spacing=0.25,
    subplot_titles=tuple(comment_types[:len(fig_coords)])
)

# Row i of cmnt_count_matrix belongs to comment_types[i]; zip pairs each row
# with its grid position.
for (row, col), row_counts in zip(fig_coords, cmnt_count_matrix):
    inner_count = pd.Series(row_counts, index=comment_types).sort_values()
    trace = go.Bar(x=inner_count, y=list(inner_count.index), orientation='h')
    fig.add_trace(trace, row=row, col=col)

# Apply the same axis template to all six subplots.
axis_layout = {}
for axis_no in range(1, len(fig_coords) + 1):
    axis_layout["xaxis{}".format(axis_no)] = bchart_xaxis_temp2
    axis_layout["yaxis{}".format(axis_no)] = bchart_yaxis_temp2

fig["layout"].update(
    showlegend=False,
    title="<b>Co-occurrence of comment types</b>",
    # go.Margin is deprecated; go.layout.Margin is the supported type.
    margin=go.layout.Margin(
        l=100,
        r=100,
        t=100,
        b=25,
    ),
    autosize=False,
    width=900,
    height=500,
    **axis_layout
)
iplot(fig)

# As a heatmap
# Annotated heatmap of the same co-occurrence counts; both axes list the
# comment types.
fig = ff.create_annotated_heatmap(
    z=cmnt_count_matrix,
    x=comment_types,
    y=comment_types,
    colorscale='YlGnBu',
    zmin=1,
    zmax=cmnt_count_matrix.max()
)
# Put the x axis along the bottom edge of the plot.
fig["layout"]["xaxis"].update(side="bottom")
fig["layout"].update(
    title="<b>Co-occurrence of comment types</b>",
    xaxis=dict(
        title="Major comment category",
        tickfont=dict(
            color="grey"
        )
    ),
    yaxis=dict(
        title="Co-occurring comment category",
        tickfont=dict(
            color="grey"
        )
    ),
    # go.Margin is deprecated; go.layout.Margin is the supported type.
    margin=go.layout.Margin(
        l=150,
        r=150,
        t=150,
        b=75
    ),
    autosize=False,
    width=900,
    height=450,
)
iplot(fig)

/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/plotly/tools.py:465: DeprecationWarning:

plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead

/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/plotly/graph_objs/_deprecations.py:410: DeprecationWarning:

plotly.graph_objs.Margin is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.Margin


# Stop words: union of scikit-learn's built-in English list and NLTK's.
stop_words_new = list(sklearn.feature_extraction.text.ENGLISH_STOP_WORDS.union(stopwords.words("english")))

# Token counts (terms must occur in at least two comments), then TF-IDF weights.
count_vect = CountVectorizer(min_df=2, stop_words=stop_words_new)
train_counts = count_vect.fit_transform(df_train["comment_text"])
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_counts)

# Vocabulary terms in column order. Newer scikit-learn releases only provide
# get_feature_names_out(); older ones only get_feature_names(), so support both.
if hasattr(count_vect, "get_feature_names_out"):
    features_array = np.asarray(count_vect.get_feature_names_out())
else:
    features_array = np.array(count_vect.get_feature_names())


# Top words per comment type: one horizontal bar chart per label showing the
# terms with the highest mean TF-IDF weight among comments carrying that label.

# plotly.tools.make_subplots is deprecated in favour of plotly.subplots.
from plotly.subplots import make_subplots

# (row, col) position of each subplot in the 2 x 3 grid.
fig_coords = [(1, 1), (1, 2), (1, 3), (2, 1), (2, 2), (2, 3)]
# Not used by this cell; kept so the module-level names stay available.
axes_names = [("x1", "y1"), ("x2", "y2"), ("x3", "y3"), ("x4", "y4"), ("x5", "y5"), ("x6", "y6")]
axes_lo_names = [("xaxis1", "yaxis1"), ("xaxis2", "yaxis2"), ("xaxis3", "yaxis3"), ("xaxis4", "yaxis4"), ("xaxis5", "yaxis5"), ("xaxis6", "yaxis6")]

fig = make_subplots(
    rows=2,
    cols=3,
    horizontal_spacing=0.01,
    vertical_spacing=0.05,
    subplot_titles=tuple(comment_types[:len(fig_coords)])
)

num_top_words = 30

for cmnt_type, (row, col) in zip(comment_types, fig_coords):
    # Positional row numbers of the comments with this label. Using positions
    # rather than index labels keeps the lookup into train_tfidf correct even
    # if df_train does not have a default 0..n-1 index.
    instances_of_cmnt_type_ind = np.flatnonzero((df_train[cmnt_type] == 1).values)
    # Average directly on the sparse matrix: avoids materialising a dense
    # (comments x vocabulary) array just to take a column mean.
    mean_tfidf_cmnt_type = np.asarray(
        train_tfidf[instances_of_cmnt_type_ind].mean(axis=0)
    ).ravel()
    # Sort once; the same index order yields both the terms and their values.
    top_words_ind = mean_tfidf_cmnt_type.argsort()[::-1][:num_top_words]
    top_words_vals = mean_tfidf_cmnt_type[top_words_ind]
    top_words = features_array[top_words_ind]
    # Reverse so the largest value is drawn at the top of the horizontal chart.
    trace = go.Bar(
        x=top_words_vals[::-1],
        y=top_words[::-1],
        orientation='h',
        name=cmnt_type
    )
    fig.add_trace(trace, row=row, col=col)

# Apply the same axis template to all six subplots.
axis_layout = {}
for axis_no in range(1, len(fig_coords) + 1):
    axis_layout["xaxis{}".format(axis_no)] = bchart_xaxis_temp1
    axis_layout["yaxis{}".format(axis_no)] = bchart_yaxis_temp1

fig["layout"].update(
    showlegend=False,
    title="<b>Top {} words for each comment type</b>".format(num_top_words),
    # go.Margin is deprecated; go.layout.Margin is the supported type.
    margin=go.layout.Margin(
        l=75,
        r=75,
        t=100,
        b=100,
    ),
    autosize=False,
    width=900,
    height=900,
    **axis_layout
)
iplot(fig)

/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/plotly/tools.py:465: DeprecationWarning:

plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead

/Users/poudel/opt/miniconda3/envs/tf2/lib/python3.7/site-packages/plotly/graph_objs/_deprecations.py:410: DeprecationWarning:

plotly.graph_objs.Margin is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.Margin

	id	comment_text
0	8d603d50affa1126	"\nYes, aside, thank you for trying to answer ...
1	8fb3576937b9e0d0	March 2010 (UTC)\n\nThanks! and understood abo...
127654	95df37d4a69b607d	I am assuming that there is no point trying to...
127655	668ba87c1b6a3f31	"\nPlus, take a look! Have I made any outing ...

Description

Load the libraries

Parameters

Load the Data

Multilabel Visualization

Top 30 words per comment type