You are provided with a large number of Wikipedia comments which have been labeled by human raters for toxic behavior. The types of toxicity are:
toxic
severe_toxic
obscene
threat
insult
identity_hate
You must create a model which predicts a probability of each type of toxicity for each comment.
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (12,8)})
plt.style.use('ggplot') # better than sns styles.
matplotlib.rcParams['figure.figsize'] = 12,8
import os
import time
# random state
SEED=100
np.random.seed(SEED)
# Jupyter notebook settings for pandas
#pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
from pandas.api.types import CategoricalDtype
np.set_printoptions(precision=3)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 500)
import IPython
from IPython.display import display, HTML, Image, Markdown
print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
[('numpy', '1.17.4'), ('pandas', '0.25.3'), ('seaborn', '0.9.0'), ('matplotlib', '3.1.2')]
import re
import string # string.punctuation
import string
from string import digits
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
[nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package wordnet to /root/nltk_data... [nltk_data] Package wordnet is already up-to-date! [nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Package punkt is already up-to-date!
# Detect the author's local machine by probing for the private `bhishan`
# package. ENV_BHISHAN stays None when the package is not importable.
ENV_BHISHAN = None
try:
    import bhishan  # noqa: F401  (imported only as an environment marker)
except ImportError:
    # Only a failed import means "not on that machine"; any other error
    # should surface instead of being silently swallowed.
    pass
else:
    ENV_BHISHAN = True
    print("Environment: Bhishan's Laptop")
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
# load google drive
# from google.colab import drive
# drive.mount('/content/drive')
# dat_dir = 'drive/My Drive/Colab Notebooks/data/'
# sys.path.append(dat_dir)
# pip install
#!pip install pyldavis
#!pip install hyperopt
#!pip install catboost
!pip install lime
# upgrade tqdm and restart to use df.progess_apply()
# !pip install -U tqdm
# print
print('Environment: Google Colaboratory.')
Collecting lime Downloading https://files.pythonhosted.org/packages/e5/72/4be533df5151fcb48942515e95e88281ec439396c48d67d3ae41f27586f0/lime-0.1.1.36.tar.gz (275kB) |████████████████████████████████| 276kB 8.8MB/s Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from lime) (1.17.4) Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from lime) (1.3.3) Requirement already satisfied: scikit-learn>=0.18 in /usr/local/lib/python3.6/dist-packages (from lime) (0.21.3) Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from lime) (3.1.2) Requirement already satisfied: scikit-image>=0.12 in /usr/local/lib/python3.6/dist-packages (from lime) (0.15.0) Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn>=0.18->lime) (0.14.0) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->lime) (2.4.5) Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->lime) (0.10.0) Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->lime) (2.6.1) Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->lime) (1.1.0) Requirement already satisfied: imageio>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from scikit-image>=0.12->lime) (2.4.1) Requirement already satisfied: PyWavelets>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image>=0.12->lime) (1.1.1) Requirement already satisfied: networkx>=2.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image>=0.12->lime) (2.4) Requirement already satisfied: pillow>=4.3.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image>=0.12->lime) (4.3.0) Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from 
cycler>=0.10->matplotlib->lime) (1.12.0) Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from kiwisolver>=1.0.1->matplotlib->lime) (42.0.1) Requirement already satisfied: decorator>=4.3.0 in /usr/local/lib/python3.6/dist-packages (from networkx>=2.0->scikit-image>=0.12->lime) (4.4.1) Requirement already satisfied: olefile in /usr/local/lib/python3.6/dist-packages (from pillow>=4.3.0->scikit-image>=0.12->lime) (0.46) Building wheels for collected packages: lime Building wheel for lime (setup.py) ... done Created wheel for lime: filename=lime-0.1.1.36-cp36-none-any.whl size=284191 sha256=4c30f9573311d20acf614a69ea6ba74996e1c14d200ff0e4baf405311f8f586b Stored in directory: /root/.cache/pip/wheels/a9/2f/25/4b2127822af5761dab9a27be52e175105772aebbcbc484fb95 Successfully built lime Installing collected packages: lime Successfully installed lime-0.1.1.36 Environment: Google Colaboratory.
import scipy
# pipeline
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, ClassifierMixin
# text features
from sklearn.feature_extraction.text import TfidfVectorizer
# classifiers
from sklearn.linear_model import LogisticRegression
# model selection
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
def show_method_attributes(obj, ncols=7, start=None, inside=None):
    """Show the public attribute names of an object as a table.

    Names beginning with an underscore are skipped, as are a few common
    module aliases (os, np, pd, sys, time, psycopg2).

    Parameters
    ----------
    obj : object
        Object whose ``dir()`` listing is displayed.
    ncols : int
        Number of columns the names are split across.
    start : str, list or tuple, optional
        Keep only names starting with this prefix (or with any of these
        prefixes).
    inside : str, list or tuple, optional
        Keep only names containing this substring (or any of these
        substrings).

    Returns
    -------
    pandas.DataFrame
        The names laid out column-wise, padded with empty strings.

    Example:
    ========
    show_method_attributes(list)
    """
    print(f'Object Type: {type(obj)}\n')

    excluded = {'os', 'np', 'pd', 'sys', 'time', 'psycopg2'}
    lst = [elem for elem in dir(obj)
           if elem[0] != '_' and elem not in excluded]

    prefixes = (start,) if isinstance(start, str) else start
    if isinstance(prefixes, (tuple, list)):
        # str.startswith takes a tuple, so each name is kept at most once
        # even when it matches several prefixes.
        prefixes = tuple(prefixes)
        lst = [elem for elem in lst if elem.startswith(prefixes)]

    substrings = (inside,) if isinstance(inside, str) else inside
    if isinstance(substrings, (tuple, list)):
        # any() keeps each name once, however many substrings it contains.
        lst = [elem for elem in lst
               if any(sub in elem for sub in substrings)]

    return pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
# Local copies (author's machine) and remote copies (GitHub) of the data.
ifile1_train = '../data/raw/train.csv'
ifile2_train = 'https://github.com/bhishanpdl/Project_Toxic_Comments/blob/master/data/raw/train.csv?raw=true'
ifile1_test = '../data/raw/test.csv'
ifile2_test = 'https://github.com/bhishanpdl/Project_Toxic_Comments/blob/master/data/raw/test.csv?raw=true'

# Read each file exactly once. The local files are used only on the author's
# machine; Colab (which takes precedence) and any other environment fall back
# to the remote URLs so that df_raw_train / df_raw_test are always defined.
if ENV_BHISHAN and not ENV_COLAB:
    df_raw_train = pd.read_csv(ifile1_train)
    df_raw_test = pd.read_csv(ifile1_test)
else:
    df_raw_train = pd.read_csv(ifile2_train)
    df_raw_test = pd.read_csv(ifile2_test)
df_train = df_raw_train.copy()
df_test = df_raw_test.copy()
print(df_train.shape)
df_train.head()
(159571, 8)
id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate | |
---|---|---|---|---|---|---|---|---|
0 | 0000997932d777bf | Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 000103f0d9cfb60f | D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC) | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 000113f07ec002fd | Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info. | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0001b41b1c6bb37e | "\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of ""types of accidents"" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess t... | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0001d958c54c6e35 | You, sir, are my hero. Any chance you remember what page that's on? | 0 | 0 | 0 | 0 | 0 | 0 |
print(df_test.shape)
df_test.head()
(153164, 2)
id | comment_text | |
---|---|---|
0 | 00001cee341fdb12 | Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time., |
1 | 0000247867823ef7 | == From RfC == \n\n The title is fine as it is, IMO. |
2 | 00013b17ad220c46 | " \n\n == Sources == \n\n * Zawe Ashton on Lapland — / " |
3 | 00017563c3f7919a | :If you have a look back at the source, the information I updated was the correct form. I can only guess the source hadn't updated. I shall update the information once again but thank you for your message. |
4 | 00017695ad8997eb | I don't anonymously edit articles at all. |
df_train['clean'] = df_train.loc[:,'toxic':'identity_hate'].sum(axis=1) == 0
print(df_train['clean'].sum())
df_train.head(2)
143346
id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate | clean | |
---|---|---|---|---|---|---|---|---|---|
0 | 0000997932d777bf | Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27 | 0 | 0 | 0 | 0 | 0 | 0 | True |
1 | 000103f0d9cfb60f | D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC) | 0 | 0 | 0 | 0 | 0 | 0 | True |
ser_sum = df_train.iloc[:,2:].sum(axis=0)
ser_sum
toxic 15294.0 severe_toxic 1595.0 obscene 8449.0 threat 478.0 insult 7877.0 identity_hate 1405.0 clean 143346.0 dtype: float64
# Bar chart of the per-label counts in ascending order, with each bar
# annotated by its value.
bar_colors = sns.color_palette('husl', len(ser_sum))
ax = ser_sum.sort_values().plot.bar(color=bar_colors)
for bar in ax.patches:
    bar_height = bar.get_height()
    # Place the label at the bar's left edge, level with its top.
    ax.text(bar.get_x(), bar_height, f'{bar_height:,.0f}', fontsize=14, color='blue')
# Work on a small subset of the training data and model only the 'toxic' label.
# NOTE: `.loc` slicing is label-based and end-inclusive, so `:5000` selects
# labels 0 through 5000 (5001 rows with the default integer index produced
# by pd.read_csv).
Xtrain = df_train.loc[:5000, 'comment_text'].to_numpy()
ytrain = df_train.loc[:5000, 'toxic'].to_numpy()
ytrain[:5]
array([0, 0, 0, 0, 0])
# Shared cross-validation settings; the later experiments in this notebook
# reuse these module-level names.
scoring='roc_auc'
cv=3
n_jobs=-1
# Vocabulary size cap passed to the TF-IDF vectorizer.
max_features = 2500
# Baseline model: TF-IDF features fed straight into logistic regression.
tfidf = TfidfVectorizer(max_features=max_features)
lr = LogisticRegression(solver='lbfgs')
p = Pipeline([
('tfidf', tfidf),
('lr', lr)
])
# Mean ROC-AUC over the `cv` folds, rounded to 4 decimals.
cvs = cross_val_score(estimator=p, X=Xtrain, y=ytrain, scoring=scoring, cv=cv, n_jobs=n_jobs)
cvs.mean().round(4)
0.9227
import scipy
class NBFeaturer(BaseEstimator):
    """Scale features by a smoothed per-feature class log-ratio.

    ``fit`` computes, for every feature column, the log of the ratio between
    the smoothed feature average over rows labelled 1 and over rows labelled
    0. ``transform`` multiplies its input element-wise by that ratio.

    The input must support boolean row indexing, ``.sum(0)`` and
    ``.multiply`` (in this notebook it is the sparse output of the TF-IDF
    step).
    """

    def __init__(self, alpha):
        # Additive smoothing term applied to numerator and denominator in `pr`.
        self.alpha = alpha

    def preprocess_x(self, x, r):
        """Return ``x`` multiplied element-wise by the ratio ``r``."""
        scaled = x.multiply(r)
        return scaled

    def pr(self, x, y_i, y):
        """Return the smoothed per-column sum of the rows where ``y == y_i``,
        divided by the smoothed number of such rows."""
        in_class = (y == y_i)
        column_totals = x[in_class].sum(0)
        numerator = column_totals + self.alpha
        denominator = in_class.sum() + self.alpha
        return numerator / denominator

    def fit(self, x, y=None):
        """Learn the log-ratio between class 1 and class 0; return ``self``."""
        positive = self.pr(x, 1, y)
        negative = self.pr(x, 0, y)
        # Stored sparse so that `multiply` in transform works on sparse input.
        self._r = scipy.sparse.csr_matrix(np.log(positive / negative))
        return self

    def transform(self, x):
        """Apply the ratio learned in ``fit`` to ``x``."""
        return self.preprocess_x(x, self._r)
# Second experiment: insert the NBFeaturer scaling step between TF-IDF and
# the classifier.
tfidf = TfidfVectorizer(max_features=max_features)
# NOTE(review): no solver is specified here; the FutureWarning recorded in
# this notebook's output says the default changes in scikit-learn 0.22, so
# results may differ between versions — consider pinning it.
lr = LogisticRegression()
# alpha = 1
nb = NBFeaturer(1)
p = Pipeline([
('tfidf', tfidf),
('nb', nb),
('lr', lr)
])
# Mean ROC-AUC over the `cv` folds, rounded to 4 decimals.
cvs = cross_val_score(estimator=p, X=Xtrain, y=ytrain, scoring=scoring, cv=cv, n_jobs=n_jobs)
cvs.mean().round(4)
0.9246
class Lemmatizer(BaseEstimator):
    """Lower-case and lemmatize every whitespace-separated token of each
    document, using NLTK's ``WordNetLemmatizer``."""

    def __init__(self):
        self.l = WordNetLemmatizer()

    def fit(self, x, y=None):
        """Nothing to learn; return ``self`` unchanged."""
        return self

    def transform(self, x):
        """Return a NumPy array with one lemmatized string per input document."""
        lemmatize = self.l.lemmatize
        documents = [
            ' '.join(lemmatize(token.lower()) for token in document.split())
            for document in x
        ]
        return np.array(documents)
# Third experiment: lemmatize the raw text before vectorizing it.
lm = Lemmatizer()
tfidf = TfidfVectorizer(max_features=max_features)
# NOTE(review): solver left at the library default — see the FutureWarning in
# this notebook's recorded output; consider pinning it.
lr = LogisticRegression()
# alpha = 1
nb = NBFeaturer(1)
p = Pipeline([
('lm', lm),
('tfidf', tfidf),
('nb', nb),
('lr', lr)
])
# Mean ROC-AUC over the `cv` folds, rounded to 4 decimals.
cvs = cross_val_score(estimator=p, X=Xtrain, y=ytrain, scoring=scoring, cv=cv, n_jobs=n_jobs)
cvs.mean().round(4)
0.9258
# Fourth experiment: combine word-level and character-level TF-IDF features.
max_features = 2500
lm = Lemmatizer()
tfidf_w = TfidfVectorizer(max_features=max_features, analyzer='word')
tfidf_c = TfidfVectorizer(max_features=max_features, analyzer='char')
# NOTE(review): solver left at the library default — see the FutureWarning in
# this notebook's recorded output; consider pinning it.
lr = LogisticRegression()
# alpha = 1
nb = NBFeaturer(1)
# The step names ('wc_tfidfs', 'tfidf_w', 'tfidf_c', 'lr') are the prefixes
# used by the parameter grids of the searches that follow.
p = Pipeline([
('lm', lm),
('wc_tfidfs',
FeatureUnion([
('tfidf_w', tfidf_w),
('tfidf_c', tfidf_c),
])
),
('nb', nb),
('lr', lr)
])
# Mean ROC-AUC over the `cv` folds, rounded to 4 decimals.
cvs = cross_val_score(estimator=p, X=Xtrain, y=ytrain, scoring=scoring, cv=cv, n_jobs=n_jobs)
cvs.mean().round(4)
0.9369
# Exhaustive search over the vocabulary sizes of both vectorizers and the
# regularization strength of the classifier.
# The character vectorizer is tuned through `max_features`: `stop_words`
# takes 'english', a word list or None (never an integer) and only applies
# to analyzer='word'.
param_grid = [{
    'wc_tfidfs__tfidf_w__max_features': [2500],
    'wc_tfidfs__tfidf_c__max_features': [2500, 5000],
    'lr__C': [0.01, 0.1, 1.0, 2.0, 3.],
}]
grid = GridSearchCV(p, cv=cv, n_jobs=n_jobs, param_grid=param_grid, scoring=scoring,
                    return_train_score=False, verbose=0)
grid.fit(Xtrain, ytrain)
grid.best_params_
/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning. FutureWarning)
{'lr__C': 0.1, 'wc_tfidfs__tfidf_c__stop_words': 2500, 'wc_tfidfs__tfidf_w__max_features': 2500}
# Randomized search (5 sampled candidates) over a wider range of vocabulary
# sizes and regularization strengths.
# The character vectorizer is tuned through `max_features`: `stop_words`
# takes 'english', a word list or None (never an integer) and only applies
# to analyzer='word'.
param_grid = [{
    'wc_tfidfs__tfidf_w__max_features': [2500, 5000, 10000],
    'wc_tfidfs__tfidf_c__max_features': [2500, 5000, 10000],
    'lr__C': [1., 3., 4.],
}]
grid = RandomizedSearchCV(p, cv=cv, n_jobs=n_jobs, param_distributions=param_grid[0], n_iter=5,
                          scoring=scoring, return_train_score=False, verbose=1)
grid.fit(Xtrain, ytrain)
grid.best_params_
Fitting 3 folds for each of 5 candidates, totalling 15 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers. [Parallel(n_jobs=-1)]: Done 15 out of 15 | elapsed: 32.8s finished /usr/local/lib/python3.6/dist-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning. FutureWarning)
{'lr__C': 1.0, 'wc_tfidfs__tfidf_c__stop_words': 5000, 'wc_tfidfs__tfidf_w__max_features': 10000}
# Re-score the word + character TF-IDF pipeline with explicit settings.
max_features = 2500
lm = Lemmatizer()
tfidf_w = TfidfVectorizer(max_features=max_features, analyzer='word')
# No `stop_words` here: that option takes 'english', a word list or None
# (never an integer) and only applies to analyzer='word'.
tfidf_c = TfidfVectorizer(max_features=max_features, analyzer='char')
lr = LogisticRegression(C=1.0)
nb = NBFeaturer(1)  # alpha = 1
p = Pipeline([
    ('lm', lm),
    ('wc_tfidfs',
        FeatureUnion([
            ('tfidf_w', tfidf_w),
            ('tfidf_c', tfidf_c),
        ])
    ),
    ('nb', nb),
    ('lr', lr)
])
# Mean ROC-AUC over the `cv` folds, rounded to 4 decimals.
cvs = cross_val_score(estimator=p, X=Xtrain, y=ytrain, scoring=scoring, cv=cv, n_jobs=n_jobs)
cvs.mean().round(4)
0.9369
df_train.shape, df_test.shape
((159571, 9), (153164, 2))
# Held-out evaluation slice taken from the labelled training file.
# `.loc` slices are end-inclusive and the training subset already covers
# labels 0..5000, so start at 5001 to keep the two sets disjoint.
Xtest = df_train.loc[5001:10_000, 'comment_text'].to_numpy()
ytest = df_train.loc[5001:10_000, 'toxic'].to_numpy()
# Final model: word + character TF-IDF, NB scaling, logistic regression,
# fitted on the training subset.
max_features = 2500
lm = Lemmatizer()
tfidf_w = TfidfVectorizer(max_features=max_features, analyzer='word')
# No `stop_words` here: that option takes 'english', a word list or None
# (never an integer) and only applies to analyzer='word'.
tfidf_c = TfidfVectorizer(max_features=max_features, analyzer='char')
lr = LogisticRegression(C=3.0, solver='lbfgs')
nb = NBFeaturer(1)  # alpha = 1
p = Pipeline([
    ('lm', lm),
    ('wc_tfidfs',
        FeatureUnion([
            ('tfidf_w', tfidf_w),
            ('tfidf_c', tfidf_c),
        ])
    ),
    ('nb', nb),
    ('lr', lr)
])
p.fit(Xtrain, ytrain);
# show_method_attributes(p)
ypreds = p.predict(Xtest)
ytest[0], ypreds[0]
(1, 1)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# (label, scoring function) pairs, reported in this order for the held-out
# predictions.
metric_table = (
    ('Accuracy: ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F1-score: ', f1_score),
)
for metric_label, metric_fn in metric_table:
    print(metric_label, metric_fn(ytest, ypreds))
Accuracy: 0.9516096780643871 Precision: 0.9154411764705882 Recall: 0.532051282051282 F1-score: 0.672972972972973
from sklearn.metrics import classification_report
print(classification_report(ytest, ypreds))
precision recall f1-score support 0 0.95 0.99 0.97 4533 1 0.92 0.53 0.67 468 accuracy 0.95 5001 macro avg 0.93 0.76 0.82 5001 weighted avg 0.95 0.95 0.95 5001
from sklearn.metrics import confusion_matrix
confusion_matrix(ytest, ypreds)
array([[4510, 23], [ 219, 249]])
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
def plot_roc_curve(clf_names, all_roc_auc_scores, y, ofile=None,
                   all_cross_val_preds=None):
    """Plot Receiver Operating Characteristic (ROC) curves.

    One curve is drawn per entry of ``all_cross_val_preds``; the legend shows
    each classifier name together with its ROC-AUC score.

    Parameters
    ----------
    clf_names : sequence of str
        Display names of the classifiers, in the same order as the predictions.
    all_roc_auc_scores : sequence of float
        ROC-AUC score of each classifier, shown in the legend.
    y : array-like
        True binary labels.
    ofile : str, optional
        If given, the figure is also saved to this path (dpi=300).
    all_cross_val_preds : sequence of array-like, optional
        Scores or predictions of each classifier, passed to ``roc_curve``.
        When omitted, the module-level variable ``all_cross_val_preds`` is
        used, for compatibility with callers that relied on that global.

    Raises
    ------
    ValueError
        If no predictions are passed and no module-level
        ``all_cross_val_preds`` exists.

    NOTE:
    In cross-validation we do not need Xtrain and Xtest,
    one split of X is taken as Xtest and remaining splits are taken as Xtrain.

    Example:
    ---------
    clf_names = ["Logistic Regression", "Support Vector Classifier",
                 "KNN", "Decision Tree Classifier", "Random Forest Classifier"]
    cvpred_lr = cross_val_predict(clf_best_lr, X, y, cv=5,
                                  method="decision_function")
    cvpred_svc = cross_val_predict(clf_best_svc, X, y, cv=5,
                                   method="decision_function")
    cvpred_knn = cross_val_predict(clf_best_knn, X, y, cv=5)
    cvpred_dtc = cross_val_predict(clf_best_dtc, X, y, cv=5)
    cvpred_rfc = cross_val_predict(clf_best_rfc, X, y, cv=5)
    all_cross_val_preds = [cvpred_lr, cvpred_svc,
                           cvpred_knn, cvpred_dtc, cvpred_rfc]
    all_roc_auc_scores = [roc_auc_score(y, cvpred)
                          for cvpred in all_cross_val_preds]
    plot_roc_curve(clf_names, all_roc_auc_scores, y,
                   all_cross_val_preds=all_cross_val_preds)
    """
    from sklearn.metrics import roc_curve

    if all_cross_val_preds is None:
        # Backward compatibility: earlier callers defined the predictions as
        # a module-level variable instead of passing them in.
        all_cross_val_preds = globals().get('all_cross_val_preds')
        if all_cross_val_preds is None:
            raise ValueError(
                'all_cross_val_preds must be passed to plot_roc_curve '
                '(or defined at module level).')

    labels = ['{}: {:.4f}'.format(clf, score)
              for clf, score in zip(clf_names, all_roc_auc_scores)]

    # Each entry is the (fpr, tpr, thresholds) triple returned by roc_curve.
    fpr_tpr_threshold = [roc_curve(y, pred_x)
                         for pred_x in all_cross_val_preds]

    plt.figure(figsize=(12, 8))
    for i, (fpr, tpr, _thresholds) in enumerate(fpr_tpr_threshold):
        plt.plot(fpr, tpr, label=labels[i])

    # Diagonal reference line (TPR equal to FPR).
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([-0.01, 1, 0, 1])
    plt.xlabel('False Positive Rate (FPR)', fontsize=16)
    plt.ylabel('True Positive Rate (TPR)', fontsize=16)
    plt.annotate('Minimum ROC Score of 50%',
                 xy=(0.5, 0.5), xytext=(0.6, 0.3),
                 arrowprops=dict(facecolor='#6E726D', shrink=0.05),
                 )
    plt.title('ROC Curves', fontsize=20)
    plt.legend()
    plt.tight_layout()

    # save the figure
    if ofile:
        plt.savefig(ofile, dpi=300)

    # show the figure
    plt.show()
    plt.close()
# Plot the ROC curve of the fitted pipeline from 5-fold cross-validated
# decision-function scores on the training subset.
clf_names = ["Logisitic Regression"]
cvpred = cross_val_predict(p, Xtrain, ytrain, cv=5,
method="decision_function")
# `all_cross_val_preds` must exist at module level: plot_roc_curve is called
# without it below and reads it from there.
all_cross_val_preds = [cvpred]
all_roc_auc_scores = [roc_auc_score(ytrain, cvpred)
for cvpred in all_cross_val_preds ]
plot_roc_curve(clf_names, all_roc_auc_scores,ytrain)
from sklearn.pipeline import make_pipeline
from lime import lime_text
from lime.lime_text import LimeTextExplainer
# Refit the pipeline on the training subset before explaining a prediction.
p.fit(Xtrain,ytrain);
# Display names for the two classes; presumably index 0 corresponds to
# label 0 and index 1 to label 1 — confirm against the LIME documentation.
class_names = ['Non-Toxic', 'Toxic']
explainer = LimeTextExplainer(class_names=class_names)
# Explain the pipeline's prediction for the first training comment.
idx = 0
exp = explainer.explain_instance(Xtrain[idx], p.predict_proba, top_labels=2)
exp.show_in_notebook(text=Xtrain[idx], labels=(0,1))
/usr/local/lib/python3.6/dist-packages/lime/lime_text.py:116: FutureWarning: split() requires a non-empty pattern match. self.as_list = [s for s in splitter.split(self.raw) if s]
df_train.head(1)
id | comment_text | toxic | severe_toxic | obscene | threat | insult | identity_hate | clean | |
---|---|---|---|---|---|---|---|---|---|
0 | 0000997932d777bf | Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27 | 0 | 0 | 0 | 0 | 0 | 0 | True |