import sys
sys.path.append('/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages')


import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import mlxtend
import plotly_express as px

pd.options.plotting.backend = "plotly"
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',1000)

import time,os,json,sys
time_start_notebook = time.time()
home = os.path.expanduser('~')
SEED=100

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

print([(x.__name__,x.__version__) for x in [np,pd,sns,sklearn,mlxtend,px]])

#=========Visualization
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)


#========= NLP
import re
import string
import nltk
import spacy
import textblob
import gensim
import texthero
from urllib.parse import urlparse
from nltk.corpus import stopwords
import texthero as hero
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer


print([(x.__name__,x.__version__) for x in [nltk,spacy,textblob,gensim]])

#=======OTHERS
import ast
import scipy
import multiprocessing as mp
import gc
import operator
from collections import defaultdict

#=====Warnigns
import warnings
warnings.simplefilter("ignore")

[('numpy', '1.17.5'), ('pandas', '1.0.5'), ('seaborn', '0.10.1'), ('sklearn', '0.23.1'), ('mlxtend', '0.17.0'), ('plotly_express', '0.4.1')]

[('nltk', '3.4.4'), ('spacy', '2.2.3'), ('textblob', '0.15.3'), ('gensim', '3.8.3')]


df_combined = pd.read_csv('../data/processed/df_combined_clean.csv')

# Variables
target = 'label'
maincol = 'tweet'
mc = maincol + '_clean'
mcl = maincol + '_lst_clean'
mce = mc + '_emoji'
mcle = mcl + '_emoji'

# we need to make list as list type
df_combined[mcl] = df_combined[mcl].apply(ast.literal_eval)
df_combined[mcle] = df_combined[mcle].apply(ast.literal_eval)

df_train = df_combined[~df_combined[target].isnull()]
df_test = df_combined[df_combined[target].isnull()]

print(f"shape df_train: {df_train.shape}")
print(f"shape df_test: {df_test.shape}")

df_train.head(2).append(df_train.tail(2))

shape df_train: (7920, 24)
shape df_test: (1953, 24)


from sklearn.model_selection import train_test_split

target = target

# df_train has labels but df_test does not have it.
# break df_train into train and valid.

df_Xtrain_orig = df_train
ser_ytrain_orig = df_train[target]
df_Xtest = df_test
ser_ytest = None # it does not exist

df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
    df_train, df_train[target],
    test_size=0.2, random_state=SEED, stratify=df_train[target])

ytrain_orig = ser_ytrain_orig.to_numpy().ravel()
ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()




print(f"\ndf_Xtrain_orig : {df_Xtrain_orig.shape}")
print(f"ser_ytrain_orig: {ser_ytrain_orig.shape}")

print(f"\ndf_Xtrain      : {df_Xtrain.shape}")
print(f"ser_ytrain     : {ser_ytrain.shape}")

print(f"\ndf_Xvalid      : {df_Xvalid.shape}")
print(f"ser_yvalid     : {ser_yvalid.shape}")


df_Xtrain_orig.head(2)

df_Xtrain_orig : (7920, 24)
ser_ytrain_orig: (7920,)

df_Xtrain      : (6336, 24)
ser_ytrain     : (6336,)

df_Xvalid      : (1584, 24)
ser_yvalid     : (1584,)


features = ['total_length',
    'num_words', 'num_sent',
    'num_unique_words', 'num_words_title',
    'num_uppercase','num_exclamation_marks',
    'num_question_marks','num_punctuation',
    'num_symbols', 'num_digits',
    'avg_word_len', 'avg_uppercase','avg_unique']


from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict


df_eval = pd.DataFrame({
    'Text Model': [],
    'Params': [],
    'Model': [],
    'Description': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1 Weighted': [],
    'Time Taken': [],
    'Time Taken Sec': [],
})


import time
from sklearn import metrics

def get_model_evaluation(Xtr,ytr,Xvd,yvd,
            text_model_name,params,
            model_name,desc,model,
            df_eval=df_eval,
            scaling=False,
            sort_col = 'F1 Weighted',
            N=None,disp=True,only=None):
    
    if scaling:
        scaler = MinMaxScaler(feature_range=(0,1))
        scaler.fit(Xtr)
        Xtr = scaler.transform(Xtr)
        Xvd = scaler.transform(Xvd)

    time_start = time.time()
    model.fit(Xtr,ytr)

    skf = StratifiedKFold(n_splits=3, 
                          random_state=SEED,
                          shuffle=True)
    vd_preds = cross_val_predict(model,Xvd,yvd,cv=skf,n_jobs=-1)

    acc = metrics.accuracy_score(yvd,vd_preds)
    pre = metrics.precision_score(yvd,vd_preds)
    rec = metrics.recall_score(yvd,vd_preds)
    f1 = metrics.f1_score(yvd,vd_preds,average='weighted')
    time_taken_sec = time.time() - time_start
    m,s = divmod(time_taken_sec,60)
    time_taken = f"{s:.2f} sec" if not m else f"{m} min {s:.2f} sec"

    row = [text_model_name, params, model_name,desc]
    row = row + [acc, pre, rec, f1, time_taken, time_taken_sec]

    df_eval.loc[len(df_eval)] = row
    df_eval = df_eval.drop_duplicates(subset=['Text Model', 'Params', 'Model', 'Description'])
    df_eval = df_eval.sort_values(sort_col,ascending=False)
    df_eval = df_eval.reset_index(drop=False)
    
    # reorder columns
    df_eval = df_eval[['Text Model', 'Params', 'Model',
                       'Description', 'F1 Weighted',
                       'Time Taken','Accuracy',
                       'Precision','Recall',
                        'Time Taken Sec']]
    
    if only:
        df_eval2 = df_eval[df_eval['Text Model']==only]
        df_eval2 = df_eval.nlargest(1,'F1 Weighted').append(df_eval2)

    if disp:
        if only:
            display(df_eval2.head(N).style.background_gradient(
            subset=[sort_col]))
        else:
            display(df_eval.head(N).style.background_gradient(
            subset=[sort_col]))
        
    return df_eval


from sklearn.feature_extraction.text import CountVectorizer


vec_cv = CountVectorizer()

csr_Xtrain_orig = vec_cv.fit_transform(df_Xtrain_orig[mce])
csr_Xtrain = vec_cv.fit_transform(df_Xtrain[mce])
csr_Xvalid = vec_cv.fit_transform(df_Xvalid[mce])

# current variables
Xtr = csr_Xtrain
Xvd = csr_Xvalid
ytr = ytrain
yvd = yvalid


model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)

text_model_name = 'BoW'
params = ''
model_name = 'logregcv'
desc = 'cv=2'

df_eval = get_model_evaluation(Xtr,ytr,Xvd,yvd,
            text_model_name,params,
            model_name,desc,model,
            df_eval=df_eval)


model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)

text_model_name = 'BoW'
params = 'Scaling' # needs dense array
model_name = 'logregcv'
desc = 'cv=2'

df_eval = get_model_evaluation(Xtr.A,ytr,Xvd.A,yvd,
            text_model_name,params,
            model_name,desc,model,
            scaling=True,
            df_eval=df_eval)


model = LinearSVC(random_state=SEED, max_iter=200)
text_model_name = 'BoW'
params = 'Scaling' # needs dense array
model_name = 'linear svc'
desc = 'max_iter=200'

df_eval = get_model_evaluation(Xtr.A,ytr,Xvd.A,yvd,
            text_model_name,params,
            model_name,desc,model,
            scaling=True,
            df_eval=df_eval)


Xtr2 = np.c_[Xtr.A, df_Xtrain[features].to_numpy()]
Xvd2 = np.c_[Xvd.A, df_Xvalid[features].to_numpy()]


model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)

text_model_name = 'BoW'
params = 'Extra+Scaling' # needs dense array
model_name = 'logregcv'
desc = 'cv=2'

df_eval = get_model_evaluation(Xtr2,ytr,Xvd2,yvd,
            text_model_name,params,
            model_name,desc,model,
            scaling=True,
            df_eval=df_eval)


import functools


@functools.lru_cache()
def get_word_vecs(col):
    nlp = spacy.load('en_core_web_lg')

    f = lambda x: nlp(x).vector.reshape(1, -1)

    Xtr = df_Xtrain[col].apply(f).to_numpy()
    Xvd = df_Xvalid[col].apply(f).to_numpy()
    
    Xtr = np.concatenate(Xtr, axis=0)
    Xvd = np.concatenate(Xvd, axis=0)

    return [Xtr,Xvd]


%%time
Xtr,Xvd = get_word_vecs(mce)

# Wall time: 1min 3s

CPU times: user 54 s, sys: 1.17 s, total: 55.2 s
Wall time: 55.4 s


model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)

text_model_name = 'Word2Vec'
params = ''
model_name = 'logregcv'
desc = 'cv=2'

df_eval = get_model_evaluation(Xtr,ytr,Xvd,yvd,
            text_model_name,params,
            model_name,desc,model,
            scaling=False,
            df_eval=df_eval)


Xtr2 = np.c_[Xtr, df_Xtrain[features].to_numpy()]
Xvd2 = np.c_[Xvd, df_Xvalid[features].to_numpy()]

model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)

text_model_name = 'Word2Vec'
params = 'Extra'
model_name = 'logregcv'
desc = 'cv=2'

df_eval = get_model_evaluation(Xtr2,ytr,Xvd2,yvd,
            text_model_name,params,
            model_name,desc,model,
            scaling=False,
            df_eval=df_eval)


Xtr2 = np.c_[Xtr, df_Xtrain[features].to_numpy()]
Xvd2 = np.c_[Xvd, df_Xvalid[features].to_numpy()]

model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)

text_model_name = 'Word2Vec'
params = 'Extra+Scaling'
model_name = 'logregcv'
desc = 'cv=2'

df_eval = get_model_evaluation(Xtr2,ytr,Xvd2,yvd,
            text_model_name,params,
            model_name,desc,model,
            scaling=True,
            df_eval=df_eval)


from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.csr import csr_matrix


vec_tfidf = TfidfVectorizer()

# vec_tfidf.fit(df_Xtrain[mce])
# csr_Xtrain = vec_tfidf.transform(df_Xtrain[mce])
# csr_Xvalid = vec_tfidf.transform(df_Xvalid[mce])

csr_Xtrain = vec_tfidf.fit_transform(df_Xtrain[mce])
csr_Xvalid = vec_tfidf.fit_transform(df_Xvalid[mce])

Xtr = csr_Xtrain
Xvd = csr_Xvalid

csr_Xtrain_extra = csr_matrix(df_Xtrain[features].to_numpy())
csr_Xvalid_extra = csr_matrix(df_Xvalid[features].to_numpy())

Xtr2 = scipy.sparse.hstack([csr_Xtrain, csr_Xtrain_extra])
Xvd2 = scipy.sparse.hstack([csr_Xvalid, csr_Xvalid_extra])


# scipy.sparse.save_npz('../data/processed/tfidf_default_Xtr.npz', csr_Xtrain)
# scipy.sparse.save_npz('../data/processed/tfidf_default_Xvd.npz', csr_Xvalid)

# scipy.sparse.save_npz('../data/processed/tfidf_default_Xtr2.npz', Xtr2)
# scipy.sparse.save_npz('../data/processed/tfidf_default_Xvd2.npz', Xvd2)


model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)

text_model_name = 'tfidf'
params = ''
model_name = 'logregcv'
desc = 'cv=2'

df_eval = get_model_evaluation(Xtr,ytr,Xvd,yvd,
            text_model_name,params,
            model_name,desc,model,
            scaling=False,
            df_eval=df_eval)


model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)

text_model_name = 'tfidf'
params = 'Extra'
model_name = 'logregcv'
desc = 'cv=2'

df_eval = get_model_evaluation(Xtr2,ytr,Xvd2,yvd,
            text_model_name,params,
            model_name,desc,model,
            scaling=False,
            df_eval=df_eval)


model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)

text_model_name = 'tfidf'
params = 'Extra+Scaling'
model_name = 'logregcv'
desc = 'cv=2'

df_eval = get_model_evaluation(Xtr2.A,ytr,Xvd2.A,yvd,
            text_model_name,params,
            model_name,desc,model,
            scaling=True,
            df_eval=df_eval)


from sklearn.svm import LinearSVC


model = LinearSVC(random_state=SEED, max_iter=200)

text_model_name = 'tfidf'
params = ''
model_name = 'svc'
desc = 'max_iter=200'

df_eval = get_model_evaluation(Xtr,ytr,Xvd,yvd,
            text_model_name,params,
            model_name,desc,model,
            scaling=False,
            df_eval=df_eval,only='tfidf')

	index	id	tweet	tweet_lst_clean	tweet_clean	hashtags_lst	hashtags	total_length	num_words	num_sent	num_unique_words	num_words_title	num_uppercase	num_exclamation_marks	num_punctuation	avg_word_len	avg_uppercase	avg_unique	tweet_lst_clean_emoji	tweet_clean_emoji
0	0	1	#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone	[fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone]	fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone	['#fingerprint', '#Pregnancy', '#android', '#apps', '#beautiful', '#cute', '#health', '#igers', '#iphoneonly', '#iphonesia', '#iphone']	#fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone	128	13	1	13	2	5	0	2	8.923077	0.039062	1.0	[fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone]	fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone
1	1	2	Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/	[finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias]	finaly transparant silicon case thanks uncle yay sony xperia sonyexperias	['#yay', '#Sony', '#Xperia', '#S', '#sonyexperias…']	#yay #Sony #Xperia #S #sonyexperias…	131	17	1	17	5	12	0	3	6.764706	0.091603	1.0	[finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri]	finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri
7918	7918	7919	Finally got my #smart #pocket #wifi stay connected anytime,anywhere! #ipad and #samsung #s3 #gadget # http://instagr.am/p/U-53G_vJU8/	[finaly, got, smart, pocket, wifi, stay, conected, anytimeanywhere, ipad, samsung, gadget]	finaly got smart pocket wifi stay conected anytimeanywhere ipad samsung gadget	['#smart', '#pocket', '#wifi', '#ipad', '#samsung', '#s3', '#gadget', '#']	#smart #pocket #wifi #ipad #samsung #s3 #gadget #	133	16	1	16	1	5	1	3	7.375000	0.037594	1.0	[finaly, got, smart, pocket, wi, fi, stay, conected, anytime, anywhere, ipad, samsung, gadget]	finaly got smart pocket wi fi stay conected anytime anywhere ipad samsung gadget
7919	7919	7920	Apple Barcelona!!! #Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew… http://instagram.com/p/wBApVzpCl3/	[aple, barcelona, aple, store, bcn, barcelona, travel, iphone, selfie, fly, fun, cabincrew]	aple barcelona aple store bcn barcelona travel iphone selfie fly fun cabincrew	['#Apple', '#Store', '#BCN', '#Barcelona', '#travel', '#iphone', '#selfie', '#fly', '#fun', '#cabincrew…']	#Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew…	129	13	1	13	5	12	3	2	9.000000	0.093023	1.0	[aple, barcelona, aple, store, n, barcelona, travel, iphone, self, ie, fly, fun, cabin, crew]	aple barcelona aple store n barcelona travel iphone self ie fly fun cabin crew

	index	id	label	tweet	tweet_lst_clean	tweet_clean	hashtags_lst	hashtags	total_length	num_words	num_sent	num_unique_words	num_words_title	num_uppercase	num_exclamation_marks	num_question_marks	num_punctuation	num_symbols	num_digits	avg_word_len	avg_uppercase	avg_unique	tweet_lst_clean_emoji	tweet_clean_emoji
0	0	1	0.0	#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone	[fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone]	fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone	['#fingerprint', '#Pregnancy', '#android', '#apps', '#beautiful', '#cute', '#health', '#igers', '#iphoneonly', '#iphonesia', '#iphone']	#fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone	128	13	1	13	2	5	0	0	2	0	0	8.923077	0.039062	1.0	[fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone]	fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone
1	1	2	0.0	Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/	[finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias]	finaly transparant silicon case thanks uncle yay sony xperia sonyexperias	['#yay', '#Sony', '#Xperia', '#S', '#sonyexperias…']	#yay #Sony #Xperia #S #sonyexperias…	131	17	1	17	5	12	0	0	3	0	0	6.764706	0.091603	1.0	[finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri]	finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri

	Text Model	Params	Model	Description	F1 Weighted	Time Taken	Accuracy	Precision	Recall	Time Taken Sec
0	BoW	Extra+Scaling	logregcv	cv=2	0.874369	0.790960	0.691358	0.871478	28.41 sec	28.405608
1	Word2Vec		logregcv	cv=2	0.870581	0.770270	0.703704	0.868605	3.69 sec	3.688961
2	BoW		logregcv	cv=2	0.862387	2.60 sec	0.864899	0.763085	0.683951	2.596540
3	BoW	Scaling	logregcv	cv=2	0.858586	0.768546	0.639506	0.854088	25.52 sec	25.518958
4	BoW	Scaling	linear svc	max_iter=200	0.854798	0.756598	0.637037	0.850488	0.56 sec	0.561049

	Text Model	Params	Model	Description	F1 Weighted	Time Taken	Accuracy	Precision	Recall	Time Taken Sec
0	Word2Vec	Extra	logregcv	cv=2	0.885101	0.775309	0.775309	0.885101	23.28 sec	23.282527
1	BoW	Extra+Scaling	logregcv	cv=2	0.874369	0.790960	0.691358	0.871478	28.41 sec	28.405608
2	Word2Vec		logregcv	cv=2	0.870581	0.770270	0.703704	0.868605	3.69 sec	3.688961
3	BoW		logregcv	cv=2	0.862387	2.60 sec	0.864899	0.763085	0.683951	2.596540
4	BoW	Scaling	logregcv	cv=2	0.858586	0.768546	0.639506	0.854088	25.52 sec	25.518958
5	BoW	Scaling	linear svc	max_iter=200	0.854798	0.756598	0.637037	0.850488	0.56 sec	0.561049

	Text Model	Params	Model	Description	F1 Weighted	Time Taken	Accuracy	Precision	Recall	Time Taken Sec
0	Word2Vec	Extra	logregcv	cv=2	0.885101	0.775309	0.775309	0.885101	23.28 sec	23.282527
1	Word2Vec	Extra+Scaling	logregcv	cv=2	0.882576	0.795148	0.728395	0.880838	8.48 sec	8.483309
2	BoW	Extra+Scaling	logregcv	cv=2	0.874369	0.790960	0.691358	0.871478	28.41 sec	28.405608
3	Word2Vec		logregcv	cv=2	0.870581	0.770270	0.703704	0.868605	3.69 sec	3.688961
4	BoW		logregcv	cv=2	0.862387	2.60 sec	0.864899	0.763085	0.683951	2.596540
5	BoW	Scaling	logregcv	cv=2	0.858586	0.768546	0.639506	0.854088	25.52 sec	25.518958
6	BoW	Scaling	linear svc	max_iter=200	0.854798	0.756598	0.637037	0.850488	0.56 sec	0.561049

Table of Contents

Description¶

Load the libraries¶

Load the data¶

Train test split¶

Modelling¶

BoW (CountVectorizer)¶

Bow + ExtraFeatures¶

Word2Vec¶

Tf-idf¶

Logistic RegressionCV¶

Linear SVC¶