import sys
sys.path.append('/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages')


import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import mlxtend
import plotly_express as px

pd.options.plotting.backend = "plotly"
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',1000)

import time,os,json,sys
time_start_notebook = time.time()
home = os.path.expanduser('~')
SEED=100

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

print([(x.__name__,x.__version__) for x in [np,pd,sns,sklearn,mlxtend,px]])

#=========Visualization
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)


#========= NLP
import re
import string
import nltk
import spacy
import textblob
import gensim
import texthero
from urllib.parse import urlparse
from nltk.corpus import stopwords
import texthero as hero
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer


print([(x.__name__,x.__version__) for x in [nltk,spacy,textblob,gensim]])

#=======OTHERS
import ast
import scipy
import multiprocessing as mp
import gc
import operator
from collections import defaultdict

#=====Warnigns
import warnings
warnings.simplefilter("ignore")
# plottig warnings

[('numpy', '1.17.5'), ('pandas', '1.0.5'), ('seaborn', '0.10.1'), ('sklearn', '0.23.1'), ('mlxtend', '0.17.0'), ('plotly_express', '0.4.1')]

[('nltk', '3.4.4'), ('spacy', '2.2.3'), ('textblob', '0.15.3'), ('gensim', '3.8.3')]


df_combined = pd.read_csv('../data/processed/df_combined_clean.csv')

# Variables
target = 'label'
maincol = 'tweet'
mc = maincol + '_clean'
mcl = maincol + '_lst_clean'
mce = mc + '_emoji'
mcle = mcl + '_emoji'

# we need to make list as list type
df_combined[mcl] = df_combined[mcl].apply(ast.literal_eval)
df_combined[mcle] = df_combined[mcle].apply(ast.literal_eval)

df_train = df_combined[~df_combined[target].isnull()]
df_test = df_combined[df_combined[target].isnull()]

print(f"shape df_train: {df_train.shape}")
print(f"shape df_test: {df_test.shape}")

df_train.head(2).append(df_train.tail(2))

shape df_train: (7920, 24)
shape df_test: (1953, 24)


from sklearn.model_selection import train_test_split

target = target

# df_train has labels but df_test does not have it.
# break df_train into train and valid.

df_Xtrain_orig = df_train
ser_ytrain_orig = df_train[target]
df_Xtest = df_test
ser_ytest = None # it does not exist

df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
    df_train, df_train[target],
    test_size=0.2, random_state=SEED, stratify=df_train[target])

ytrain_orig = ser_ytrain_orig.to_numpy().ravel()
ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()

ytr = ytrain
yvd = yvalid


print(f"\ndf_Xtrain_orig : {df_Xtrain_orig.shape}")
print(f"ser_ytrain_orig: {ser_ytrain_orig.shape}")

print(f"\ndf_Xtrain      : {df_Xtrain.shape}")
print(f"ser_ytrain     : {ser_ytrain.shape}")

print(f"\ndf_Xvalid      : {df_Xvalid.shape}")
print(f"ser_yvalid     : {ser_yvalid.shape}")


df_Xtrain_orig.head(2)

df_Xtrain_orig : (7920, 24)
ser_ytrain_orig: (7920,)

df_Xtrain      : (6336, 24)
ser_ytrain     : (6336,)

df_Xvalid      : (1584, 24)
ser_yvalid     : (1584,)


features = ['total_length',
    'num_words', 'num_sent',
    'num_unique_words', 'num_words_title',
    'num_uppercase','num_exclamation_marks',
    'num_question_marks','num_punctuation',
    'num_symbols', 'num_digits',
    'avg_word_len', 'avg_uppercase','avg_unique']


from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict


df_eval = pd.DataFrame({
    'Text Model': [],
    'Params': [],
    'Model': [],
    'Description': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1 Weighted': [],
    'Time Taken': [],
    'Time Taken Sec': [],
})


import time
from sklearn import metrics

def get_model_evaluation(Xtr,ytr,Xvd,yvd,
            text_model_name,params,
            model_name,desc,model,
            df_eval=df_eval,
            scaling=False,
            sort_col = 'F1 Weighted',
            N=None,disp=True,only=None):
    
    if scaling:
        scaler = MinMaxScaler(feature_range=(0,1))
        scaler.fit(Xtr)
        Xtr = scaler.transform(Xtr)
        Xvd = scaler.transform(Xvd)

    time_start = time.time()
    model.fit(Xtr,ytr)

    skf = StratifiedKFold(n_splits=3, 
                          random_state=SEED,
                          shuffle=True)
    vd_preds = cross_val_predict(model,Xvd,yvd,cv=skf,n_jobs=-1)

    acc = metrics.accuracy_score(yvd,vd_preds)
    pre = metrics.precision_score(yvd,vd_preds)
    rec = metrics.recall_score(yvd,vd_preds)
    f1 = metrics.f1_score(yvd,vd_preds,average='weighted')
    time_taken_sec = time.time() - time_start
    m,s = divmod(time_taken_sec,60)
    time_taken = f"{s:.2f} sec" if not m else f"{m} min {s:.2f} sec"

    row = [text_model_name, params, model_name,desc]
    row = row + [acc, pre, rec, f1, time_taken, time_taken_sec]

    df_eval.loc[len(df_eval)] = row
    df_eval = df_eval.drop_duplicates(subset=['Text Model', 'Params', 'Model', 'Description'])
    df_eval = df_eval.sort_values(sort_col,ascending=False)
    
    # reorder columns
    df_eval = df_eval[['Text Model', 'Params', 'Model',
                       'Description', 'F1 Weighted',
                       'Time Taken','Accuracy',
                       'Precision','Recall',
                        'Time Taken Sec']]
    
    if disp:
            display(df_eval.head(N).style.background_gradient(
            subset=[sort_col]))
        
    return df_eval


from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.csr import csr_matrix


vec_tfidf = TfidfVectorizer()

# vec_tfidf.fit(df_Xtrain[mce])
# csr_Xtrain = vec_tfidf.transform(df_Xtrain[mce])
# csr_Xvalid = vec_tfidf.transform(df_Xvalid[mce])

csr_Xtrain = vec_tfidf.fit_transform(df_Xtrain[mce])
csr_Xvalid = vec_tfidf.fit_transform(df_Xvalid[mce])

Xtr = csr_Xtrain
Xvd = csr_Xvalid

csr_Xtrain_extra = csr_matrix(df_Xtrain[features].to_numpy())
csr_Xvalid_extra = csr_matrix(df_Xvalid[features].to_numpy())

Xtr2 = scipy.sparse.hstack([csr_Xtrain, csr_Xtrain_extra])
Xvd2 = scipy.sparse.hstack([csr_Xvalid, csr_Xvalid_extra])


# scipy.sparse.save_npz('../data/processed/tfidf_default_Xtr.npz', csr_Xtrain)
# scipy.sparse.save_npz('../data/processed/tfidf_default_Xvd.npz', csr_Xvalid)

# scipy.sparse.save_npz('../data/processed/tfidf_default_Xtr2.npz', Xtr2)
# scipy.sparse.save_npz('../data/processed/tfidf_default_Xvd2.npz', Xvd2)


model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)

text_model_name = 'tfidf'
params = ''
model_name = 'logregcv'
desc = 'cv=2'

df_eval = get_model_evaluation(Xtr,ytr,Xvd,yvd,
            text_model_name,params,
            model_name,desc,model,
            scaling=False,
            df_eval=df_eval)


model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)

text_model_name = 'tfidf'
params = 'Extra'
model_name = 'logregcv'
desc = 'cv=2'

df_eval = get_model_evaluation(Xtr2,ytr,Xvd2,yvd,
            text_model_name,params,
            model_name,desc,model,
            scaling=False,
            df_eval=df_eval)


model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)

text_model_name = 'tfidf'
params = 'Extra+Scaling'
model_name = 'logregcv'
desc = 'cv=2'

df_eval = get_model_evaluation(Xtr2.A,ytr,Xvd2.A,yvd,
            text_model_name,params,
            model_name,desc,model,
            scaling=True,
            df_eval=df_eval)


from sklearn.svm import LinearSVC


model = LinearSVC(random_state=SEED, max_iter=200)

text_model_name = 'tfidf'
params = ''
model_name = 'svc'
desc = 'max_iter=200'

df_eval = get_model_evaluation(Xtr,ytr,Xvd,yvd,
            text_model_name,params,
            model_name,desc,model,
            scaling=False,
            df_eval=df_eval)


from sklearn.naive_bayes import GaussianNB, BernoulliNB


model = GaussianNB() # needs dense matrix

text_model_name = 'tfidf'
params = ''
model_name = 'gnb'
desc = ''

df_eval = get_model_evaluation(Xtr.A,ytr,Xvd.A,yvd,
            text_model_name,params,
            model_name,desc,model,
            scaling=False,
            df_eval=df_eval)


model = BernoulliNB() # needs dense matrix

text_model_name = 'tfidf'
params = ''
model_name = 'bernoulli'
desc = ''

df_eval = get_model_evaluation(Xtr.A,ytr,Xvd.A,yvd,
            text_model_name,params,
            model_name,desc,model,
            scaling=False,
            df_eval=df_eval)


from sklearn.ensemble import RandomForestClassifier


model = RandomForestClassifier(n_estimators=1000,n_jobs=-1,random_state=SEED)

text_model_name = 'tfidf'
params = 'n_estimators=1000'
model_name = 'rf'
desc = ''

df_eval = get_model_evaluation(Xtr,ytr,Xvd,yvd,
            text_model_name,params,
            model_name,desc,model,
            scaling=False,
            df_eval=df_eval)


from sklearn.linear_model import SGDClassifier
# SGDClassifier?


model = SGDClassifier(n_jobs=-1,random_state=SEED)

text_model_name = 'tfidf'
params = ''
model_name = 'sgd'
desc = ''

df_eval = get_model_evaluation(Xtr,ytr,Xvd,yvd,
            text_model_name,params,
            model_name,desc,model,
            scaling=False,
            df_eval=df_eval)


from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, StratifiedKFold

scorer = metrics.make_scorer(metrics.f1_score,average='weighted')

skf = StratifiedKFold(n_splits=3,random_state=SEED,shuffle=True)
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=SEED)


def plot_grid_param_C(df_grid):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df_grid['param_C'],
                             y=df_grid['mean_train_score'],
                             mode='lines+markers',
                             name='Train F1'))

    fig.add_trace(go.Scatter(x=df_grid['param_C'],
                             y=df_grid['mean_test_score'],
                             mode='lines+markers',
                             name='CV F1'))
    fig['layout']['title'] = 'Cross validation scores'
    fig['layout']['title']['x'] = 0.5
    fig['layout']['xaxis']['title'] = 'C : Hyperparameter'
    fig['layout']['yaxis']['title'] = 'F1'
    fig.show()


note = """
%%time

params = {'C': [0.001,0.01,0.1,1,10]}

model = LogisticRegression(random_state=SEED,max_iter=10_000)
grid = GridSearchCV(model,params,n_jobs=-1,scoring=scorer,
                    cv=rskf,return_train_score=True)

grid.fit(Xtr,ytr)
print(grid.best_score_, grid.best_params_)

cols = [ 'params', 'mean_test_score', 'std_test_score' ]
df_grid = pd.DataFrame(grid.cv_results_).sort_values('mean_test_score',ascending=False)

display(df_grid[cols].style.background_gradient(subset=['mean_test_score']))

plot_grid_param_C(df_grid)

0.8741918227069756 {'C': 10}
""";


note = """
%%time

params = {'C': [i/10 for i in range(60,80)]}
model = LogisticRegression(random_state=SEED,max_iter=10_000)

grid = GridSearchCV(model,params,n_jobs=-1,scoring=scorer,
                    cv=rskf,return_train_score=True)

grid.fit(Xtr,ytr)
print(grid.best_score_, grid.best_params_)

cols = [ 'params', 'mean_test_score', 'std_test_score' ]
df_grid = pd.DataFrame(grid.cv_results_).sort_values('mean_test_score',ascending=False)

display(df_grid[cols].style.background_gradient(subset=['mean_test_score']))

plot_grid_param_C(df_grid)

0.8753013175137699 {'C': 6.2}
"""


from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import ColumnSelector
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer


%%time

pipe = Pipeline([
    ('col_selector', ColumnSelector(cols=(mce),drop_axis=True)),
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression(random_state=SEED,max_iter=1_000,C=6.3)),
])


pipe.fit(df_Xtrain,ytrain)

vd_preds = pipe.predict(df_Xvalid)

f1 = metrics.f1_score(vd_preds,yvalid,average='weighted')
print(f1)

0.8966844029633191
CPU times: user 292 ms, sys: 6.82 ms, total: 298 ms
Wall time: 339 ms


%%time
note = """
params = {
    'tfidf__norm': ('l1', 'l2'),
    'tfidf__max_df': (0.25, 0.5, 0.75,1.0,2.0),
    'tfidf__min_df': (0.25, 0.5, 0.75,1.0,2.0),
    'tfidf__ngram_range': ((1, 1), (1, 2)),
}

pipe = Pipeline([
    ('col_selector', ColumnSelector(cols=(mce),drop_axis=True)),
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression(random_state=SEED,max_iter=1_000,C=6.3)),
])


grid = GridSearchCV(pipe,params,n_jobs=-1,scoring=scorer,cv=3,return_train_score=False)

grid.fit(df_Xtrain,ytrain)
print(grid.best_score_, grid.best_params_)

0.7303171363757338 {'tfidf__max_df': 0.5, 'tfidf__min_df': 0.25, 'tfidf__ngram_range': (1, 1), 'tfidf__norm': 'l2'}
CPU times: user 16.4 s, sys: 943 ms, total: 17.4 s
Wall time: 28.7 s
    
"""

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs


# cols = [ 'params', 'mean_test_score', 'std_test_score' ]
# df_grid = pd.DataFrame(grid.cv_results_).sort_values('mean_test_score',ascending=False)

# df_grid[cols].head().style.background_gradient(subset=['mean_test_score'])


model = LogisticRegression(C=6.3, random_state=SEED, max_iter=1000)
vd_preds = cross_val_predict(model,Xvd,yvd,n_jobs=-1)


(yvd==0.0).sum()

1179


pd.crosstab(yvd,vd_preds,margins=True,normalize=False)


pd.crosstab(yvd,vd_preds,margins=False,normalize='columns')


print(metrics.classification_report(yvd,vd_preds))

              precision    recall  f1-score   support

         0.0       0.87      0.96      0.91      1179
         1.0       0.81      0.57      0.67       405

    accuracy                           0.86      1584
   macro avg       0.84      0.76      0.79      1584
weighted avg       0.85      0.86      0.85      1584

	index	id	tweet	tweet_lst_clean	tweet_clean	hashtags_lst	hashtags	total_length	num_words	num_sent	num_unique_words	num_words_title	num_uppercase	num_exclamation_marks	num_punctuation	avg_word_len	avg_uppercase	avg_unique	tweet_lst_clean_emoji	tweet_clean_emoji
0	0	1	#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone	[fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone]	fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone	['#fingerprint', '#Pregnancy', '#android', '#apps', '#beautiful', '#cute', '#health', '#igers', '#iphoneonly', '#iphonesia', '#iphone']	#fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone	128	13	1	13	2	5	0	2	8.923077	0.039062	1.0	[fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone]	fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone
1	1	2	Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/	[finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias]	finaly transparant silicon case thanks uncle yay sony xperia sonyexperias	['#yay', '#Sony', '#Xperia', '#S', '#sonyexperias…']	#yay #Sony #Xperia #S #sonyexperias…	131	17	1	17	5	12	0	3	6.764706	0.091603	1.0	[finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri]	finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri
7918	7918	7919	Finally got my #smart #pocket #wifi stay connected anytime,anywhere! #ipad and #samsung #s3 #gadget # http://instagr.am/p/U-53G_vJU8/	[finaly, got, smart, pocket, wifi, stay, conected, anytimeanywhere, ipad, samsung, gadget]	finaly got smart pocket wifi stay conected anytimeanywhere ipad samsung gadget	['#smart', '#pocket', '#wifi', '#ipad', '#samsung', '#s3', '#gadget', '#']	#smart #pocket #wifi #ipad #samsung #s3 #gadget #	133	16	1	16	1	5	1	3	7.375000	0.037594	1.0	[finaly, got, smart, pocket, wi, fi, stay, conected, anytime, anywhere, ipad, samsung, gadget]	finaly got smart pocket wi fi stay conected anytime anywhere ipad samsung gadget
7919	7919	7920	Apple Barcelona!!! #Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew… http://instagram.com/p/wBApVzpCl3/	[aple, barcelona, aple, store, bcn, barcelona, travel, iphone, selfie, fly, fun, cabincrew]	aple barcelona aple store bcn barcelona travel iphone selfie fly fun cabincrew	['#Apple', '#Store', '#BCN', '#Barcelona', '#travel', '#iphone', '#selfie', '#fly', '#fun', '#cabincrew…']	#Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew…	129	13	1	13	5	12	3	2	9.000000	0.093023	1.0	[aple, barcelona, aple, store, n, barcelona, travel, iphone, self, ie, fly, fun, cabin, crew]	aple barcelona aple store n barcelona travel iphone self ie fly fun cabin crew

	index	id	label	tweet	tweet_lst_clean	tweet_clean	hashtags_lst	hashtags	total_length	num_words	num_sent	num_unique_words	num_words_title	num_uppercase	num_exclamation_marks	num_question_marks	num_punctuation	num_symbols	num_digits	avg_word_len	avg_uppercase	avg_unique	tweet_lst_clean_emoji	tweet_clean_emoji
0	0	1	0.0	#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone	[fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone]	fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone	['#fingerprint', '#Pregnancy', '#android', '#apps', '#beautiful', '#cute', '#health', '#igers', '#iphoneonly', '#iphonesia', '#iphone']	#fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone	128	13	1	13	2	5	0	0	2	0	0	8.923077	0.039062	1.0	[fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone]	fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone
1	1	2	0.0	Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/	[finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias]	finaly transparant silicon case thanks uncle yay sony xperia sonyexperias	['#yay', '#Sony', '#Xperia', '#S', '#sonyexperias…']	#yay #Sony #Xperia #S #sonyexperias…	131	17	1	17	5	12	0	0	3	0	0	6.764706	0.091603	1.0	[finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri]	finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri

	Text Model	Params	Model	Description	F1 Weighted	Time Taken	Accuracy	Precision	Recall	Time Taken Sec
1	tfidf	Extra	logregcv	cv=2	0.876263	0.772846	0.730864	0.875107	32.41 sec	32.407605
2	tfidf	Extra+Scaling	logregcv	cv=2	0.873737	0.790368	0.688889	0.870769	30.90 sec	30.895569
3	tfidf		svc	max_iter=200	0.855429	0.795302	0.585185	0.847565	0.14 sec	0.135881
0	tfidf		logregcv	cv=2	0.849470	3.18 sec	0.855429	0.775000	0.612346	3.175989
4	tfidf		gnb		0.796086	0.621302	0.518519	0.789709	1.67 sec	1.666834

	Text Model	Params	Model	Description	F1 Weighted	Time Taken	Accuracy	Precision	Recall	Time Taken Sec
1	tfidf	Extra	logregcv	cv=2	0.876263	0.772846	0.730864	0.875107	32.41 sec	32.407605
2	tfidf	Extra+Scaling	logregcv	cv=2	0.873737	0.790368	0.688889	0.870769	30.90 sec	30.895569
3	tfidf		svc	max_iter=200	0.855429	0.795302	0.585185	0.847565	0.14 sec	0.135881
0	tfidf		logregcv	cv=2	0.849470	3.18 sec	0.855429	0.775000	0.612346	3.175989
5	tfidf		bernoulli		0.823232	0.804878	0.407407	0.801164	1.07 sec	1.073401
4	tfidf		gnb		0.796086	0.621302	0.518519	0.789709	1.67 sec	1.666834

	Text Model	Params	Model	Description	F1 Weighted	Time Taken	Accuracy	Precision	Recall	Time Taken Sec
1	tfidf	Extra	logregcv	cv=2	0.876263	0.772846	0.730864	0.875107	32.41 sec	32.407605
2	tfidf	Extra+Scaling	logregcv	cv=2	0.873737	0.790368	0.688889	0.870769	30.90 sec	30.895569
3	tfidf		svc	max_iter=200	0.855429	0.795302	0.585185	0.847565	0.14 sec	0.135881
0	tfidf		logregcv	cv=2	0.849470	3.18 sec	0.855429	0.775000	0.612346	3.175989
6	tfidf	n_estimators=1000	rf		0.832071	0.849246	0.417284	0.810180	16.32 sec	16.320531
5	tfidf		bernoulli		0.823232	0.804878	0.407407	0.801164	1.07 sec	1.073401
4	tfidf		gnb		0.796086	0.621302	0.518519	0.789709	1.67 sec	1.666834

Table of Contents

Description¶

Load the libraries¶

Load the data¶

Train test split¶

Modelling¶

TF-IDF Vectorizing¶

Logistic RegressionCV¶

Linear SVC¶

GaussianNB¶

Random Forest¶

SGD Classifier¶

Hyperparameter Tuning¶

Tuning TF-IDF Vectorizer using Pipeline¶

Best Model¶

col_0	0.0	1.0	All
row_0
0.0	1126	53	1179
1.0	174	231	405
All	1300	284	1584

col_0	0.0	1.0
row_0
0.0	0.866154	0.18662
1.0	0.133846	0.81338