import sys
# local environment hack: put the conda env's site-packages on the path
sys.path.append('/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages')
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import mlxtend
import plotly_express as px
pd.options.plotting.backend = "plotly"
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 1000)
import time,os,json,sys
time_start_notebook = time.time()
home = os.path.expanduser('~')
SEED=100
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
print([(x.__name__,x.__version__) for x in [np,pd,sns,sklearn,mlxtend,px]])
#=========Visualization
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)
#========= NLP
import re
import string
import nltk
import spacy
import textblob
import gensim
from urllib.parse import urlparse
from nltk.corpus import stopwords
import texthero as hero
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from nltk.tokenize import TweetTokenizer
print([(x.__name__,x.__version__) for x in [nltk,spacy,textblob,gensim]])
#=======OTHERS
import ast
import scipy
import multiprocessing as mp
import gc
import operator
from collections import defaultdict
#===== Warnings
import warnings
warnings.simplefilter("ignore")
[('numpy', '1.17.5'), ('pandas', '1.0.5'), ('seaborn', '0.10.1'), ('sklearn', '0.23.1'), ('mlxtend', '0.17.0'), ('plotly_express', '0.4.1')]
[('nltk', '3.4.4'), ('spacy', '2.2.3'), ('textblob', '0.15.3'), ('gensim', '3.8.3')]
df_combined = pd.read_csv('../data/processed/df_combined_clean.csv')
# Variables
target = 'label'
maincol = 'tweet'
mc = maincol + '_clean'
mcl = maincol + '_lst_clean'
mce = mc + '_emoji'
mcle = mcl + '_emoji'
# the list columns were saved to CSV as strings; parse them back into lists
df_combined[mcl] = df_combined[mcl].apply(ast.literal_eval)
df_combined[mcle] = df_combined[mcle].apply(ast.literal_eval)
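A minimal illustration (a toy string, not the real data) of why ast.literal_eval is needed here: read_csv returns a list column as its string repr, and literal_eval safely parses it back.
s = "['fingerprint', 'pregnancy', 'test']"  # what read_csv hands back
lst = ast.literal_eval(s)                   # back to a real Python list
print(type(lst), lst[0])                    # <class 'list'> fingerprint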
df_train = df_combined[~df_combined[target].isnull()]
df_test = df_combined[df_combined[target].isnull()]
print(f"shape df_train: {df_train.shape}")
print(f"shape df_test: {df_test.shape}")
df_train.head(2).append(df_train.tail(2))
shape df_train: (7920, 24)
shape df_test: (1953, 24)
index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | total_length | num_words | num_sent | num_unique_words | num_words_title | num_uppercase | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_digits | avg_word_len | avg_uppercase | avg_unique | tweet_lst_clean_emoji | tweet_clean_emoji | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0.0 | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone] | fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone | ['#fingerprint', '#Pregnancy', '#android', '#apps', '#beautiful', '#cute', '#health', '#igers', '#iphoneonly', '#iphonesia', '#iphone'] | #fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | 128 | 13 | 1 | 13 | 2 | 5 | 0 | 0 | 2 | 0 | 0 | 8.923077 | 0.039062 | 1.0 | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone] | fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone |
1 | 1 | 2 | 0.0 | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ | [finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias] | finaly transparant silicon case thanks uncle yay sony xperia sonyexperias | ['#yay', '#Sony', '#Xperia', '#S', '#sonyexperias…'] | #yay #Sony #Xperia #S #sonyexperias… | 131 | 17 | 1 | 17 | 5 | 12 | 0 | 0 | 3 | 0 | 0 | 6.764706 | 0.091603 | 1.0 | [finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri] | finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri |
7918 | 7918 | 7919 | 0.0 | Finally got my #smart #pocket #wifi stay connected anytime,anywhere! #ipad and #samsung #s3 #gadget # http://instagr.am/p/U-53G_vJU8/ | [finaly, got, smart, pocket, wifi, stay, conected, anytimeanywhere, ipad, samsung, gadget] | finaly got smart pocket wifi stay conected anytimeanywhere ipad samsung gadget | ['#smart', '#pocket', '#wifi', '#ipad', '#samsung', '#s3', '#gadget', '#'] | #smart #pocket #wifi #ipad #samsung #s3 #gadget # | 133 | 16 | 1 | 16 | 1 | 5 | 1 | 0 | 3 | 0 | 0 | 7.375000 | 0.037594 | 1.0 | [finaly, got, smart, pocket, wi, fi, stay, conected, anytime, anywhere, ipad, samsung, gadget] | finaly got smart pocket wi fi stay conected anytime anywhere ipad samsung gadget |
7919 | 7919 | 7920 | 0.0 | Apple Barcelona!!! #Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew… http://instagram.com/p/wBApVzpCl3/ | [aple, barcelona, aple, store, bcn, barcelona, travel, iphone, selfie, fly, fun, cabincrew] | aple barcelona aple store bcn barcelona travel iphone selfie fly fun cabincrew | ['#Apple', '#Store', '#BCN', '#Barcelona', '#travel', '#iphone', '#selfie', '#fly', '#fun', '#cabincrew…'] | #Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew… | 129 | 13 | 1 | 13 | 5 | 12 | 3 | 0 | 2 | 0 | 0 | 9.000000 | 0.093023 | 1.0 | [aple, barcelona, aple, store, n, barcelona, travel, iphone, self, ie, fly, fun, cabin, crew] | aple barcelona aple store n barcelona travel iphone self ie fly fun cabin crew |
from sklearn.model_selection import train_test_split
# df_train has labels but df_test does not have it.
# break df_train into train and valid.
df_Xtrain_orig = df_train
ser_ytrain_orig = df_train[target]
df_Xtest = df_test
ser_ytest = None # it does not exist
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
df_train, df_train[target],
test_size=0.2, random_state=SEED, stratify=df_train[target])
ytrain_orig = ser_ytrain_orig.to_numpy().ravel()
ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()
print(f"\ndf_Xtrain_orig : {df_Xtrain_orig.shape}")
print(f"ser_ytrain_orig: {ser_ytrain_orig.shape}")
print(f"\ndf_Xtrain : {df_Xtrain.shape}")
print(f"ser_ytrain : {ser_ytrain.shape}")
print(f"\ndf_Xvalid : {df_Xvalid.shape}")
print(f"ser_yvalid : {ser_yvalid.shape}")
df_Xtrain_orig.head(2)
df_Xtrain_orig : (7920, 24)
ser_ytrain_orig: (7920,)

df_Xtrain : (6336, 24)
ser_ytrain : (6336,)

df_Xvalid : (1584, 24)
ser_yvalid : (1584,)
index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | total_length | num_words | num_sent | num_unique_words | num_words_title | num_uppercase | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_digits | avg_word_len | avg_uppercase | avg_unique | tweet_lst_clean_emoji | tweet_clean_emoji | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0.0 | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone] | fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone | ['#fingerprint', '#Pregnancy', '#android', '#apps', '#beautiful', '#cute', '#health', '#igers', '#iphoneonly', '#iphonesia', '#iphone'] | #fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | 128 | 13 | 1 | 13 | 2 | 5 | 0 | 0 | 2 | 0 | 0 | 8.923077 | 0.039062 | 1.0 | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone] | fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone |
1 | 1 | 2 | 0.0 | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ | [finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias] | finaly transparant silicon case thanks uncle yay sony xperia sonyexperias | ['#yay', '#Sony', '#Xperia', '#S', '#sonyexperias…'] | #yay #Sony #Xperia #S #sonyexperias… | 131 | 17 | 1 | 17 | 5 | 12 | 0 | 0 | 3 | 0 | 0 | 6.764706 | 0.091603 | 1.0 | [finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri] | finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri |
features = ['total_length',
'num_words', 'num_sent',
'num_unique_words', 'num_words_title',
'num_uppercase','num_exclamation_marks',
'num_question_marks','num_punctuation',
'num_symbols', 'num_digits',
'avg_word_len', 'avg_uppercase','avg_unique']
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
df_eval = pd.DataFrame({
'Text Model': [],
'Params': [],
'Model': [],
'Description': [],
'Accuracy': [],
'Precision': [],
'Recall': [],
'F1 Weighted': [],
'Time Taken': [],
'Time Taken Sec': [],
})
import time
from sklearn import metrics
def get_model_evaluation(Xtr, ytr, Xvd, yvd,
                         text_model_name, params,
                         model_name, desc, model,
                         df_eval=df_eval,
                         scaling=False,
                         sort_col='F1 Weighted',
                         N=None, disp=True, only=None):
    """Fit the model, score it on the validation split, and log a row in df_eval."""
    if scaling:
        # MinMaxScaler needs a dense array; pass X.A for sparse matrices
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaler.fit(Xtr)
        Xtr = scaler.transform(Xtr)
        Xvd = scaler.transform(Xvd)
    time_start = time.time()
    model.fit(Xtr, ytr)
    skf = StratifiedKFold(n_splits=3, random_state=SEED, shuffle=True)
    # NOTE: cross_val_predict clones and refits the model on the validation
    # folds, so the fit above is not reused for these predictions.
    vd_preds = cross_val_predict(model, Xvd, yvd, cv=skf, n_jobs=-1)
    acc = metrics.accuracy_score(yvd, vd_preds)
    pre = metrics.precision_score(yvd, vd_preds)
    rec = metrics.recall_score(yvd, vd_preds)
    f1 = metrics.f1_score(yvd, vd_preds, average='weighted')
    time_taken_sec = time.time() - time_start
    m, s = divmod(time_taken_sec, 60)
    time_taken = f"{s:.2f} sec" if not m else f"{int(m)} min {s:.2f} sec"
    row = [text_model_name, params, model_name, desc]
    row = row + [acc, pre, rec, f1, time_taken, time_taken_sec]
    df_eval.loc[len(df_eval)] = row
    df_eval = df_eval.drop_duplicates(
        subset=['Text Model', 'Params', 'Model', 'Description'])
    df_eval = df_eval.sort_values(sort_col, ascending=False)
    df_eval = df_eval.reset_index(drop=True)
    # reorder columns
    df_eval = df_eval[['Text Model', 'Params', 'Model',
                       'Description', 'F1 Weighted',
                       'Time Taken', 'Accuracy',
                       'Precision', 'Recall',
                       'Time Taken Sec']]
    if only:
        # show the overall best row plus all rows for the given text model
        df_eval2 = df_eval[df_eval['Text Model'] == only]
        df_eval2 = df_eval.nlargest(1, 'F1 Weighted').append(df_eval2)
    if disp:
        if only:
            display(df_eval2.head(N).style.background_gradient(
                subset=[sort_col]))
        else:
            display(df_eval.head(N).style.background_gradient(
                subset=[sort_col]))
    return df_eval
Text featurization approaches to try:
- CountVectorizer
- TfidfVectorizer
- ktrain
- simpletransformers
from sklearn.feature_extraction.text import CountVectorizer
# fit the vocabulary on the training split only, then reuse it for the
# validation split (refitting on validation would build an incompatible vocabulary)
vec_cv = CountVectorizer()
csr_Xtrain_orig = CountVectorizer().fit_transform(df_Xtrain_orig[mce])
csr_Xtrain = vec_cv.fit_transform(df_Xtrain[mce])
csr_Xvalid = vec_cv.transform(df_Xvalid[mce])
# current variables
Xtr = csr_Xtrain
Xvd = csr_Xvalid
ytr = ytrain
yvd = yvalid
model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)
text_model_name = 'BoW'
params = ''
model_name = 'logregcv'
desc = 'cv=2'
df_eval = get_model_evaluation(Xtr,ytr,Xvd,yvd,
text_model_name,params,
model_name,desc,model,
df_eval=df_eval)
Text Model | Params | Model | Description | F1 Weighted | Time Taken | Accuracy | Precision | Recall | Time Taken Sec | |
---|---|---|---|---|---|---|---|---|---|---|
0 | BoW | logregcv | cv=2 | 0.862387 | 2.60 sec | 0.864899 | 0.763085 | 0.683951 | 2.596540 |
model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)
text_model_name = 'BoW'
params = 'Scaling' # needs dense array
model_name = 'logregcv'
desc = 'cv=2'
df_eval = get_model_evaluation(Xtr.A,ytr,Xvd.A,yvd,
text_model_name,params,
model_name,desc,model,
scaling=True,
df_eval=df_eval)
Text Model | Params | Model | Description | F1 Weighted | Time Taken | Accuracy | Precision | Recall | Time Taken Sec | |
---|---|---|---|---|---|---|---|---|---|---|
0 | BoW | logregcv | cv=2 | 0.862387 | 2.60 sec | 0.864899 | 0.763085 | 0.683951 | 2.596540 | |
1 | BoW | Scaling | logregcv | cv=2 | 0.858586 | 0.768546 | 0.639506 | 0.854088 | 25.52 sec | 25.518958 |
model = LinearSVC(random_state=SEED, max_iter=200)
text_model_name = 'BoW'
params = 'Scaling' # needs dense array
model_name = 'linear svc'
desc = 'max_iter=200'
df_eval = get_model_evaluation(Xtr.A,ytr,Xvd.A,yvd,
text_model_name,params,
model_name,desc,model,
scaling=True,
df_eval=df_eval)
Text Model | Params | Model | Description | F1 Weighted | Time Taken | Accuracy | Precision | Recall | Time Taken Sec | |
---|---|---|---|---|---|---|---|---|---|---|
0 | BoW | logregcv | cv=2 | 0.862387 | 2.60 sec | 0.864899 | 0.763085 | 0.683951 | 2.596540 | |
1 | BoW | Scaling | logregcv | cv=2 | 0.858586 | 0.768546 | 0.639506 | 0.854088 | 25.52 sec | 25.518958 |
2 | BoW | Scaling | linear svc | max_iter=200 | 0.854798 | 0.756598 | 0.637037 | 0.850488 | 0.56 sec | 0.561049 |
Xtr2 = np.c_[Xtr.A, df_Xtrain[features].to_numpy()]
Xvd2 = np.c_[Xvd.A, df_Xvalid[features].to_numpy()]
model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)
text_model_name = 'BoW'
params = 'Extra+Scaling' # needs dense array
model_name = 'logregcv'
desc = 'cv=2'
df_eval = get_model_evaluation(Xtr2,ytr,Xvd2,yvd,
text_model_name,params,
model_name,desc,model,
scaling=True,
df_eval=df_eval)
Text Model | Params | Model | Description | F1 Weighted | Time Taken | Accuracy | Precision | Recall | Time Taken Sec | |
---|---|---|---|---|---|---|---|---|---|---|
0 | BoW | Extra+Scaling | logregcv | cv=2 | 0.874369 | 0.790960 | 0.691358 | 0.871478 | 28.41 sec | 28.405608 |
1 | BoW | logregcv | cv=2 | 0.862387 | 2.60 sec | 0.864899 | 0.763085 | 0.683951 | 2.596540 | |
2 | BoW | Scaling | logregcv | cv=2 | 0.858586 | 0.768546 | 0.639506 | 0.854088 | 25.52 sec | 25.518958 |
3 | BoW | Scaling | linear svc | max_iter=200 | 0.854798 | 0.756598 | 0.637037 | 0.850488 | 0.56 sec | 0.561049 |
import functools
@functools.lru_cache()
def get_word_vecs(col):
    """Embed each document as its spaCy doc vector (mean of its token vectors)."""
    nlp = spacy.load('en_core_web_lg')
    f = lambda x: nlp(x).vector.reshape(1, -1)
    Xtr = df_Xtrain[col].apply(f).to_numpy()
    Xvd = df_Xvalid[col].apply(f).to_numpy()
    Xtr = np.concatenate(Xtr, axis=0)
    Xvd = np.concatenate(Xvd, axis=0)
    return [Xtr, Xvd]
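A quick sanity check on a toy sentence (assuming en_core_web_lg is installed): spaCy's doc.vector is the average of the token vectors, which is what the helper above relies on.
nlp_check = spacy.load('en_core_web_lg')
doc = nlp_check("fingerprint pregnancy test")
manual = np.mean([t.vector for t in doc], axis=0)  # mean of token vectors
assert np.allclose(doc.vector, manual)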
%%time
Xtr,Xvd = get_word_vecs(mce)
# Wall time: 1min 3s
CPU times: user 54 s, sys: 1.17 s, total: 55.2 s
Wall time: 55.4 s
model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)
text_model_name = 'Word2Vec'
params = ''
model_name = 'logregcv'
desc = 'cv=2'
df_eval = get_model_evaluation(Xtr,ytr,Xvd,yvd,
text_model_name,params,
model_name,desc,model,
scaling=False,
df_eval=df_eval)
Text Model | Params | Model | Description | F1 Weighted | Time Taken | Accuracy | Precision | Recall | Time Taken Sec | |
---|---|---|---|---|---|---|---|---|---|---|
0 | BoW | Extra+Scaling | logregcv | cv=2 | 0.874369 | 0.790960 | 0.691358 | 0.871478 | 28.41 sec | 28.405608 |
1 | Word2Vec | logregcv | cv=2 | 0.870581 | 0.770270 | 0.703704 | 0.868605 | 3.69 sec | 3.688961 | |
2 | BoW | logregcv | cv=2 | 0.862387 | 2.60 sec | 0.864899 | 0.763085 | 0.683951 | 2.596540 | |
3 | BoW | Scaling | logregcv | cv=2 | 0.858586 | 0.768546 | 0.639506 | 0.854088 | 25.52 sec | 25.518958 |
4 | BoW | Scaling | linear svc | max_iter=200 | 0.854798 | 0.756598 | 0.637037 | 0.850488 | 0.56 sec | 0.561049 |
Xtr2 = np.c_[Xtr, df_Xtrain[features].to_numpy()]
Xvd2 = np.c_[Xvd, df_Xvalid[features].to_numpy()]
model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)
text_model_name = 'Word2Vec'
params = 'Extra'
model_name = 'logregcv'
desc = 'cv=2'
df_eval = get_model_evaluation(Xtr2,ytr,Xvd2,yvd,
text_model_name,params,
model_name,desc,model,
scaling=False,
df_eval=df_eval)
Text Model | Params | Model | Description | F1 Weighted | Time Taken | Accuracy | Precision | Recall | Time Taken Sec | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Word2Vec | Extra | logregcv | cv=2 | 0.885101 | 0.775309 | 0.775309 | 0.885101 | 23.28 sec | 23.282527 |
1 | BoW | Extra+Scaling | logregcv | cv=2 | 0.874369 | 0.790960 | 0.691358 | 0.871478 | 28.41 sec | 28.405608 |
2 | Word2Vec | logregcv | cv=2 | 0.870581 | 0.770270 | 0.703704 | 0.868605 | 3.69 sec | 3.688961 | |
3 | BoW | logregcv | cv=2 | 0.862387 | 2.60 sec | 0.864899 | 0.763085 | 0.683951 | 2.596540 | |
4 | BoW | Scaling | logregcv | cv=2 | 0.858586 | 0.768546 | 0.639506 | 0.854088 | 25.52 sec | 25.518958 |
5 | BoW | Scaling | linear svc | max_iter=200 | 0.854798 | 0.756598 | 0.637037 | 0.850488 | 0.56 sec | 0.561049 |
Xtr2 = np.c_[Xtr, df_Xtrain[features].to_numpy()]
Xvd2 = np.c_[Xvd, df_Xvalid[features].to_numpy()]
model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)
text_model_name = 'Word2Vec'
params = 'Extra+Scaling'
model_name = 'logregcv'
desc = 'cv=2'
df_eval = get_model_evaluation(Xtr2,ytr,Xvd2,yvd,
text_model_name,params,
model_name,desc,model,
scaling=True,
df_eval=df_eval)
Text Model | Params | Model | Description | F1 Weighted | Time Taken | Accuracy | Precision | Recall | Time Taken Sec | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Word2Vec | Extra | logregcv | cv=2 | 0.885101 | 0.775309 | 0.775309 | 0.885101 | 23.28 sec | 23.282527 |
1 | Word2Vec | Extra+Scaling | logregcv | cv=2 | 0.882576 | 0.795148 | 0.728395 | 0.880838 | 8.48 sec | 8.483309 |
2 | BoW | Extra+Scaling | logregcv | cv=2 | 0.874369 | 0.790960 | 0.691358 | 0.871478 | 28.41 sec | 28.405608 |
3 | Word2Vec | logregcv | cv=2 | 0.870581 | 0.770270 | 0.703704 | 0.868605 | 3.69 sec | 3.688961 | |
4 | BoW | logregcv | cv=2 | 0.862387 | 2.60 sec | 0.864899 | 0.763085 | 0.683951 | 2.596540 | |
5 | BoW | Scaling | logregcv | cv=2 | 0.858586 | 0.768546 | 0.639506 | 0.854088 | 25.52 sec | 25.518958 |
6 | BoW | Scaling | linear svc | max_iter=200 | 0.854798 | 0.756598 | 0.637037 | 0.850488 | 0.56 sec | 0.561049 |
Term Frequency: how often a given term appears within a document.
$\mathrm{TF}=\frac{\text{Number of times the term appears in the doc}}{\text{Total number of words in the doc}}$
Inverse Document Frequency: how often the term appears across the documents. If a term is very common among documents (e.g., "the", "a", "is"), it gets a low IDF score.
$\mathrm{IDF}=\ln \left(\frac{\text{Number of docs}}{\text{Number of docs the term appears in}}\right)$
Term Frequency – Inverse Document Frequency (TF-IDF): the product of the TF and IDF scores of the term.
$\mathrm{TFIDF}=\mathrm{TF} \times \mathrm{IDF}$
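A quick worked example with made-up numbers: a term that appears 3 times in a 100-word document and occurs in 10 of 1,000 documents gets
$\mathrm{TF}=\frac{3}{100}=0.03, \quad \mathrm{IDF}=\ln\left(\frac{1000}{10}\right) \approx 4.61, \quad \mathrm{TFIDF} \approx 0.03 \times 4.61 \approx 0.14$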
In scikit-learn, TF-IDF features are obtained from the class TfidfVectorizer. Its main parameters are:
- min_df: remove words from the vocabulary that occur in fewer than min_df documents.
- max_df: remove words from the vocabulary that occur in more than max_df documents (given as a count or as a fraction of the corpus).
- sublinear_tf: set to True to scale the term frequency logarithmically.
- stop_words: remove the predefined stop words for 'english'.
- use_idf: weight term frequencies by the inverse document frequency.
- ngram_range: (1, 2) indicates that both unigrams and bigrams will be considered.

NOTE: TF is the same in sklearn as in the textbook definition, but IDF is different (sklearn smooths it to avoid division by zero).
Ref: https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
Here, df(t) is the number of documents in the document set that contain the term t.
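A small sketch on a toy corpus (not the tweet data) verifying sklearn's smoothed IDF; with the default smooth_idf=True it computes idf(t) = ln((1 + n) / (1 + df(t))) + 1.
# toy corpus to check sklearn's smoothed IDF against the manual formula
toy = ["the cat sat", "the dog sat", "the bird flew"]
vec_toy = TfidfVectorizer().fit(toy)
n, df_the = len(toy), 3  # 'the' occurs in all 3 documents
manual_idf = np.log((1 + n) / (1 + df_the)) + 1
assert np.isclose(vec_toy.idf_[vec_toy.vocabulary_['the']], manual_idf)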
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
vec_tfidf = TfidfVectorizer()
# fit the vocabulary and idf weights on the training split only,
# then transform both splits with the same fitted vectorizer
vec_tfidf.fit(df_Xtrain[mce])
csr_Xtrain = vec_tfidf.transform(df_Xtrain[mce])
csr_Xvalid = vec_tfidf.transform(df_Xvalid[mce])
Xtr = csr_Xtrain
Xvd = csr_Xvalid
csr_Xtrain_extra = csr_matrix(df_Xtrain[features].to_numpy())
csr_Xvalid_extra = csr_matrix(df_Xvalid[features].to_numpy())
Xtr2 = scipy.sparse.hstack([csr_Xtrain, csr_Xtrain_extra])
Xvd2 = scipy.sparse.hstack([csr_Xvalid, csr_Xvalid_extra])
# scipy.sparse.save_npz('../data/processed/tfidf_default_Xtr.npz', csr_Xtrain)
# scipy.sparse.save_npz('../data/processed/tfidf_default_Xvd.npz', csr_Xvalid)
# scipy.sparse.save_npz('../data/processed/tfidf_default_Xtr2.npz', Xtr2)
# scipy.sparse.save_npz('../data/processed/tfidf_default_Xvd2.npz', Xvd2)
model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)
text_model_name = 'tfidf'
params = ''
model_name = 'logregcv'
desc = 'cv=2'
df_eval = get_model_evaluation(Xtr,ytr,Xvd,yvd,
text_model_name,params,
model_name,desc,model,
scaling=False,
df_eval=df_eval)
Text Model | Params | Model | Description | F1 Weighted | Time Taken | Accuracy | Precision | Recall | Time Taken Sec | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Word2Vec | Extra | logregcv | cv=2 | 0.885101 | 0.775309 | 0.775309 | 0.885101 | 23.28 sec | 23.282527 |
1 | Word2Vec | Extra+Scaling | logregcv | cv=2 | 0.882576 | 0.795148 | 0.728395 | 0.880838 | 8.48 sec | 8.483309 |
2 | BoW | Extra+Scaling | logregcv | cv=2 | 0.874369 | 0.790960 | 0.691358 | 0.871478 | 28.41 sec | 28.405608 |
3 | Word2Vec | logregcv | cv=2 | 0.870581 | 0.770270 | 0.703704 | 0.868605 | 3.69 sec | 3.688961 | |
4 | BoW | logregcv | cv=2 | 0.862387 | 2.60 sec | 0.864899 | 0.763085 | 0.683951 | 2.596540 | |
5 | BoW | Scaling | logregcv | cv=2 | 0.858586 | 0.768546 | 0.639506 | 0.854088 | 25.52 sec | 25.518958 |
6 | tfidf | logregcv | cv=2 | 0.855429 | 0.775000 | 0.612346 | 0.849470 | 2.05 sec | 2.048890 | |
7 | BoW | Scaling | linear svc | max_iter=200 | 0.854798 | 0.756598 | 0.637037 | 0.850488 | 0.56 sec | 0.561049 |
model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)
text_model_name = 'tfidf'
params = 'Extra'
model_name = 'logregcv'
desc = 'cv=2'
df_eval = get_model_evaluation(Xtr2,ytr,Xvd2,yvd,
text_model_name,params,
model_name,desc,model,
scaling=False,
df_eval=df_eval)
Text Model | Params | Model | Description | F1 Weighted | Time Taken | Accuracy | Precision | Recall | Time Taken Sec | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Word2Vec | Extra | logregcv | cv=2 | 0.885101 | 0.775309 | 0.775309 | 0.885101 | 23.28 sec | 23.282527 |
1 | Word2Vec | Extra+Scaling | logregcv | cv=2 | 0.882576 | 0.795148 | 0.728395 | 0.880838 | 8.48 sec | 8.483309 |
2 | tfidf | Extra | logregcv | cv=2 | 0.876263 | 0.772846 | 0.730864 | 0.875107 | 29.80 sec | 29.803477 |
3 | BoW | Extra+Scaling | logregcv | cv=2 | 0.874369 | 0.790960 | 0.691358 | 0.871478 | 28.41 sec | 28.405608 |
4 | Word2Vec | logregcv | cv=2 | 0.870581 | 0.770270 | 0.703704 | 0.868605 | 3.69 sec | 3.688961 | |
5 | BoW | logregcv | cv=2 | 0.862387 | 2.60 sec | 0.864899 | 0.763085 | 0.683951 | 2.596540 | |
6 | BoW | Scaling | logregcv | cv=2 | 0.858586 | 0.768546 | 0.639506 | 0.854088 | 25.52 sec | 25.518958 |
7 | tfidf | logregcv | cv=2 | 0.855429 | 0.775000 | 0.612346 | 0.849470 | 2.05 sec | 2.048890 | |
8 | BoW | Scaling | linear svc | max_iter=200 | 0.854798 | 0.756598 | 0.637037 | 0.850488 | 0.56 sec | 0.561049 |
model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)
text_model_name = 'tfidf'
params = 'Extra+Scaling'
model_name = 'logregcv'
desc = 'cv=2'
df_eval = get_model_evaluation(Xtr2.A,ytr,Xvd2.A,yvd,
text_model_name,params,
model_name,desc,model,
scaling=True,
df_eval=df_eval)
Text Model | Params | Model | Description | F1 Weighted | Time Taken | Accuracy | Precision | Recall | Time Taken Sec | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Word2Vec | Extra | logregcv | cv=2 | 0.885101 | 0.775309 | 0.775309 | 0.885101 | 23.28 sec | 23.282527 |
1 | Word2Vec | Extra+Scaling | logregcv | cv=2 | 0.882576 | 0.795148 | 0.728395 | 0.880838 | 8.48 sec | 8.483309 |
2 | tfidf | Extra | logregcv | cv=2 | 0.876263 | 0.772846 | 0.730864 | 0.875107 | 29.80 sec | 29.803477 |
3 | BoW | Extra+Scaling | logregcv | cv=2 | 0.874369 | 0.790960 | 0.691358 | 0.871478 | 28.41 sec | 28.405608 |
4 | tfidf | Extra+Scaling | logregcv | cv=2 | 0.873737 | 0.790368 | 0.688889 | 0.870769 | 26.79 sec | 26.791343 |
5 | Word2Vec | logregcv | cv=2 | 0.870581 | 0.770270 | 0.703704 | 0.868605 | 3.69 sec | 3.688961 | |
6 | BoW | logregcv | cv=2 | 0.862387 | 2.60 sec | 0.864899 | 0.763085 | 0.683951 | 2.596540 | |
7 | BoW | Scaling | logregcv | cv=2 | 0.858586 | 0.768546 | 0.639506 | 0.854088 | 25.52 sec | 25.518958 |
8 | tfidf | logregcv | cv=2 | 0.855429 | 0.775000 | 0.612346 | 0.849470 | 2.05 sec | 2.048890 | |
9 | BoW | Scaling | linear svc | max_iter=200 | 0.854798 | 0.756598 | 0.637037 | 0.850488 | 0.56 sec | 0.561049 |
from sklearn.svm import LinearSVC
model = LinearSVC(random_state=SEED, max_iter=200)
text_model_name = 'tfidf'
params = ''
model_name = 'svc'
desc = 'max_iter=200'
df_eval = get_model_evaluation(Xtr,ytr,Xvd,yvd,
text_model_name,params,
model_name,desc,model,
scaling=False,
df_eval=df_eval,only='tfidf')
Text Model | Params | Model | Description | F1 Weighted | Time Taken | Accuracy | Precision | Recall | Time Taken Sec | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Word2Vec | Extra | logregcv | cv=2 | 0.885101 | 0.775309 | 0.775309 | 0.885101 | 23.28 sec | 23.282527 |
2 | tfidf | Extra | logregcv | cv=2 | 0.876263 | 0.772846 | 0.730864 | 0.875107 | 29.80 sec | 29.803477 |
4 | tfidf | Extra+Scaling | logregcv | cv=2 | 0.873737 | 0.790368 | 0.688889 | 0.870769 | 26.79 sec | 26.791343 |
8 | tfidf | logregcv | cv=2 | 0.855429 | 0.775000 | 0.612346 | 0.849470 | 2.05 sec | 2.048890 | |
9 | tfidf | svc | max_iter=200 | 0.855429 | 0.795302 | 0.585185 | 0.847565 | 0.07 sec | 0.069284 |