import sys
sys.path.append('/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages')
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import mlxtend
import plotly_express as px
pd.options.plotting.backend = "plotly"
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',1000)
import time,os,json,sys
time_start_notebook = time.time()
home = os.path.expanduser('~')
SEED=100
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
print([(x.__name__,x.__version__) for x in [np,pd,sns,sklearn,mlxtend,px]])
#=========Visualization
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)
#========= NLP
import re
import string
import nltk
import spacy
import textblob
import gensim
from urllib.parse import urlparse
from nltk.corpus import stopwords
import texthero as hero
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import TweetTokenizer
print([(x.__name__,x.__version__) for x in [nltk,spacy,textblob,gensim]])
#=======OTHERS
import ast
import scipy
import multiprocessing as mp
import gc
import operator
from collections import defaultdict
#===== Warnings
import warnings
warnings.simplefilter("ignore")  # also silences plotting warnings
[('numpy', '1.17.5'), ('pandas', '1.0.5'), ('seaborn', '0.10.1'), ('sklearn', '0.23.1'), ('mlxtend', '0.17.0'), ('plotly_express', '0.4.1')]
[('nltk', '3.4.4'), ('spacy', '2.2.3'), ('textblob', '0.15.3'), ('gensim', '3.8.3')]
df_combined = pd.read_csv('../data/processed/df_combined_clean.csv')
# Variables
target = 'label'
maincol = 'tweet'
mc = maincol + '_clean'
mcl = maincol + '_lst_clean'
mce = mc + '_emoji'
mcle = mcl + '_emoji'
# read_csv loads list columns as strings; parse them back into real lists
# (see the round-trip sketch below)
df_combined[mcl] = df_combined[mcl].apply(ast.literal_eval)
df_combined[mcle] = df_combined[mcle].apply(ast.literal_eval)
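For illustration, a minimal round trip (toy values, not from the dataset) showing why ast.literal_eval is needed here:
import ast
cell = "['finaly', 'got', 'smart']"   # what read_csv returns for a list-valued column
tokens = ast.literal_eval(cell)       # safely parse the string back into a list
print(type(tokens), tokens)           # <class 'list'> ['finaly', 'got', 'smart']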
df_train = df_combined[~df_combined[target].isnull()]
df_test = df_combined[df_combined[target].isnull()]
print(f"shape df_train: {df_train.shape}")
print(f"shape df_test: {df_test.shape}")
df_train.head(2).append(df_train.tail(2))
shape df_train: (7920, 24)
shape df_test : (1953, 24)
row | index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | total_length | num_words | num_sent | num_unique_words | num_words_title | num_uppercase | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_digits | avg_word_len | avg_uppercase | avg_unique | tweet_lst_clean_emoji | tweet_clean_emoji
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 1 | 0.0 | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone] | fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone | ['#fingerprint', '#Pregnancy', '#android', '#apps', '#beautiful', '#cute', '#health', '#igers', '#iphoneonly', '#iphonesia', '#iphone'] | #fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | 128 | 13 | 1 | 13 | 2 | 5 | 0 | 0 | 2 | 0 | 0 | 8.923077 | 0.039062 | 1.0 | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone] | fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone |
1 | 1 | 2 | 0.0 | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ | [finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias] | finaly transparant silicon case thanks uncle yay sony xperia sonyexperias | ['#yay', '#Sony', '#Xperia', '#S', '#sonyexperias…'] | #yay #Sony #Xperia #S #sonyexperias… | 131 | 17 | 1 | 17 | 5 | 12 | 0 | 0 | 3 | 0 | 0 | 6.764706 | 0.091603 | 1.0 | [finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri] | finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri |
7918 | 7918 | 7919 | 0.0 | Finally got my #smart #pocket #wifi stay connected anytime,anywhere! #ipad and #samsung #s3 #gadget # http://instagr.am/p/U-53G_vJU8/ | [finaly, got, smart, pocket, wifi, stay, conected, anytimeanywhere, ipad, samsung, gadget] | finaly got smart pocket wifi stay conected anytimeanywhere ipad samsung gadget | ['#smart', '#pocket', '#wifi', '#ipad', '#samsung', '#s3', '#gadget', '#'] | #smart #pocket #wifi #ipad #samsung #s3 #gadget # | 133 | 16 | 1 | 16 | 1 | 5 | 1 | 0 | 3 | 0 | 0 | 7.375000 | 0.037594 | 1.0 | [finaly, got, smart, pocket, wi, fi, stay, conected, anytime, anywhere, ipad, samsung, gadget] | finaly got smart pocket wi fi stay conected anytime anywhere ipad samsung gadget |
7919 | 7919 | 7920 | 0.0 | Apple Barcelona!!! #Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew… http://instagram.com/p/wBApVzpCl3/ | [aple, barcelona, aple, store, bcn, barcelona, travel, iphone, selfie, fly, fun, cabincrew] | aple barcelona aple store bcn barcelona travel iphone selfie fly fun cabincrew | ['#Apple', '#Store', '#BCN', '#Barcelona', '#travel', '#iphone', '#selfie', '#fly', '#fun', '#cabincrew…'] | #Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew… | 129 | 13 | 1 | 13 | 5 | 12 | 3 | 0 | 2 | 0 | 0 | 9.000000 | 0.093023 | 1.0 | [aple, barcelona, aple, store, n, barcelona, travel, iphone, self, ie, fly, fun, cabin, crew] | aple barcelona aple store n barcelona travel iphone self ie fly fun cabin crew |
from sklearn.model_selection import train_test_split
# df_train has labels but df_test does not.
# Split df_train into train and valid sets.
df_Xtrain_orig = df_train
ser_ytrain_orig = df_train[target]
df_Xtest = df_test
ser_ytest = None # it does not exist
# stratify keeps the class ratio identical in the train and valid splits
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
    df_train, df_train[target],
    test_size=0.2, random_state=SEED, stratify=df_train[target])
ytrain_orig = ser_ytrain_orig.to_numpy().ravel()
ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()
ytr = ytrain
yvd = yvalid
print(f"\ndf_Xtrain_orig : {df_Xtrain_orig.shape}")
print(f"ser_ytrain_orig: {ser_ytrain_orig.shape}")
print(f"\ndf_Xtrain : {df_Xtrain.shape}")
print(f"ser_ytrain : {ser_ytrain.shape}")
print(f"\ndf_Xvalid : {df_Xvalid.shape}")
print(f"ser_yvalid : {ser_yvalid.shape}")
df_Xtrain_orig.head(2)
df_Xtrain_orig : (7920, 24)
ser_ytrain_orig: (7920,)

df_Xtrain : (6336, 24)
ser_ytrain : (6336,)

df_Xvalid : (1584, 24)
ser_yvalid : (1584,)
row | index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | total_length | num_words | num_sent | num_unique_words | num_words_title | num_uppercase | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_digits | avg_word_len | avg_uppercase | avg_unique | tweet_lst_clean_emoji | tweet_clean_emoji
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 1 | 0.0 | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone] | fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone | ['#fingerprint', '#Pregnancy', '#android', '#apps', '#beautiful', '#cute', '#health', '#igers', '#iphoneonly', '#iphonesia', '#iphone'] | #fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | 128 | 13 | 1 | 13 | 2 | 5 | 0 | 0 | 2 | 0 | 0 | 8.923077 | 0.039062 | 1.0 | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone] | fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone |
1 | 1 | 2 | 0.0 | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ | [finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias] | finaly transparant silicon case thanks uncle yay sony xperia sonyexperias | ['#yay', '#Sony', '#Xperia', '#S', '#sonyexperias…'] | #yay #Sony #Xperia #S #sonyexperias… | 131 | 17 | 1 | 17 | 5 | 12 | 0 | 0 | 3 | 0 | 0 | 6.764706 | 0.091603 | 1.0 | [finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri] | finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri |
features = ['total_length',
'num_words', 'num_sent',
'num_unique_words', 'num_words_title',
'num_uppercase','num_exclamation_marks',
'num_question_marks','num_punctuation',
'num_symbols', 'num_digits',
'avg_word_len', 'avg_uppercase','avg_unique']
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
df_eval = pd.DataFrame({
'Text Model': [],
'Params': [],
'Model': [],
'Description': [],
'Accuracy': [],
'Precision': [],
'Recall': [],
'F1 Weighted': [],
'Time Taken': [],
'Time Taken Sec': [],
})
import time
from sklearn import metrics
def get_model_evaluation(Xtr,ytr,Xvd,yvd,
                        text_model_name,params,
                        model_name,desc,model,
                        df_eval=df_eval,
                        scaling=False,
                        sort_col = 'F1 Weighted',
                        N=None,disp=True,only=None):
    """Fit a model, score it with 3-fold CV predictions on the validation
    set, and append one row to the running df_eval table.

    Note: cross_val_predict refits the estimator on folds of the validation
    data, so the initial fit on (Xtr, ytr) contributes to the reported
    timing, not to the reported scores.
    """
    if scaling:
        # learn scaling statistics on train only, then apply to both sets
        scaler = MinMaxScaler(feature_range=(0,1))
        scaler.fit(Xtr)
        Xtr = scaler.transform(Xtr)
        Xvd = scaler.transform(Xvd)
    time_start = time.time()
    model.fit(Xtr,ytr)
    skf = StratifiedKFold(n_splits=3,
                          random_state=SEED,
                          shuffle=True)
    vd_preds = cross_val_predict(model,Xvd,yvd,cv=skf,n_jobs=-1)
    # precision and recall are for the positive class (binary default);
    # F1 is weighted across both classes
    acc = metrics.accuracy_score(yvd,vd_preds)
    pre = metrics.precision_score(yvd,vd_preds)
    rec = metrics.recall_score(yvd,vd_preds)
    f1 = metrics.f1_score(yvd,vd_preds,average='weighted')
    time_taken_sec = time.time() - time_start
    m,s = divmod(time_taken_sec,60)
    time_taken = f"{s:.2f} sec" if not m else f"{int(m)} min {s:.2f} sec"
    row = [text_model_name, params, model_name,desc]
    row = row + [acc, pre, rec, f1, time_taken, time_taken_sec]
    df_eval.loc[len(df_eval)] = row
    df_eval = df_eval.drop_duplicates(subset=['Text Model', 'Params', 'Model', 'Description'])
    df_eval = df_eval.sort_values(sort_col,ascending=False)
    # reorder columns
    df_eval = df_eval[['Text Model', 'Params', 'Model',
                       'Description', 'F1 Weighted',
                       'Time Taken','Accuracy',
                       'Precision','Recall',
                       'Time Taken Sec']]
    if disp:
        display(df_eval.head(N).style.background_gradient(
            subset=[sort_col]))
    return df_eval
Term Frequency: this gives how often a given term appears within a document.
$\mathrm{TF}=\frac{\text { Number of times the term appears in the doc }}{\text { Total number of words in the doc }}$
Inverse Document Frequency: this gives how often the word appears across the documents. If a term is very common among documents (e.g., “the”, “a”, “is”), it gets a low IDF score.
$\mathrm{IDF}=\ln \left(\frac{\text { Number of docs }}{\text { Number docs the term appears in }}\right)$
Term Frequency – Inverse Document Frequency (TF-IDF): TF-IDF is the product of the TF and IDF scores of a term.
$\mathrm{TF\text{-}IDF}=\mathrm{TF} \times \mathrm{IDF}$
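A small worked example (toy numbers, illustrative only): in a corpus of 100 documents, a term that appears 3 times in a 100-word document and occurs in 10 of the documents gets
import math
tf  = 3 / 100              # term frequency within the document
idf = math.log(100 / 10)   # textbook IDF (natural log)
print(round(tf * idf, 4))  # 0.0691 -> the TF-IDF weight of the term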
In scikit-learn, TF-IDF is obtained from the class TfidfVectorizer. Its main parameters are:

- min_df : remove words from the vocabulary that occur in fewer than min_df documents.
- max_df : remove words from the vocabulary that occur in more than max_df of the documents in the corpus.
- sublinear_tf : set to True to scale the term frequency logarithmically.
- stop_words : remove the predefined stop words ('english').
- use_idf : weight terms by inverse document frequency.
- ngram_range : (1,2) indicates that both unigrams and bigrams are considered.

NOTE: TF is the same in sklearn as in the textbook definition, but IDF is different (sklearn modifies it to avoid division by zero). Here, df(t) is the number of documents in the document set that contain term t. A sketch of the difference follows below.

Ref: https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
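A minimal sketch of that difference (toy corpus; with the default smooth_idf=True, sklearn computes idf(t) = ln((1+n)/(1+df(t))) + 1, which never divides by zero and never returns zero):
import math
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat", "the dog sat", "the bird flew"]
vec = TfidfVectorizer()   # smooth_idf=True is the default
vec.fit(docs)

n = len(docs)
for term, idx in sorted(vec.vocabulary_.items()):
    df_t = sum(term in d.split() for d in docs)   # document frequency of the term
    textbook = math.log(n / df_t)                 # ln(N/df): 0 when df == N
    print(f"{term:5s} textbook={textbook:.3f} sklearn={vec.idf_[idx]:.3f}")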
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
vec_tfidf = TfidfVectorizer()
# Fit on the training text only, then reuse the fitted vocabulary on the
# validation text so both matrices share the same feature columns.
csr_Xtrain = vec_tfidf.fit_transform(df_Xtrain[mce])
csr_Xvalid = vec_tfidf.transform(df_Xvalid[mce])
Xtr = csr_Xtrain
Xvd = csr_Xvalid
# Append the 14 handcrafted numeric features as extra sparse columns.
csr_Xtrain_extra = csr_matrix(df_Xtrain[features].to_numpy())
csr_Xvalid_extra = csr_matrix(df_Xvalid[features].to_numpy())
Xtr2 = scipy.sparse.hstack([csr_Xtrain, csr_Xtrain_extra])
Xvd2 = scipy.sparse.hstack([csr_Xvalid, csr_Xvalid_extra])
# scipy.sparse.save_npz('../data/processed/tfidf_default_Xtr.npz', csr_Xtrain)
# scipy.sparse.save_npz('../data/processed/tfidf_default_Xvd.npz', csr_Xvalid)
# scipy.sparse.save_npz('../data/processed/tfidf_default_Xtr2.npz', Xtr2)
# scipy.sparse.save_npz('../data/processed/tfidf_default_Xvd2.npz', Xvd2)
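As a quick sanity check (optional), the stacked matrices keep the row counts and gain the 14 handcrafted feature columns:
print(Xtr.shape, Xtr2.shape)   # (6336, V) and (6336, V + 14); V = tfidf vocabulary size
print(Xvd.shape, Xvd2.shape)   # (1584, V) and (1584, V + 14)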
model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)
text_model_name = 'tfidf'
params = ''
model_name = 'logregcv'
desc = 'cv=2'
df_eval = get_model_evaluation(Xtr,ytr,Xvd,yvd,
text_model_name,params,
model_name,desc,model,
scaling=False,
df_eval=df_eval)
row | Text Model | Params | Model | Description | F1 Weighted | Time Taken | Accuracy | Precision | Recall | Time Taken Sec
---|---|---|---|---|---|---|---|---|---|---
0 | tfidf |  | logregcv | cv=2 | 0.849470 | 3.18 sec | 0.855429 | 0.775000 | 0.612346 | 3.175989
model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)
text_model_name = 'tfidf'
params = 'Extra'
model_name = 'logregcv'
desc = 'cv=2'
df_eval = get_model_evaluation(Xtr2,ytr,Xvd2,yvd,
text_model_name,params,
model_name,desc,model,
scaling=False,
df_eval=df_eval)
row | Text Model | Params | Model | Description | F1 Weighted | Time Taken | Accuracy | Precision | Recall | Time Taken Sec
---|---|---|---|---|---|---|---|---|---|---
1 | tfidf | Extra | logregcv | cv=2 | 0.875107 | 32.41 sec | 0.876263 | 0.772846 | 0.730864 | 32.407605
0 | tfidf |  | logregcv | cv=2 | 0.849470 | 3.18 sec | 0.855429 | 0.775000 | 0.612346 | 3.175989
# Extra+Scaling: MinMaxScaler needs dense input, hence Xtr2.A / Xvd2.A below.
model = LogisticRegressionCV(cv=2, random_state=SEED, max_iter=1000)
text_model_name = 'tfidf'
params = 'Extra+Scaling'
model_name = 'logregcv'
desc = 'cv=2'
df_eval = get_model_evaluation(Xtr2.A,ytr,Xvd2.A,yvd,
text_model_name,params,
model_name,desc,model,
scaling=True,
df_eval=df_eval)
row | Text Model | Params | Model | Description | F1 Weighted | Time Taken | Accuracy | Precision | Recall | Time Taken Sec
---|---|---|---|---|---|---|---|---|---|---
1 | tfidf | Extra | logregcv | cv=2 | 0.875107 | 32.41 sec | 0.876263 | 0.772846 | 0.730864 | 32.407605
2 | tfidf | Extra+Scaling | logregcv | cv=2 | 0.870769 | 30.90 sec | 0.873737 | 0.790368 | 0.688889 | 30.895569
0 | tfidf |  | logregcv | cv=2 | 0.849470 | 3.18 sec | 0.855429 | 0.775000 | 0.612346 | 3.175989
from sklearn.svm import LinearSVC
model = LinearSVC(random_state=SEED, max_iter=200)
text_model_name = 'tfidf'
params = ''
model_name = 'svc'
desc = 'max_iter=200'
df_eval = get_model_evaluation(Xtr,ytr,Xvd,yvd,
text_model_name,params,
model_name,desc,model,
scaling=False,
df_eval=df_eval)
row | Text Model | Params | Model | Description | F1 Weighted | Time Taken | Accuracy | Precision | Recall | Time Taken Sec
---|---|---|---|---|---|---|---|---|---|---
1 | tfidf | Extra | logregcv | cv=2 | 0.875107 | 32.41 sec | 0.876263 | 0.772846 | 0.730864 | 32.407605
2 | tfidf | Extra+Scaling | logregcv | cv=2 | 0.870769 | 30.90 sec | 0.873737 | 0.790368 | 0.688889 | 30.895569
3 | tfidf |  | svc | max_iter=200 | 0.847565 | 0.14 sec | 0.855429 | 0.795302 | 0.585185 | 0.135881
0 | tfidf |  | logregcv | cv=2 | 0.849470 | 3.18 sec | 0.855429 | 0.775000 | 0.612346 | 3.175989
from sklearn.naive_bayes import GaussianNB, BernoulliNB
model = GaussianNB() # needs dense matrix
text_model_name = 'tfidf'
params = ''
model_name = 'gnb'
desc = ''
df_eval = get_model_evaluation(Xtr.A,ytr,Xvd.A,yvd,
text_model_name,params,
model_name,desc,model,
scaling=False,
df_eval=df_eval)
row | Text Model | Params | Model | Description | F1 Weighted | Time Taken | Accuracy | Precision | Recall | Time Taken Sec
---|---|---|---|---|---|---|---|---|---|---
1 | tfidf | Extra | logregcv | cv=2 | 0.875107 | 32.41 sec | 0.876263 | 0.772846 | 0.730864 | 32.407605
2 | tfidf | Extra+Scaling | logregcv | cv=2 | 0.870769 | 30.90 sec | 0.873737 | 0.790368 | 0.688889 | 30.895569
3 | tfidf |  | svc | max_iter=200 | 0.847565 | 0.14 sec | 0.855429 | 0.795302 | 0.585185 | 0.135881
0 | tfidf |  | logregcv | cv=2 | 0.849470 | 3.18 sec | 0.855429 | 0.775000 | 0.612346 | 3.175989
4 | tfidf |  | gnb |  | 0.789709 | 1.67 sec | 0.796086 | 0.621302 | 0.518519 | 1.666834
model = BernoulliNB() # needs dense matrix
text_model_name = 'tfidf'
params = ''
model_name = 'bernoulli'
desc = ''
df_eval = get_model_evaluation(Xtr.A,ytr,Xvd.A,yvd,
text_model_name,params,
model_name,desc,model,
scaling=False,
df_eval=df_eval)
row | Text Model | Params | Model | Description | F1 Weighted | Time Taken | Accuracy | Precision | Recall | Time Taken Sec
---|---|---|---|---|---|---|---|---|---|---
1 | tfidf | Extra | logregcv | cv=2 | 0.875107 | 32.41 sec | 0.876263 | 0.772846 | 0.730864 | 32.407605
2 | tfidf | Extra+Scaling | logregcv | cv=2 | 0.870769 | 30.90 sec | 0.873737 | 0.790368 | 0.688889 | 30.895569
3 | tfidf |  | svc | max_iter=200 | 0.847565 | 0.14 sec | 0.855429 | 0.795302 | 0.585185 | 0.135881
0 | tfidf |  | logregcv | cv=2 | 0.849470 | 3.18 sec | 0.855429 | 0.775000 | 0.612346 | 3.175989
5 | tfidf |  | bernoulli |  | 0.801164 | 1.07 sec | 0.823232 | 0.804878 | 0.407407 | 1.073401
4 | tfidf |  | gnb |  | 0.789709 | 1.67 sec | 0.796086 | 0.621302 | 0.518519 | 1.666834
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=1000,n_jobs=-1,random_state=SEED)
text_model_name = 'tfidf'
params = 'n_estimators=1000'
model_name = 'rf'
desc = ''
df_eval = get_model_evaluation(Xtr,ytr,Xvd,yvd,
text_model_name,params,
model_name,desc,model,
scaling=False,
df_eval=df_eval)
row | Text Model | Params | Model | Description | F1 Weighted | Time Taken | Accuracy | Precision | Recall | Time Taken Sec
---|---|---|---|---|---|---|---|---|---|---
1 | tfidf | Extra | logregcv | cv=2 | 0.875107 | 32.41 sec | 0.876263 | 0.772846 | 0.730864 | 32.407605
2 | tfidf | Extra+Scaling | logregcv | cv=2 | 0.870769 | 30.90 sec | 0.873737 | 0.790368 | 0.688889 | 30.895569
3 | tfidf |  | svc | max_iter=200 | 0.847565 | 0.14 sec | 0.855429 | 0.795302 | 0.585185 | 0.135881
0 | tfidf |  | logregcv | cv=2 | 0.849470 | 3.18 sec | 0.855429 | 0.775000 | 0.612346 | 3.175989
6 | tfidf | n_estimators=1000 | rf |  | 0.810180 | 16.32 sec | 0.832071 | 0.849246 | 0.417284 | 16.320531
5 | tfidf |  | bernoulli |  | 0.801164 | 1.07 sec | 0.823232 | 0.804878 | 0.407407 | 1.073401
4 | tfidf |  | gnb |  | 0.789709 | 1.67 sec | 0.796086 | 0.621302 | 0.518519 | 1.666834
from sklearn.linear_model import SGDClassifier
# SGDClassifier?
model = SGDClassifier(n_jobs=-1,random_state=SEED)
text_model_name = 'tfidf'
params = ''
model_name = 'sgd'
desc = ''
df_eval = get_model_evaluation(Xtr,ytr,Xvd,yvd,
text_model_name,params,
model_name,desc,model,
scaling=False,
df_eval=df_eval)
row | Text Model | Params | Model | Description | F1 Weighted | Time Taken | Accuracy | Precision | Recall | Time Taken Sec
---|---|---|---|---|---|---|---|---|---|---
1 | tfidf | Extra | logregcv | cv=2 | 0.875107 | 32.41 sec | 0.876263 | 0.772846 | 0.730864 | 32.407605
2 | tfidf | Extra+Scaling | logregcv | cv=2 | 0.870769 | 30.90 sec | 0.873737 | 0.790368 | 0.688889 | 30.895569
3 | tfidf |  | svc | max_iter=200 | 0.847565 | 0.14 sec | 0.855429 | 0.795302 | 0.585185 | 0.135881
0 | tfidf |  | logregcv | cv=2 | 0.849470 | 3.18 sec | 0.855429 | 0.775000 | 0.612346 | 3.175989
7 | tfidf |  | sgd |  | 0.842669 | 0.05 sec | 0.848485 | 0.753846 | 0.604938 | 0.048415
6 | tfidf | n_estimators=1000 | rf |  | 0.810180 | 16.32 sec | 0.832071 | 0.849246 | 0.417284 | 16.320531
5 | tfidf |  | bernoulli |  | 0.801164 | 1.07 sec | 0.823232 | 0.804878 | 0.407407 | 1.073401
4 | tfidf |  | gnb |  | 0.789709 | 1.67 sec | 0.796086 | 0.621302 | 0.518519 | 1.666834
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, StratifiedKFold
scorer = metrics.make_scorer(metrics.f1_score,average='weighted')
skf = StratifiedKFold(n_splits=3,random_state=SEED,shuffle=True)
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=SEED)
def plot_grid_param_C(df_grid):
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_grid['param_C'],
y=df_grid['mean_train_score'],
mode='lines+markers',
name='Train F1'))
fig.add_trace(go.Scatter(x=df_grid['param_C'],
y=df_grid['mean_test_score'],
mode='lines+markers',
name='CV F1'))
fig['layout']['title'] = 'Cross validation scores'
fig['layout']['title']['x'] = 0.5
fig['layout']['xaxis']['title'] = 'C : Hyperparameter'
fig['layout']['yaxis']['title'] = 'F1'
fig.show()
note = """
%%time
params = {'C': [0.001,0.01,0.1,1,10]}
model = LogisticRegression(random_state=SEED,max_iter=10_000)
grid = GridSearchCV(model,params,n_jobs=-1,scoring=scorer,
cv=rskf,return_train_score=True)
grid.fit(Xtr,ytr)
print(grid.best_score_, grid.best_params_)
cols = [ 'params', 'mean_test_score', 'std_test_score' ]
df_grid = pd.DataFrame(grid.cv_results_).sort_values('mean_test_score',ascending=False)
display(df_grid[cols].style.background_gradient(subset=['mean_test_score']))
plot_grid_param_C(df_grid)
0.8741918227069756 {'C': 10}
""";
note = """
%%time
params = {'C': [i/10 for i in range(60,80)]}
model = LogisticRegression(random_state=SEED,max_iter=10_000)
grid = GridSearchCV(model,params,n_jobs=-1,scoring=scorer,
cv=rskf,return_train_score=True)
grid.fit(Xtr,ytr)
print(grid.best_score_, grid.best_params_)
cols = [ 'params', 'mean_test_score', 'std_test_score' ]
df_grid = pd.DataFrame(grid.cv_results_).sort_values('mean_test_score',ascending=False)
display(df_grid[cols].style.background_gradient(subset=['mean_test_score']))
plot_grid_param_C(df_grid)
0.8753013175137699 {'C': 6.2}
"""
The default parameters of TfidfVectorizer:
TfidfVectorizer(input='content', encoding='utf-8', decode_error='strict',
                strip_accents=None, lowercase=True, preprocessor=None,
                tokenizer=None, analyzer='word', stop_words=None,
                token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1),
                max_df=1.0, min_df=1, max_features=None, vocabulary=None,
                binary=False, dtype=<class 'numpy.float64'>, norm='l2',
                use_idf=True, smooth_idf=True, sublinear_tf=False)
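For reference, a hedged sketch of how the parameters discussed earlier might be set explicitly (values are illustrative, not tuned for this dataset):
vec = TfidfVectorizer(
    min_df=2,              # drop terms that occur in fewer than 2 documents
    max_df=0.9,            # drop terms that occur in more than 90% of documents
    sublinear_tf=True,     # use 1 + log(tf) instead of raw term counts
    stop_words='english',  # built-in English stop-word list
    use_idf=True,          # weight terms by inverse document frequency
    ngram_range=(1, 2),    # unigrams and bigrams
)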
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import ColumnSelector
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
%%time
pipe = Pipeline([
    # select the single text column; drop_axis=True hands it to tfidf as 1-D
    ('col_selector', ColumnSelector(cols=(mce),drop_axis=True)),
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression(random_state=SEED,max_iter=1_000,C=6.3)),
])
pipe.fit(df_Xtrain,ytrain)
vd_preds = pipe.predict(df_Xvalid)
f1 = metrics.f1_score(yvalid,vd_preds,average='weighted')  # y_true comes first
print(f1)
0.8966844029633191
CPU times: user 292 ms, sys: 6.82 ms, total: 298 ms
Wall time: 339 ms
%%time
note = """
params = {
'tfidf__norm': ('l1', 'l2'),
'tfidf__max_df': (0.25, 0.5, 0.75,1.0,2.0),
'tfidf__min_df': (0.25, 0.5, 0.75,1.0,2.0),
'tfidf__ngram_range': ((1, 1), (1, 2)),
}
pipe = Pipeline([
('col_selector', ColumnSelector(cols=(mce),drop_axis=True)),
('tfidf', TfidfVectorizer()),
('logreg', LogisticRegression(random_state=SEED,max_iter=1_000,C=6.3)),
])
grid = GridSearchCV(pipe,params,n_jobs=-1,scoring=scorer,cv=3,return_train_score=False)
grid.fit(df_Xtrain,ytrain)
print(grid.best_score_, grid.best_params_)
0.7303171363757338 {'tfidf__max_df': 0.5, 'tfidf__min_df': 0.25, 'tfidf__ngram_range': (1, 1), 'tfidf__norm': 'l2'}
CPU times: user 16.4 s, sys: 943 ms, total: 17.4 s
Wall time: 28.7 s
"""
CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs
# cols = [ 'params', 'mean_test_score', 'std_test_score' ]
# df_grid = pd.DataFrame(grid.cv_results_).sort_values('mean_test_score',ascending=False)
# df_grid[cols].head().style.background_gradient(subset=['mean_test_score'])
model = LogisticRegression(C=6.3, random_state=SEED, max_iter=1000)
# Out-of-fold CV predictions on the validation set, used for the
# confusion matrices and the classification report below.
vd_preds = cross_val_predict(model,Xvd,yvd,n_jobs=-1)
(yvd==0.0).sum()
1179
pd.crosstab(yvd,vd_preds,margins=True,normalize=False)
row_0 \ col_0 | 0.0 | 1.0 | All
---|---|---|---
0.0 | 1126 | 53 | 1179
1.0 | 174 | 231 | 405
All | 1300 | 284 | 1584

Of the 405 positive tweets in the validation set, 231 are recovered and 174 are missed, while only 53 of the 1179 negative tweets are falsely flagged.
pd.crosstab(yvd,vd_preds,margins=False,normalize='columns')
row_0 \ col_0 | 0.0 | 1.0
---|---|---
0.0 | 0.866154 | 0.18662
1.0 | 0.133846 | 0.81338
print(metrics.classification_report(yvd,vd_preds))
              precision    recall  f1-score   support

         0.0       0.87      0.96      0.91      1179
         1.0       0.81      0.57      0.67       405

    accuracy                           0.86      1584
   macro avg       0.84      0.76      0.79      1584
weighted avg       0.85      0.86      0.85      1584