%%capture
# %%capture suppresses this cell's output in the notebook
import os
import sys

ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install -q git+https://github.com/amaiya/eli5@tfkeras_0_10_1
    !pip install -q neptune-client neptune-contrib
    !pip install -q scikit-plot

    ## print
    print('Environment: Google Colaboratory.')

# NOTE: after updating modules in Colab, restart the runtime.
Building wheel for eli5 (setup.py) ... done
Building wheel for neptune-client (setup.py) ... done
Building wheel for neptune-contrib (setup.py) ... done
Building wheel for future (setup.py) ... done
Building wheel for msgpack-python (setup.py) ... done
Building wheel for strict-rfc3339 (setup.py) ... done
Environment: Google Colaboratory.
import neptune
from neptunecontrib.api import log_table
from neptunecontrib.monitoring.keras import NeptuneMonitor
# use your real key and DELETE the cell
# neptune.init('bhishanpdl/twitter-sentiment-analysis','your_api_key')
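Rather than pasting the key inline and deleting the cell, a safer pattern (a sketch, not from the original notebook) is to read it from an environment variable so the notebook can be shared without leaking credentials:

# a sketch: read the Neptune API token from an environment variable
# (NEPTUNE_API_TOKEN is an assumed name; export it before running)
api_token = os.environ.get('NEPTUNE_API_TOKEN')
if api_token:
    neptune.init('bhishanpdl/twitter-sentiment-analysis', api_token=api_token)
else:
    print('Set NEPTUNE_API_TOKEN to enable Neptune logging.')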
# deep learning
import tensorflow as tf
import keras
from keras.utils import to_categorical
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense,Dropout,Embedding,LSTM
from keras.callbacks import EarlyStopping
from keras.losses import binary_crossentropy, categorical_crossentropy
from keras.optimizers import Adam
from keras.models import Sequential
import random
SEED = 100
tf.random.set_seed(SEED)
random.seed(SEED)
versions_dl = [(x.__name__,x.__version__) for x in [tf,keras]]
print(versions_dl)
[('tensorflow', '2.3.0'), ('keras', '2.4.3')]
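Only TensorFlow and Python's `random` module are seeded above; for a somewhat more reproducible run one could also pin NumPy and the interpreter's hash seed. A best-effort sketch (GPU ops can still be nondeterministic):

# a sketch: extra seeding for reproducibility; PYTHONHASHSEED only fully
# applies if set before the interpreter starts, so this is best-effort
import os
import numpy as np
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)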
import numpy as np
import pandas as pd
pd.set_option('max_colwidth',200)
pd.set_option('max_columns',200)
SEED = 100
from pprint import pprint
import time
import sys
import re
from tqdm import tqdm
tqdm.pandas()
# nlp
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
versions_ds = [(x.__name__,x.__version__) for x in [np,pd,nltk]]
pprint(versions_ds)
[('numpy', '1.18.5'), ('pandas', '1.0.5'), ('nltk', '3.2.5')]
target = 'label'
maincol = 'tweet'
p = 'https://github.com/bhishanpdl/Datasets/blob/master/AV_Hackathons/sentiment_analysis/processed/'
df_combined = pd.read_csv(p + 'df_combined_clean.csv?raw=true')
# the list columns were serialized as strings in the CSV; parse them back
# (ast.literal_eval is the safe alternative to eval for this)
from ast import literal_eval
df_combined['tweet_lst_clean'] = df_combined['tweet_lst_clean'].apply(literal_eval)
df_combined['tweet_lst_clean_emoji'] = df_combined['tweet_lst_clean_emoji'].apply(literal_eval)
df_train = df_combined[~df_combined[target].isnull()]
df_test = df_combined[df_combined[target].isnull()]
print(f"train : {df_train.shape}")
print(f"test : {df_test.shape}")
display(df_train.head(2).append(df_train.tail(2)))
train : (7920, 24)
test  : (1953, 24)
 | index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | total_length | num_words | num_sent | num_unique_words | num_words_title | num_uppercase | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_digits | avg_word_len | avg_uppercase | avg_unique | tweet_lst_clean_emoji | tweet_clean_emoji |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0.0 | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone] | fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone | ['#fingerprint', '#Pregnancy', '#android', '#apps', '#beautiful', '#cute', '#health', '#igers', '#iphoneonly', '#iphonesia', '#iphone'] | #fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | 128 | 13 | 1 | 13 | 2 | 5 | 0 | 0 | 2 | 0 | 0 | 8.923077 | 0.039062 | 1.0 | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone] | fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone |
1 | 1 | 2 | 0.0 | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ | [finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias] | finaly transparant silicon case thanks uncle yay sony xperia sonyexperias | ['#yay', '#Sony', '#Xperia', '#S', '#sonyexperias…'] | #yay #Sony #Xperia #S #sonyexperias… | 131 | 17 | 1 | 17 | 5 | 12 | 0 | 0 | 3 | 0 | 0 | 6.764706 | 0.091603 | 1.0 | [finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri] | finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri |
7918 | 7918 | 7919 | 0.0 | Finally got my #smart #pocket #wifi stay connected anytime,anywhere! #ipad and #samsung #s3 #gadget # http://instagr.am/p/U-53G_vJU8/ | [finaly, got, smart, pocket, wifi, stay, conected, anytimeanywhere, ipad, samsung, gadget] | finaly got smart pocket wifi stay conected anytimeanywhere ipad samsung gadget | ['#smart', '#pocket', '#wifi', '#ipad', '#samsung', '#s3', '#gadget', '#'] | #smart #pocket #wifi #ipad #samsung #s3 #gadget # | 133 | 16 | 1 | 16 | 1 | 5 | 1 | 0 | 3 | 0 | 0 | 7.375000 | 0.037594 | 1.0 | [finaly, got, smart, pocket, wi, fi, stay, conected, anytime, anywhere, ipad, samsung, gadget] | finaly got smart pocket wi fi stay conected anytime anywhere ipad samsung gadget |
7919 | 7919 | 7920 | 0.0 | Apple Barcelona!!! #Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew… http://instagram.com/p/wBApVzpCl3/ | [aple, barcelona, aple, store, bcn, barcelona, travel, iphone, selfie, fly, fun, cabincrew] | aple barcelona aple store bcn barcelona travel iphone selfie fly fun cabincrew | ['#Apple', '#Store', '#BCN', '#Barcelona', '#travel', '#iphone', '#selfie', '#fly', '#fun', '#cabincrew…'] | #Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew… | 129 | 13 | 1 | 13 | 5 | 12 | 3 | 0 | 2 | 0 | 0 | 9.000000 | 0.093023 | 1.0 | [aple, barcelona, aple, store, n, barcelona, travel, iphone, self, ie, fly, fun, cabin, crew] | aple barcelona aple store n barcelona travel iphone self ie fly fun cabin crew |
display(df_test.head(2).append(df_test.tail(2)))
 | index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | total_length | num_words | num_sent | num_unique_words | num_words_title | num_uppercase | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_digits | avg_word_len | avg_uppercase | avg_unique | tweet_lst_clean_emoji | tweet_clean_emoji |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7920 | 0 | 7921 | NaN | I hate the new #iphone upgrade. Won't let me download apps. #ugh #apple sucks | [hate, new, iphone, upgrade, wil, let, download, aps, ugh, aple, suck] | hate new iphone upgrade wil let download aps ugh aple suck | ['#iphone', '#ugh', '#apple'] | #iphone #ugh #apple | 77 | 14 | 1 | 14 | 1 | 2 | 0 | 0 | 2 | 0 | 0 | 4.571429 | 0.025974 | 1.000000 | [hate, new, iphone, upgrade, wil, let, download, aps, ugh, aple, suck] | hate new iphone upgrade wil let download aps ugh aple suck |
7921 | 1 | 7922 | NaN | currently shitting my fucking pants. #apple #iMac #cashmoney #raddest #swagswagswag http://instagr.am/p/UUIS0bIBZo/ | [curently, shiting, fucking, pant, aple, imac, cashmoney, radest, swagswag] | curently shiting fucking pant aple imac cashmoney radest swagswag | ['#apple', '#iMac', '#cashmoney', '#raddest', '#swagswagswag'] | #apple #iMac #cashmoney #raddest #swagswagswag | 115 | 11 | 1 | 11 | 0 | 8 | 0 | 0 | 3 | 0 | 0 | 9.545455 | 0.069565 | 1.000000 | [curently, shiting, fucking, pant, aple, imac, cash, money, rad, de, st, swag, wag, wag] | curently shiting fucking pant aple imac cash money rad de st swag wag wag |
9871 | 1951 | 9872 | NaN | @codeofinterest as i said #Adobe big time we may well as include #apple to | [codeofinterest, said, adobe, big, time, may, wel, include, aple] | codeofinterest said adobe big time may wel include aple | ['#Adobe', '#apple'] | #Adobe #apple | 74 | 14 | 1 | 13 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 4.357143 | 0.013514 | 0.928571 | [code, interest, said, adobe, big, time, may, wel, include, aple] | code interest said adobe big time may wel include aple |
9872 | 1952 | 9873 | NaN | Finally I got it .. thanx my father .. #Samsung #galaxy #s3 #gift #father #phone #new http://instagr.am/p/NoxkiPE | [finaly, got, thanx, father, samsung, galaxy, gift, father, phone, new] | finaly got thanx father samsung galaxy gift father phone new | ['#Samsung', '#galaxy', '#s3', '#gift', '#father', '#phone', '#new'] | #Samsung #galaxy #s3 #gift #father #phone #new | 113 | 17 | 1 | 16 | 3 | 6 | 0 | 0 | 6 | 0 | 0 | 5.705882 | 0.053097 | 0.941176 | [finaly, got, x, father, samsung, galaxy, gift, father, phone, new] | finaly got x father samsung galaxy gift father phone new |
from sklearn.model_selection import train_test_split
target = 'label'
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
df_train, df_train[target],
test_size=0.2, random_state=SEED, stratify=df_train[target])
y_train = ser_ytrain.to_numpy().ravel()
y_valid = ser_yvalid.to_numpy().ravel()
print(f"df_train : {df_train.shape}\n")
print(f"df_Xtrain : {df_Xtrain.shape}")
print(f"ser_ytrain : {ser_ytrain.shape}\n")
print(f"df_Xvalid : {df_Xvalid.shape}")
print(f"ser_yvalid : {ser_yvalid.shape}\n")
print(f"df_test : {df_test.shape}")
print(f"ser_ytest : This does not exist.")
df_Xtrain.head(2)
df_train : (7920, 24)

df_Xtrain : (6336, 24)
ser_ytrain : (6336,)

df_Xvalid : (1584, 24)
ser_yvalid : (1584,)

df_test : (1953, 24)
ser_ytest : This does not exist.
 | index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | total_length | num_words | num_sent | num_unique_words | num_words_title | num_uppercase | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_digits | avg_word_len | avg_uppercase | avg_unique | tweet_lst_clean_emoji | tweet_clean_emoji |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1257 | 1257 | 1258 | 0.0 | new iphone case. #guerlain #new #iphone #case #iphone4s #4s #pink #lapetiterobenoir it… http://instagram.com/p/YTHkSuNtTE/ | [new, iphone, case, guerlain, new, iphone, case, iphones, pink, lapetiterobenoir] | new iphone case guerlain new iphone case iphones pink lapetiterobenoir | ['#guerlain', '#new', '#iphone', '#case', '#iphone4s', '#4s', '#pink', '#lapetiterobenoir'] | #guerlain #new #iphone #case #iphone4s #4s #pink #lapetiterobenoir | 122 | 13 | 1 | 13 | 0 | 7 | 0 | 0 | 3 | 0 | 0 | 8.461538 | 0.057377 | 1.000000 | [new, iphone, case, gue, rla, new, iphone, case, iphone, pink, la, petite, robe, noir] | new iphone case gue rla new iphone case iphone pink la petite robe noir |
1400 | 1400 | 1401 | 0.0 | i really want an iPad for the sole reason that I just want one apple | [realy, want, ipad, sole, reason, want, one, aple] | realy want ipad sole reason want one aple | [] | NaN | 68 | 15 | 1 | 14 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 3.600000 | 0.029412 | 0.933333 | [realy, want, ipad, sole, reason, want, one, aple] | realy want ipad sole reason want one aple |
from sklearn import metrics
df_eval = pd.DataFrame({
'Text Model': [],
'Params': [],
'Model': [],
'Description': [],
'Accuracy': [],
'Precision': [],
'Recall': [],
'F1': [],
})
mycol = 'tweet_clean'
mylstcol = 'tweet_lst_clean'
X_train = list(df_Xtrain[mylstcol])
X_valid = list(df_Xvalid[mylstcol])
X_test = list(df_test[mylstcol])

# vocabulary size and longest tweet (in tokens) over the training split
unq_words = set()
maxlen = 0
for lst in tqdm(X_train):
    unq_words.update(lst)
    maxlen = max(maxlen, len(lst))

print(len(unq_words))
print(maxlen)
100%|██████████| 6336/6336 [00:00<00:00, 530780.34it/s]
14432
35
from keras.preprocessing.text import Tokenizer
num_words = len(unq_words)
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_valid = tokenizer.texts_to_sequences(X_valid)
X_test = tokenizer.texts_to_sequences(X_test)
type(X_train), X_train[0]
(list, [4, 1, 20, 4874, 4, 1, 20, 88, 136, 4875])
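One subtlety worth knowing before padding: `Tokenizer(num_words=k)` keeps only the `k-1` most frequent words in `texts_to_sequences` and silently drops the rest unless an `oov_token` is supplied, so with `num_words = len(unq_words)` the single rarest training word is filtered out. A toy sketch of that behavior (toy data, not the tweet corpus):

# toy sketch of Tokenizer's num_words cutoff
toy = [['apple', 'apple', 'banana'], ['banana', 'cherry']]

tok = Tokenizer(num_words=3)            # keeps word indices 1..2 only
tok.fit_on_texts(toy)
print(tok.texts_to_sequences(toy))      # 'cherry' is silently dropped

tok_oov = Tokenizer(num_words=4, oov_token='<OOV>')
tok_oov.fit_on_texts(toy)
print(tok_oov.texts_to_sequences(toy))  # rare words map to the OOV index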
from keras.preprocessing import sequence
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_valid = sequence.pad_sequences(X_valid, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print(X_train.shape,X_valid.shape,X_test.shape)
(6336, 35) (1584, 35) (1953, 35)
type(X_train)
numpy.ndarray
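`pad_sequences` pre-pads and pre-truncates by default, which suits LSTMs since the informative tokens then sit next to the final time steps. A toy sketch of the options:

# toy sketch of pad_sequences defaults vs post padding/truncation
toy_seqs = [[1, 2, 3], [4, 5, 6, 7, 8, 9]]
print(sequence.pad_sequences(toy_seqs, maxlen=5))
# [[0 0 1 2 3]
#  [5 6 7 8 9]]
print(sequence.pad_sequences(toy_seqs, maxlen=5, padding='post', truncating='post'))
# [[1 2 3 0 0]
#  [4 5 6 7 8]]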
neptune.create_experiment(
    name='lstm',  # keep the experiment name short
    description='',
    tags=['keras', 'lstm', 'nltk'],
    upload_source_files=None,
)
neptune.log_text('versions_dl', str(versions_dl))
neptune.log_text('versions_ds', str(versions_ds))
neptune.log_text('mycol', mycol)
neptune.log_text('text processing', 'tweet')
https://ui.neptune.ai/bhishanpdl/twitter-sentiment-analysis/e/TWITSENT-17
from keras.callbacks import EarlyStopping
from neptunecontrib.monitoring.keras import NeptuneMonitor
# TF2 logs the validation metric as 'val_accuracy'; monitoring 'val_acc'
# is silently unavailable and early stopping never triggers
early_stopping = EarlyStopping(min_delta=0.001, mode='max',
                               monitor='val_accuracy', patience=10)
callbacks = [early_stopping, NeptuneMonitor()]
# hyperparameters
# NOTE: 'momentum' is logged for reference only; Adam does not take it
PARAMS = {'epoch_nr': 5,
          'batch_size': 256,
          'lr': 0.005,
          'momentum': 0.4,
          'dropout': 0.05}

for k, v in PARAMS.items():
    neptune.log_metric(k, v)
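An alternative (a sketch using the legacy neptune-client API) is to pass the whole dict when creating the experiment, so the values land in the Parameters tab instead of being logged as metrics:

# sketch: pass hyperparameters at creation time instead of log_metric calls;
# this would replace the create_experiment call above
neptune.create_experiment(
    name='lstm',
    params=PARAMS,
    tags=['keras', 'lstm', 'nltk'],
)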
model = Sequential()

# Embedding: input_dim=num_words, output_dim=300
model.add(Embedding(num_words, 300, input_length=maxlen))
model.add(LSTM(units=128,
               dropout=PARAMS['dropout'],
               recurrent_dropout=PARAMS['dropout'],
               return_sequences=True))
model.add(LSTM(64,
               dropout=PARAMS['dropout'],
               recurrent_dropout=PARAMS['dropout'],
               return_sequences=False))
model.add(Dense(100, activation='relu'))
model.add(Dropout(PARAMS['dropout']))
model.add(Dense(1, activation='sigmoid'))

# for multiclass: Dense(num_classes, activation='softmax') with
# loss='sparse_categorical_crossentropy'
model.compile(loss='binary_crossentropy',
              optimizer=Adam(lr=PARAMS['lr']),
              metrics=['accuracy'])
model.summary()
WARNING:tensorflow:Layer lstm will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU
WARNING:tensorflow:Layer lstm_1 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
embedding (Embedding)        (None, 35, 300)           4329600
_________________________________________________________________
lstm (LSTM)                  (None, 35, 128)           219648
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                49408
_________________________________________________________________
dense (Dense)                (None, 100)               6500
_________________________________________________________________
dropout (Dropout)            (None, 100)               0
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101
=================================================================
Total params: 4,605,257
Trainable params: 4,605,257
Non-trainable params: 0
_________________________________________________________________
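The two warnings come from `recurrent_dropout`: TF2's fused cuDNN LSTM kernel only engages when `recurrent_dropout=0` (and the other LSTM arguments stay at their defaults), so these layers fall back to the much slower generic kernel. A sketch of a cuDNN-eligible variant (not the model trained below) that trades recurrent dropout for plain Dropout layers:

# sketch: cuDNN-eligible LSTM stack; dropout moved between layers
model_fast = Sequential()
model_fast.add(Embedding(num_words, 300, input_length=maxlen))
model_fast.add(LSTM(128, return_sequences=True))  # no recurrent_dropout
model_fast.add(Dropout(PARAMS['dropout']))
model_fast.add(LSTM(64))                          # no recurrent_dropout
model_fast.add(Dropout(PARAMS['dropout']))
model_fast.add(Dense(100, activation='relu'))
model_fast.add(Dense(1, activation='sigmoid'))
model_fast.compile(loss='binary_crossentropy',
                   optimizer=Adam(lr=PARAMS['lr']),
                   metrics=['accuracy'])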
%%time
history = model.fit(X_train, y_train,
validation_data=(X_valid, y_valid),
epochs=PARAMS['epoch_nr'],
batch_size=PARAMS['batch_size'],
verbose=1,
callbacks=callbacks
)
Epoch 1/5
25/25 [==============================] - 9s 372ms/step - loss: 0.4243 - accuracy: 0.8093 - val_loss: 0.2776 - val_accuracy: 0.8725
Epoch 2/5
25/25 [==============================] - 8s 319ms/step - loss: 0.1593 - accuracy: 0.9388 - val_loss: 0.2852 - val_accuracy: 0.8756
Epoch 3/5
25/25 [==============================] - 8s 314ms/step - loss: 0.0765 - accuracy: 0.9740 - val_loss: 0.3776 - val_accuracy: 0.8744
Epoch 4/5
25/25 [==============================] - 8s 313ms/step - loss: 0.0382 - accuracy: 0.9880 - val_loss: 0.4319 - val_accuracy: 0.8611
Epoch 5/5
25/25 [==============================] - 8s 318ms/step - loss: 0.0203 - accuracy: 0.9942 - val_loss: 0.6384 - val_accuracy: 0.8605
CPU times: user 1min 4s, sys: 6.96 s, total: 1min 11s
Wall time: 49 s
# Sequential.predict_classes is deprecated in TF2; for a sigmoid output,
# threshold the predicted probabilities at 0.5 instead
valid_preds = (model.predict(X_valid) > 0.5).astype('int32')
valid_preds = valid_preds.squeeze().tolist()
text_model_name = "keras_lstm"
params = str(PARAMS)
model_name = ""
desc = ""
yvd = y_valid
vd_preds = valid_preds
acc = metrics.accuracy_score(yvd, vd_preds)
pre = metrics.precision_score(yvd, vd_preds)  # binary (positive class)
rec = metrics.recall_score(yvd, vd_preds)     # binary (positive class)
f1 = metrics.f1_score(yvd, vd_preds, average='weighted')  # weighted, unlike pre/rec
row = [text_model_name, params, model_name,desc]
row = row + [acc, pre, rec, f1]
df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates(subset=['Text Model', 'Params', 'Model', 'Description'])
df_eval
 | Text Model | Params | Model | Description | Accuracy | Precision | Recall | F1 |
---|---|---|---|---|---|---|---|---|
0 | keras_lstm | {'epoch_nr': 5, 'batch_size': 256, 'lr': 0.005, 'momentum': 0.4, 'dropout': 0.05} | | | 0.86048 | 0.731156 | 0.718519 | 0.860078 |
from neptunecontrib.api import log_table
log_table('df_eval', df_eval)
dic_results = {'acc':acc,
'precision':pre,
'recall':rec,
'f1': f1}
for k,v in dic_results.items():
print('valid_'+k, v)
neptune.log_metric('valid_'+k, v)
valid_acc 0.860479797979798
valid_precision 0.7311557788944724
valid_recall 0.7185185185185186
valid_f1 0.8600781556478218
# same deprecation fix as for the validation set
test_preds = (model.predict(X_test) > 0.5).astype('int32')
test_preds = test_preds.squeeze().tolist()
test_preds[:5]
[1, 1, 1, 1, 1]
# df_test is a slice of df_combined; write to a copy to avoid a
# SettingWithCopyWarning
df_test = df_test.copy()
df_test[target] = test_preds
df_sub = df_test[['id', 'label']]
df_sub.to_csv('sub_lstm.csv', index=False)
display(df_sub.head())
 | id | label |
---|---|---|
7920 | 7921 | 1 |
7921 | 7922 | 1 |
7922 | 7923 | 1 |
7923 | 7924 | 1 |
7924 | 7925 | 1 |
# after uploading sub_lstm.csv to the hackathon, log the returned test score
neptune.log_metric('test_f1', 0.837852647394422)
best_so_far = """
lr = 0.005
f1 = 0.837852647394422
"""
neptune.log_text('best_so_far',
best_so_far)
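The legacy Neptune client keeps the experiment open until it is stopped explicitly, so once everything is logged:

# mark the Neptune experiment as finished (legacy neptune-client API)
neptune.stop()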