%%capture
# %%capture suppresses this cell's output in the notebook
import os
import sys

ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install -q git+https://github.com/amaiya/eli5@tfkeras_0_10_1
    !pip install -q neptune-client neptune-contrib
    !pip install -q scikit-plot

    ## print
    print('Environment: Google Colaboratory.')

# NOTE: after updating modules in Colab, restart the runtime.
Building wheel for eli5 (setup.py) ... done
Building wheel for neptune-client (setup.py) ... done
Building wheel for neptune-contrib (setup.py) ... done
Building wheel for future (setup.py) ... done
Building wheel for msgpack-python (setup.py) ... done
Building wheel for strict-rfc3339 (setup.py) ... done
Environment: Google Colaboratory.
import neptune
from neptunecontrib.api import log_table
from neptunecontrib.monitoring.keras import NeptuneMonitor
# use your real key and DELETE the cell
# neptune.init('bhishanpdl/twitter-sentiment-analysis','your_api_key')
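Rather than pasting the key inline and deleting the cell, a safer pattern (a sketch, not from the original notebook) is to read it from an environment variable so the notebook can be shared without leaking credentials:

# a sketch: read the Neptune API token from an environment variable
# (NEPTUNE_API_TOKEN is an assumed name; export it before running)
api_token = os.environ.get('NEPTUNE_API_TOKEN')
if api_token:
    neptune.init('bhishanpdl/twitter-sentiment-analysis', api_token=api_token)
else:
    print('Set NEPTUNE_API_TOKEN to enable Neptune logging.')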
# deep learning
import tensorflow as tf
import keras
from keras.utils import to_categorical
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense,Dropout,Embedding,LSTM
from keras.callbacks import EarlyStopping
from keras.losses import binary_crossentropy, categorical_crossentropy
from keras.optimizers import Adam
from keras.models import Sequential
import random
SEED = 100
tf.random.set_seed(SEED)
random.seed(SEED)
versions_dl = [(x.__name__,x.__version__) for x in [tf,keras]]
print(versions_dl)
[('tensorflow', '2.3.0'), ('keras', '2.4.3')]
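Only TensorFlow and Python's `random` module are seeded above; for a somewhat more reproducible run one could also pin NumPy and the interpreter's hash seed. A best-effort sketch (GPU ops can still be nondeterministic):

# a sketch: extra seeding for reproducibility; PYTHONHASHSEED only fully
# applies if set before the interpreter starts, so this is best-effort
import os
import numpy as np
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)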
import numpy as np
import pandas as pd
pd.set_option('max_colwidth',200)
pd.set_option('max_columns',200)
SEED = 100
from pprint import pprint
import time
import sys
import re
from tqdm import tqdm
tqdm.pandas()
# nlp
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
versions_ds = [(x.__name__,x.__version__) for x in [np,pd,nltk]]
pprint(versions_ds)
[('numpy', '1.18.5'), ('pandas', '1.0.5'), ('nltk', '3.2.5')]
target = 'label'
maincol = 'tweet'
p = 'https://github.com/bhishanpdl/Datasets/blob/master/AV_Hackathons/sentiment_analysis/processed/'
df_combined = pd.read_csv(p + 'df_combined_clean.csv?raw=true')
# the list columns were serialized as strings in the CSV; parse them back
# (ast.literal_eval is the safe alternative to eval for this)
from ast import literal_eval
df_combined['tweet_lst_clean'] = df_combined['tweet_lst_clean'].apply(literal_eval)
df_combined['tweet_lst_clean_emoji'] = df_combined['tweet_lst_clean_emoji'].apply(literal_eval)
df_train = df_combined[~df_combined[target].isnull()]
df_test = df_combined[df_combined[target].isnull()]
print(f"train : {df_train.shape}")
print(f"test : {df_test.shape}")
display(df_train.head(2).append(df_train.tail(2)))
train : (7920, 24)
test  : (1953, 24)
 | index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | total_length | num_words | num_sent | num_unique_words | num_words_title | num_uppercase | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_digits | avg_word_len | avg_uppercase | avg_unique | tweet_lst_clean_emoji | tweet_clean_emoji |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0.0 | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone] | fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone | ['#fingerprint', '#Pregnancy', '#android', '#apps', '#beautiful', '#cute', '#health', '#igers', '#iphoneonly', '#iphonesia', '#iphone'] | #fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | 128 | 13 | 1 | 13 | 2 | 5 | 0 | 0 | 2 | 0 | 0 | 8.923077 | 0.039062 | 1.0 | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone] | fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone |
1 | 1 | 2 | 0.0 | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ | [finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias] | finaly transparant silicon case thanks uncle yay sony xperia sonyexperias | ['#yay', '#Sony', '#Xperia', '#S', '#sonyexperias…'] | #yay #Sony #Xperia #S #sonyexperias… | 131 | 17 | 1 | 17 | 5 | 12 | 0 | 0 | 3 | 0 | 0 | 6.764706 | 0.091603 | 1.0 | [finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri] | finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri |
7918 | 7918 | 7919 | 0.0 | Finally got my #smart #pocket #wifi stay connected anytime,anywhere! #ipad and #samsung #s3 #gadget # http://instagr.am/p/U-53G_vJU8/ | [finaly, got, smart, pocket, wifi, stay, conected, anytimeanywhere, ipad, samsung, gadget] | finaly got smart pocket wifi stay conected anytimeanywhere ipad samsung gadget | ['#smart', '#pocket', '#wifi', '#ipad', '#samsung', '#s3', '#gadget', '#'] | #smart #pocket #wifi #ipad #samsung #s3 #gadget # | 133 | 16 | 1 | 16 | 1 | 5 | 1 | 0 | 3 | 0 | 0 | 7.375000 | 0.037594 | 1.0 | [finaly, got, smart, pocket, wi, fi, stay, conected, anytime, anywhere, ipad, samsung, gadget] | finaly got smart pocket wi fi stay conected anytime anywhere ipad samsung gadget |
7919 | 7919 | 7920 | 0.0 | Apple Barcelona!!! #Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew… http://instagram.com/p/wBApVzpCl3/ | [aple, barcelona, aple, store, bcn, barcelona, travel, iphone, selfie, fly, fun, cabincrew] | aple barcelona aple store bcn barcelona travel iphone selfie fly fun cabincrew | ['#Apple', '#Store', '#BCN', '#Barcelona', '#travel', '#iphone', '#selfie', '#fly', '#fun', '#cabincrew…'] | #Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew… | 129 | 13 | 1 | 13 | 5 | 12 | 3 | 0 | 2 | 0 | 0 | 9.000000 | 0.093023 | 1.0 | [aple, barcelona, aple, store, n, barcelona, travel, iphone, self, ie, fly, fun, cabin, crew] | aple barcelona aple store n barcelona travel iphone self ie fly fun cabin crew |
display(df_test.head(2).append(df_test.tail(2)))
 | index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | total_length | num_words | num_sent | num_unique_words | num_words_title | num_uppercase | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_digits | avg_word_len | avg_uppercase | avg_unique | tweet_lst_clean_emoji | tweet_clean_emoji |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7920 | 0 | 7921 | NaN | I hate the new #iphone upgrade. Won't let me download apps. #ugh #apple sucks | [hate, new, iphone, upgrade, wil, let, download, aps, ugh, aple, suck] | hate new iphone upgrade wil let download aps ugh aple suck | ['#iphone', '#ugh', '#apple'] | #iphone #ugh #apple | 77 | 14 | 1 | 14 | 1 | 2 | 0 | 0 | 2 | 0 | 0 | 4.571429 | 0.025974 | 1.000000 | [hate, new, iphone, upgrade, wil, let, download, aps, ugh, aple, suck] | hate new iphone upgrade wil let download aps ugh aple suck |
7921 | 1 | 7922 | NaN | currently shitting my fucking pants. #apple #iMac #cashmoney #raddest #swagswagswag http://instagr.am/p/UUIS0bIBZo/ | [curently, shiting, fucking, pant, aple, imac, cashmoney, radest, swagswag] | curently shiting fucking pant aple imac cashmoney radest swagswag | ['#apple', '#iMac', '#cashmoney', '#raddest', '#swagswagswag'] | #apple #iMac #cashmoney #raddest #swagswagswag | 115 | 11 | 1 | 11 | 0 | 8 | 0 | 0 | 3 | 0 | 0 | 9.545455 | 0.069565 | 1.000000 | [curently, shiting, fucking, pant, aple, imac, cash, money, rad, de, st, swag, wag, wag] | curently shiting fucking pant aple imac cash money rad de st swag wag wag |
9871 | 1951 | 9872 | NaN | @codeofinterest as i said #Adobe big time we may well as include #apple to | [codeofinterest, said, adobe, big, time, may, wel, include, aple] | codeofinterest said adobe big time may wel include aple | ['#Adobe', '#apple'] | #Adobe #apple | 74 | 14 | 1 | 13 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 4.357143 | 0.013514 | 0.928571 | [code, interest, said, adobe, big, time, may, wel, include, aple] | code interest said adobe big time may wel include aple |
9872 | 1952 | 9873 | NaN | Finally I got it .. thanx my father .. #Samsung #galaxy #s3 #gift #father #phone #new http://instagr.am/p/NoxkiPE | [finaly, got, thanx, father, samsung, galaxy, gift, father, phone, new] | finaly got thanx father samsung galaxy gift father phone new | ['#Samsung', '#galaxy', '#s3', '#gift', '#father', '#phone', '#new'] | #Samsung #galaxy #s3 #gift #father #phone #new | 113 | 17 | 1 | 16 | 3 | 6 | 0 | 0 | 6 | 0 | 0 | 5.705882 | 0.053097 | 0.941176 | [finaly, got, x, father, samsung, galaxy, gift, father, phone, new] | finaly got x father samsung galaxy gift father phone new |
from sklearn.model_selection import train_test_split
target = 'label'
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
df_train, df_train[target],
test_size=0.2, random_state=SEED, stratify=df_train[target])
y_train = ser_ytrain.to_numpy().ravel()
y_valid = ser_yvalid.to_numpy().ravel()
print(f"df_train : {df_train.shape}\n")
print(f"df_Xtrain : {df_Xtrain.shape}")
print(f"ser_ytrain : {ser_ytrain.shape}\n")
print(f"df_Xvalid : {df_Xvalid.shape}")
print(f"ser_yvalid : {ser_yvalid.shape}\n")
print(f"df_test : {df_test.shape}")
print(f"ser_ytest : This does not exist.")
df_Xtrain.head(2)
df_train : (7920, 24)

df_Xtrain : (6336, 24)
ser_ytrain : (6336,)

df_Xvalid : (1584, 24)
ser_yvalid : (1584,)

df_test : (1953, 24)
ser_ytest : This does not exist.
 | index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | total_length | num_words | num_sent | num_unique_words | num_words_title | num_uppercase | num_exclamation_marks | num_question_marks | num_punctuation | num_symbols | num_digits | avg_word_len | avg_uppercase | avg_unique | tweet_lst_clean_emoji | tweet_clean_emoji |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1257 | 1257 | 1258 | 0.0 | new iphone case. #guerlain #new #iphone #case #iphone4s #4s #pink #lapetiterobenoir it… http://instagram.com/p/YTHkSuNtTE/ | [new, iphone, case, guerlain, new, iphone, case, iphones, pink, lapetiterobenoir] | new iphone case guerlain new iphone case iphones pink lapetiterobenoir | ['#guerlain', '#new', '#iphone', '#case', '#iphone4s', '#4s', '#pink', '#lapetiterobenoir'] | #guerlain #new #iphone #case #iphone4s #4s #pink #lapetiterobenoir | 122 | 13 | 1 | 13 | 0 | 7 | 0 | 0 | 3 | 0 | 0 | 8.461538 | 0.057377 | 1.000000 | [new, iphone, case, gue, rla, new, iphone, case, iphone, pink, la, petite, robe, noir] | new iphone case gue rla new iphone case iphone pink la petite robe noir |
1400 | 1400 | 1401 | 0.0 | i really want an iPad for the sole reason that I just want one apple | [realy, want, ipad, sole, reason, want, one, aple] | realy want ipad sole reason want one aple | [] | NaN | 68 | 15 | 1 | 14 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 3.600000 | 0.029412 | 0.933333 | [realy, want, ipad, sole, reason, want, one, aple] | realy want ipad sole reason want one aple |
from sklearn import metrics
df_eval = pd.DataFrame({
'Text Model': [],
'Params': [],
'Model': [],
'Description': [],
'Accuracy': [],
'Precision': [],
'Recall': [],
'F1': [],
})
mycol = 'tweet_clean'
mylstcol = 'tweet_lst_clean'
X_train = list(df_Xtrain[mylstcol])
X_valid = list(df_Xvalid[mylstcol])
X_test = list(df_test[mylstcol])

# vocabulary size and longest tweet (in tokens) over the training split
unq_words = set()
maxlen = 0
for lst in tqdm(X_train):
    unq_words.update(lst)
    maxlen = max(maxlen, len(lst))

print(len(unq_words))
print(maxlen)
100%|██████████| 6336/6336 [00:00<00:00, 530780.34it/s]
14432
35
from keras.preprocessing.text import Tokenizer
num_words = len(unq_words)
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_valid = tokenizer.texts_to_sequences(X_valid)
X_test = tokenizer.texts_to_sequences(X_test)
type(X_train), X_train[0]
(list, [4, 1, 20, 4874, 4, 1, 20, 88, 136, 4875])
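One subtlety worth knowing before padding: `Tokenizer(num_words=k)` keeps only the `k-1` most frequent words in `texts_to_sequences` and silently drops the rest unless an `oov_token` is supplied, so with `num_words = len(unq_words)` the single rarest training word is filtered out. A toy sketch of that behavior (toy data, not the tweet corpus):

# toy sketch of Tokenizer's num_words cutoff
toy = [['apple', 'apple', 'banana'], ['banana', 'cherry']]

tok = Tokenizer(num_words=3)            # keeps word indices 1..2 only
tok.fit_on_texts(toy)
print(tok.texts_to_sequences(toy))      # 'cherry' is silently dropped

tok_oov = Tokenizer(num_words=4, oov_token='<OOV>')
tok_oov.fit_on_texts(toy)
print(tok_oov.texts_to_sequences(toy))  # rare words map to the OOV index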
from keras.preprocessing import sequence
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_valid = sequence.pad_sequences(X_valid, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print(X_train.shape,X_valid.shape,X_test.shape)
(6336, 35) (1584, 35) (1953, 35)
type(X_train)
numpy.ndarray
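`pad_sequences` pre-pads and pre-truncates by default, which suits LSTMs since the informative tokens then sit next to the final time steps. A toy sketch of the options:

# toy sketch of pad_sequences defaults vs post padding/truncation
toy_seqs = [[1, 2, 3], [4, 5, 6, 7, 8, 9]]
print(sequence.pad_sequences(toy_seqs, maxlen=5))
# [[0 0 1 2 3]
#  [5 6 7 8 9]]
print(sequence.pad_sequences(toy_seqs, maxlen=5, padding='post', truncating='post'))
# [[1 2 3 0 0]
#  [4 5 6 7 8]]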
neptune.create_experiment(
    name='lstm',  # keep the experiment name short
    description='',
    tags=['keras', 'lstm', 'nltk'],
    upload_source_files=None,
)
neptune.log_text('versions_dl', str(versions_dl))
neptune.log_text('versions_ds', str(versions_ds))
neptune.log_text('mycol', mycol)
neptune.log_text('text processing', 'tweet')
https://ui.neptune.ai/bhishanpdl/twitter-sentiment-analysis/e/TWITSENT-17
from keras.callbacks import EarlyStopping
from neptunecontrib.monitoring.keras import NeptuneMonitor
# TF2 logs the validation metric as 'val_accuracy'; monitoring 'val_acc'
# is silently unavailable and early stopping never triggers
early_stopping = EarlyStopping(min_delta=0.001, mode='max',
                               monitor='val_accuracy', patience=10)
callbacks = [early_stopping, NeptuneMonitor()]
# hyperparameters
# NOTE: 'momentum' is logged for reference only; Adam does not take it
PARAMS = {'epoch_nr': 5,
          'batch_size': 256,
          'lr': 0.005,
          'momentum': 0.4,
          'dropout': 0.05}

for k, v in PARAMS.items():
    neptune.log_metric(k, v)
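An alternative (a sketch using the legacy neptune-client API) is to pass the whole dict when creating the experiment, so the values land in the Parameters tab instead of being logged as metrics:

# sketch: pass hyperparameters at creation time instead of log_metric calls;
# this would replace the create_experiment call above
neptune.create_experiment(
    name='lstm',
    params=PARAMS,
    tags=['keras', 'lstm', 'nltk'],
)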
model = Sequential()

# Embedding: input_dim=num_words, output_dim=300
model.add(Embedding(num_words, 300, input_length=maxlen))
model.add(LSTM(units=128,
               dropout=PARAMS['dropout'],
               recurrent_dropout=PARAMS['dropout'],
               return_sequences=True))
model.add(LSTM(64,
               dropout=PARAMS['dropout'],
               recurrent_dropout=PARAMS['dropout'],
               return_sequences=False))
model.add(Dense(100, activation='relu'))
model.add(Dropout(PARAMS['dropout']))
model.add(Dense(1, activation='sigmoid'))

# for multiclass: Dense(num_classes, activation='softmax') with
# loss='sparse_categorical_crossentropy'
model.compile(loss='binary_crossentropy',
              optimizer=Adam(lr=PARAMS['lr']),
              metrics=['accuracy'])
model.summary()
WARNING:tensorflow:Layer lstm will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU
WARNING:tensorflow:Layer lstm_1 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
embedding (Embedding)        (None, 35, 300)           4329600
_________________________________________________________________
lstm (LSTM)                  (None, 35, 128)           219648
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                49408
_________________________________________________________________
dense (Dense)                (None, 100)               6500
_________________________________________________________________
dropout (Dropout)            (None, 100)               0
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101
=================================================================
Total params: 4,605,257
Trainable params: 4,605,257
Non-trainable params: 0
_________________________________________________________________
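The two warnings come from `recurrent_dropout`: TF2's fused cuDNN LSTM kernel only engages when `recurrent_dropout=0` (and the other LSTM arguments stay at their defaults), so these layers fall back to the much slower generic kernel. A sketch of a cuDNN-eligible variant (not the model trained below) that trades recurrent dropout for plain Dropout layers:

# sketch: cuDNN-eligible LSTM stack; dropout moved between layers
model_fast = Sequential()
model_fast.add(Embedding(num_words, 300, input_length=maxlen))
model_fast.add(LSTM(128, return_sequences=True))  # no recurrent_dropout
model_fast.add(Dropout(PARAMS['dropout']))
model_fast.add(LSTM(64))                          # no recurrent_dropout
model_fast.add(Dropout(PARAMS['dropout']))
model_fast.add(Dense(100, activation='relu'))
model_fast.add(Dense(1, activation='sigmoid'))
model_fast.compile(loss='binary_crossentropy',
                   optimizer=Adam(lr=PARAMS['lr']),
                   metrics=['accuracy'])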
%%time
history = model.fit(X_train, y_train,
validation_data=(X_valid, y_valid),
epochs=PARAMS['epoch_nr'],
batch_size=PARAMS['batch_size'],
verbose=1,
callbacks=callbacks
)
Epoch 1/5
25/25 [==============================] - 9s 372ms/step - loss: 0.4243 - accuracy: 0.8093 - val_loss: 0.2776 - val_accuracy: 0.8725
Epoch 2/5
25/25 [==============================] - 8s 319ms/step - loss: 0.1593 - accuracy: 0.9388 - val_loss: 0.2852 - val_accuracy: 0.8756
Epoch 3/5
25/25 [==============================] - 8s 314ms/step - loss: 0.0765 - accuracy: 0.9740 - val_loss: 0.3776 - val_accuracy: 0.8744
Epoch 4/5
25/25 [==============================] - 8s 313ms/step - loss: 0.0382 - accuracy: 0.9880 - val_loss: 0.4319 - val_accuracy: 0.8611
Epoch 5/5
25/25 [==============================] - 8s 318ms/step - loss: 0.0203 - accuracy: 0.9942 - val_loss: 0.6384 - val_accuracy: 0.8605
CPU times: user 1min 4s, sys: 6.96 s, total: 1min 11s
Wall time: 49 s
# Sequential.predict_classes is deprecated in TF2; for a sigmoid output,
# threshold the predicted probabilities at 0.5 instead
valid_preds = (model.predict(X_valid) > 0.5).astype('int32')
valid_preds = valid_preds.squeeze().tolist()
text_model_name = "keras_lstm"
params = str(PARAMS)
model_name = ""
desc = ""
yvd = y_valid
vd_preds = valid_preds
acc = metrics.accuracy_score(yvd, vd_preds)
pre = metrics.precision_score(yvd, vd_preds)  # binary (positive class)
rec = metrics.recall_score(yvd, vd_preds)     # binary (positive class)
f1 = metrics.f1_score(yvd, vd_preds, average='weighted')  # weighted, unlike pre/rec
row = [text_model_name, params, model_name,desc]
row = row + [acc, pre, rec, f1]
df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates(subset=['Text Model', 'Params', 'Model', 'Description'])
df_eval
 | Text Model | Params | Model | Description | Accuracy | Precision | Recall | F1 |
---|---|---|---|---|---|---|---|---|
0 | keras_lstm | {'epoch_nr': 5, 'batch_size': 256, 'lr': 0.005, 'momentum': 0.4, 'dropout': 0.05} | | | 0.86048 | 0.731156 | 0.718519 | 0.860078 |
from neptunecontrib.api import log_table
log_table('df_eval', df_eval)
dic_results = {'acc':acc,
'precision':pre,
'recall':rec,
'f1': f1}
for k,v in dic_results.items():
print('valid_'+k, v)
neptune.log_metric('valid_'+k, v)
valid_acc 0.860479797979798
valid_precision 0.7311557788944724
valid_recall 0.7185185185185186
valid_f1 0.8600781556478218
# same deprecation fix as for the validation set
test_preds = (model.predict(X_test) > 0.5).astype('int32')
test_preds = test_preds.squeeze().tolist()
test_preds[:5]
[1, 1, 1, 1, 1]
# df_test is a slice of df_combined; write to a copy to avoid a
# SettingWithCopyWarning
df_test = df_test.copy()
df_test[target] = test_preds
df_sub = df_test[['id', 'label']]
df_sub.to_csv('sub_lstm.csv', index=False)
display(df_sub.head())
 | id | label |
---|---|---|
7920 | 7921 | 1 |
7921 | 7922 | 1 |
7922 | 7923 | 1 |
7923 | 7924 | 1 |
7924 | 7925 | 1 |
# after uploading sub_lstm.csv to the hackathon, log the returned test score
neptune.log_metric('test_f1', 0.837852647394422)
best_so_far = """
lr = 0.005
f1 = 0.837852647394422
"""
neptune.log_text('best_so_far',
best_so_far)
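The legacy Neptune client keeps the experiment open until it is stopped explicitly, so once everything is logged:

# mark the Neptune experiment as finished (legacy neptune-client API)
neptune.stop()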