%%capture
# capture will not print in notebook

import os
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    ## install modules
    !pip install -q git+https://github.com/amaiya/eli5@tfkeras_0_10_1
    !pip install -q neptune-client neptune-contrib
    !pip install -q scikit-plot

    ## print
    print('Environment: Google Colaboratory.')

# NOTE: If we update modules in gcolab, we need to restart runtime.


import neptune
from neptunecontrib.api import log_table
from neptunecontrib.monitoring.keras import NeptuneMonitor


# use your real key and DELETE the cell

# neptune.init('bhishanpdl/twitter-sentiment-analysis','your_api_key')


# deep learning
import tensorflow as tf
import keras

from keras.utils import to_categorical
from keras.losses import  binary_crossentropy, categorical_crossentropy
from keras.optimizers import Adam
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint, TensorBoard, Callback, EarlyStopping

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Dense,Flatten
from keras.layers import Embedding
from keras.layers import Dropout,SpatialDropout1D
from keras.layers import LSTM,GRU,Bidirectional
from keras.layers import Conv1D
from keras.layers import MaxPooling1D,GlobalMaxPooling1D

import random
SEED = 100
tf.random.set_seed(SEED)
random.seed(SEED)

versions_dl = [(x.__name__,x.__version__) for x in [tf,keras]]
print(versions_dl)

[('tensorflow', '2.3.0'), ('keras', '2.4.3')]


import numpy as np
import pandas as pd

pd.set_option('max_colwidth',200)
pd.set_option('max_columns',200)
SEED = 100

from pprint import pprint
import time
import sys
import re
from tqdm import tqdm
tqdm.pandas()

# nlp
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


versions_ds = [(x.__name__,x.__version__) for x in [np,pd,nltk]]
pprint(versions_ds)

[('numpy', '1.18.5'), ('pandas', '1.0.5'), ('nltk', '3.2.5')]


from sklearn import metrics

df_eval = pd.DataFrame({
    'Text Model': [],
    'Params': [],
    'Model': [],
    'Description': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1': [],
})


target = 'label'
maincol = 'tweet'

p = 'https://github.com/bhishanpdl/Datasets/blob/master/AV_Hackathons/sentiment_analysis/processed/'
df_combined = pd.read_csv(p + 'df_combined_clean.csv?raw=true')

# we must convert list columns
df_combined['tweet_lst_clean'] = df_combined['tweet_lst_clean'].apply(eval)
df_combined['tweet_lst_clean_emoji'] = df_combined['tweet_lst_clean_emoji'].apply(eval)

df_train = df_combined[~df_combined[target].isnull()]
df_test = df_combined[df_combined[target].isnull()]

print(f"train : {df_train.shape}")
print(f"test : {df_test.shape}")
display(df_train.head(2).append(df_train.tail(2)))

train : (7920, 24)
test : (1953, 24)


display(df_test.head(2).append(df_test.tail(2)))


from sklearn.model_selection import train_test_split

target = 'label'

df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
    df_train, df_train[target],
    test_size=0.2, random_state=SEED, stratify=df_train[target])

y_train = ser_ytrain.to_numpy().ravel()
y_valid = ser_yvalid.to_numpy().ravel()

print(f"df_train   : {df_train.shape}\n")

print(f"df_Xtrain  : {df_Xtrain.shape}")
print(f"ser_ytrain : {ser_ytrain.shape}\n")

print(f"df_Xvalid  : {df_Xvalid.shape}")
print(f"ser_yvalid : {ser_yvalid.shape}\n")

print(f"df_test    : {df_test.shape}")
print(f"ser_ytest  : This does not exist.")

df_Xtrain.head(2)

df_train   : (7920, 24)

df_Xtrain  : (6336, 24)
ser_ytrain : (6336,)

df_Xvalid  : (1584, 24)
ser_yvalid : (1584,)

df_test    : (1953, 24)
ser_ytest  : This does not exist.


from sklearn import metrics

df_eval = pd.DataFrame({
    'Text Model': [],
    'Params': [],
    'Model': [],
    'Description': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1': [],
})


mycol = 'tweet_clean'
mylstcol = 'tweet_lst_clean'


X_train = [i for i in df_Xtrain[mylstcol]]
X_valid = [i for i in df_Xvalid[mylstcol]]
X_test = [i for i in df_test[mylstcol]]


unq_words = set()
maxlen = 0

for lst in tqdm(X_train):
    unq_words.update(lst)
    maxlen = len(lst) if maxlen < len(lst) else maxlen

print(len(list(unq_words)))
print(maxlen)

100%|██████████| 6336/6336 [00:00<00:00, 782518.48it/s]

14432
35


from keras.preprocessing.text import Tokenizer

num_words = len(list(unq_words))
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)


X_train = tokenizer.texts_to_sequences(X_train)
X_valid = tokenizer.texts_to_sequences(X_valid)
X_test = tokenizer.texts_to_sequences(X_test)


type(X_train), X_train[0]

(list, [4, 1, 20, 4874, 4, 1, 20, 88, 136, 4875])


from keras.preprocessing import sequence

X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_valid = sequence.pad_sequences(X_valid, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

print(X_train.shape,X_valid.shape,X_test.shape)

(6336, 35) (1584, 35) (1953, 35)


type(X_train)

numpy.ndarray


neptune.create_experiment(
    name='gru_bgru', # put small name
    description='',
    tags = ['keras', 'gru','bgru'],
    upload_source_files=None
)

neptune.log_text('versions_dl', str(versions_dl))
neptune.log_text('versions_ds', str(versions_ds))

neptune.log_text('mycol', mycol)
neptune.log_text('text processing', 'tweet')

https://ui.neptune.ai/bhishanpdl/twitter-sentiment-analysis/e/TWITSENT-18


from keras.callbacks import EarlyStopping
from neptunecontrib.monitoring.keras import NeptuneMonitor

early_stopping = EarlyStopping(min_delta = 0.001, mode = 'max',
                               monitor='val_acc', patience=10)
callbacks = [early_stopping,NeptuneMonitor()]


# parameters
PARAMS = {'epoch_nr': 5,
          'batch_size': 256,

          'lr': 0.001,
          'dropout': 0.2}

for k,v in PARAMS.items():
    neptune.log_metric(k,v)


model = Sequential()

# input_dim=num_words and output_dim=300
model.add(Embedding(num_words,300,
                    input_length=maxlen))

model.add(GRU(units=128,
               dropout=PARAMS['dropout'],
               recurrent_dropout=PARAMS['dropout'],
               return_sequences=True))

model.add(GRU(64,
               dropout=PARAMS['dropout'],
               recurrent_dropout=PARAMS['dropout'],
               return_sequences=False))

model.add(Dense(100,activation='relu'))

model.add(Dropout(PARAMS['dropout']))

model.add(Dense(1,activation='sigmoid'))

# for multiclass: dense=(num_classes,softmax) and loss=sparse_xentropy

model.compile(loss='binary_crossentropy',
              optimizer=Adam(lr=PARAMS['lr']),
              metrics=['accuracy'])

model.summary()

WARNING:tensorflow:Layer gru will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU
WARNING:tensorflow:Layer gru_1 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_1 (Embedding)      (None, 35, 300)           4329600   
_________________________________________________________________
gru (GRU)                    (None, 35, 128)           165120    
_________________________________________________________________
gru_1 (GRU)                  (None, 64)                37248     
_________________________________________________________________
dense (Dense)                (None, 100)               6500      
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
=================================================================
Total params: 4,538,569
Trainable params: 4,538,569
Non-trainable params: 0
_________________________________________________________________


%%time

history = model.fit(X_train, y_train,
                    validation_data=(X_valid, y_valid),
                    epochs=PARAMS['epoch_nr'],
                    batch_size=PARAMS['batch_size'],
                    verbose=1,
                    callbacks=callbacks
                    )

Epoch 1/5
25/25 [==============================] - ETA: 0s - loss: 0.5079 - accuracy: 0.7505WARNING:tensorflow:Early stopping conditioned on metric `val_acc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
25/25 [==============================] - 8s 339ms/step - loss: 0.5079 - accuracy: 0.7505 - val_loss: 0.2851 - val_accuracy: 0.8801
Epoch 2/5
25/25 [==============================] - ETA: 0s - loss: 0.2159 - accuracy: 0.9078WARNING:tensorflow:Early stopping conditioned on metric `val_acc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
25/25 [==============================] - 7s 284ms/step - loss: 0.2159 - accuracy: 0.9078 - val_loss: 0.2486 - val_accuracy: 0.8939
Epoch 3/5
25/25 [==============================] - ETA: 0s - loss: 0.1165 - accuracy: 0.9569WARNING:tensorflow:Early stopping conditioned on metric `val_acc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
25/25 [==============================] - 7s 280ms/step - loss: 0.1165 - accuracy: 0.9569 - val_loss: 0.3202 - val_accuracy: 0.8857
Epoch 4/5
25/25 [==============================] - ETA: 0s - loss: 0.0600 - accuracy: 0.9809WARNING:tensorflow:Early stopping conditioned on metric `val_acc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
25/25 [==============================] - 7s 279ms/step - loss: 0.0600 - accuracy: 0.9809 - val_loss: 0.4085 - val_accuracy: 0.8775
Epoch 5/5
25/25 [==============================] - ETA: 0s - loss: 0.0318 - accuracy: 0.9910WARNING:tensorflow:Early stopping conditioned on metric `val_acc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
25/25 [==============================] - 7s 275ms/step - loss: 0.0318 - accuracy: 0.9910 - val_loss: 0.5065 - val_accuracy: 0.8605
CPU times: user 56.9 s, sys: 6.2 s, total: 1min 3s
Wall time: 44 s


valid_preds = model.predict_classes(X_valid)
valid_preds = valid_preds.squeeze().tolist()

text_model_name = "gru"
params = str(PARAMS)
model_name = ""
desc = ""

yvd = y_valid
vd_preds = valid_preds

acc = metrics.accuracy_score(yvd,vd_preds)
pre = metrics.precision_score(yvd,vd_preds)
rec = metrics.recall_score(yvd,vd_preds)
f1 = metrics.f1_score(yvd,vd_preds,average='weighted')

row = [text_model_name, params, model_name,desc]
row = row + [acc, pre, rec, f1]

df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates(subset=['Text Model', 'Params', 'Model', 'Description'])

log_table('df_eval', df_eval)
dic_results = {'acc':acc,
               'precision':pre,
               'recall':rec,
               'f1': f1}

for k,v in dic_results.items():
    print('valid_'+k, v)
    neptune.log_metric('valid_'+k, v)

display(df_eval)

WARNING:tensorflow:From <ipython-input-40-6565d0e9ffa3>:1: Sequential.predict_classes (from tensorflow.python.keras.engine.sequential) is deprecated and will be removed after 2021-01-01.
Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
valid_acc 0.860479797979798
valid_precision 0.7421052631578947
valid_recall 0.6962962962962963
valid_f1 0.8589899903801318


%%time

model = Sequential()

model.add(Embedding(num_words,300,input_length=maxlen))

model.add(SpatialDropout1D(0.25))

model.add(Bidirectional(GRU(128,dropout=0.4,return_sequences = True)))

model.add(Bidirectional(GRU(64,dropout=0.5,return_sequences = False)))

model.add(Dense(1,activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer=Adam(lr=PARAMS['lr']),
              metrics=['accuracy'])

print(model.summary())

#=============== fitting the model ===================
history = model.fit(X_train, y_train,
                    validation_data=(X_valid, y_valid),
                    epochs=PARAMS['epoch_nr'],
                    batch_size=PARAMS['batch_size'],
                    verbose=1,
                    callbacks=callbacks
                    )

#=============== model evaluation =====================
valid_preds = model.predict_classes(X_valid)
valid_preds = valid_preds.squeeze().tolist()

text_model_name = "bgru"
params = str(PARAMS)
model_name = ""
desc = ""

yvd = y_valid
vd_preds = valid_preds

acc = metrics.accuracy_score(yvd,vd_preds)
pre = metrics.precision_score(yvd,vd_preds)
rec = metrics.recall_score(yvd,vd_preds)
f1 = metrics.f1_score(yvd,vd_preds,average='weighted')

row = [text_model_name, params, model_name,desc]
row = row + [acc, pre, rec, f1]

df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates(subset=['Text Model', 'Params', 'Model', 'Description'])

log_table('df_eval', df_eval)
dic_results = {'acc':acc,
               'precision':pre,
               'recall':rec,
               'f1': f1}

for k,v in dic_results.items():
    print('valid_'+k, v)
    neptune.log_metric('valid_'+k, v)

display(df_eval)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_4 (Embedding)      (None, 35, 300)           4329600   
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 35, 300)           0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 35, 256)           330240    
_________________________________________________________________
bidirectional_5 (Bidirection (None, 128)               123648    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 129       
=================================================================
Total params: 4,783,617
Trainable params: 4,783,617
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/5
25/25 [==============================] - ETA: 0s - loss: 0.4997 - accuracy: 0.7536WARNING:tensorflow:Early stopping conditioned on metric `val_acc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
25/25 [==============================] - 3s 116ms/step - loss: 0.4997 - accuracy: 0.7536 - val_loss: 0.2933 - val_accuracy: 0.8706
Epoch 2/5
25/25 [==============================] - ETA: 0s - loss: 0.2434 - accuracy: 0.8984WARNING:tensorflow:Early stopping conditioned on metric `val_acc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
25/25 [==============================] - 2s 73ms/step - loss: 0.2434 - accuracy: 0.8984 - val_loss: 0.2538 - val_accuracy: 0.8984
Epoch 3/5
25/25 [==============================] - ETA: 0s - loss: 0.1380 - accuracy: 0.9463WARNING:tensorflow:Early stopping conditioned on metric `val_acc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
25/25 [==============================] - 2s 74ms/step - loss: 0.1380 - accuracy: 0.9463 - val_loss: 0.2850 - val_accuracy: 0.8996
Epoch 4/5
25/25 [==============================] - ETA: 0s - loss: 0.0774 - accuracy: 0.9725WARNING:tensorflow:Early stopping conditioned on metric `val_acc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
25/25 [==============================] - 2s 74ms/step - loss: 0.0774 - accuracy: 0.9725 - val_loss: 0.3532 - val_accuracy: 0.8813
Epoch 5/5
25/25 [==============================] - ETA: 0s - loss: 0.0458 - accuracy: 0.9850WARNING:tensorflow:Early stopping conditioned on metric `val_acc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
25/25 [==============================] - 2s 72ms/step - loss: 0.0458 - accuracy: 0.9850 - val_loss: 0.4284 - val_accuracy: 0.8687
valid_acc 0.8686868686868687
valid_precision 0.7545219638242894
valid_recall 0.7209876543209877
valid_f1 0.8676920722375269

CPU times: user 19.8 s, sys: 825 ms, total: 20.6 s
Wall time: 18.2 s


%%time

model = Sequential()
model.add(Embedding(num_words,300,input_length=maxlen))

model.add(Dropout(0.2))
model.add(Conv1D(128,kernel_size=3,padding='same',activation='relu',strides=1))
model.add(GlobalMaxPooling1D())
model.add(Dense(256,activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer=Adam(lr=PARAMS['lr']),
              metrics=['accuracy'])
print(model.summary())

#=============== fitting the model ===================
history = model.fit(X_train, y_train,
                    validation_data=(X_valid, y_valid),
                    epochs=PARAMS['epoch_nr'],
                    batch_size=PARAMS['batch_size'],
                    verbose=1,
                    callbacks=callbacks
                    )

#=============== model evaluation =====================
valid_preds = model.predict_classes(X_valid)
valid_preds = valid_preds.squeeze().tolist()

text_model_name = "cnn"
params = str(PARAMS)
model_name = ""
desc = ""

yvd = y_valid
vd_preds = valid_preds

acc = metrics.accuracy_score(yvd,vd_preds)
pre = metrics.precision_score(yvd,vd_preds)
rec = metrics.recall_score(yvd,vd_preds)
f1 = metrics.f1_score(yvd,vd_preds,average='weighted')

row = [text_model_name, params, model_name,desc]
row = row + [acc, pre, rec, f1]

df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates(subset=['Text Model', 'Params', 'Model', 'Description'])

log_table('df_eval', df_eval)
dic_results = {'acc':acc,
               'precision':pre,
               'recall':rec,
               'f1': f1}

for k,v in dic_results.items():
    print('valid_'+k, v)
    neptune.log_metric('valid_'+k, v)


model_str = """

model = Sequential()
model.add(Embedding(num_words,300,input_length=maxlen))
model.add(Dropout(0.2))
model.add(Conv1D(128,kernel_size=3,padding='same',activation='relu',strides=1))
model.add(GlobalMaxPooling1D())
model.add(Dense(256,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer=Adam(lr=PARAMS['lr']),
              metrics=['accuracy'])
"""

neptune.log_text('cnn_model', model_str)

display(df_eval)


%%time

model = Sequential()
model.add(Embedding(num_words,300,input_length=maxlen))

model.add(Conv1D(128,kernel_size=3,padding='same',activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.25))
model.add(GRU(256,return_sequences=True))
model.add(Dropout(0.3))
model.add(Flatten())
model.add(Dense(256,activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer=Adam(lr=PARAMS['lr']),
              metrics=['accuracy'])
print(model.summary())

#=============== fitting the model ===================
history = model.fit(X_train, y_train,
                    validation_data=(X_valid, y_valid),
                    epochs=PARAMS['epoch_nr'],
                    batch_size=PARAMS['batch_size'],
                    verbose=1,
                    callbacks=callbacks
                    )

#=============== model evaluation =====================
valid_preds = model.predict_classes(X_valid)
valid_preds = valid_preds.squeeze().tolist()

text_model_name = "cnn+gru"
params = str(PARAMS)
model_name = ""
desc = ""

yvd = y_valid
vd_preds = valid_preds

acc = metrics.accuracy_score(yvd,vd_preds)
pre = metrics.precision_score(yvd,vd_preds)
rec = metrics.recall_score(yvd,vd_preds)
f1 = metrics.f1_score(yvd,vd_preds,average='weighted')

row = [text_model_name, params, model_name,desc]
row = row + [acc, pre, rec, f1]

df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates(subset=['Text Model', 'Params', 'Model', 'Description'])

log_table('df_eval', df_eval)
dic_results = {'acc':acc,
               'precision':pre,
               'recall':rec,
               'f1': f1}

for k,v in dic_results.items():
    print('valid_'+k, v)
    neptune.log_metric('valid_'+k, v)


model_str = """


"""

neptune.log_text('deeplr_model', model_str)

display(df_eval)

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_6 (Embedding)      (None, 35, 300)           4329600   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 35, 128)           115328    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 17, 128)           0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 17, 128)           0         
_________________________________________________________________
gru_8 (GRU)                  (None, 17, 256)           296448    
_________________________________________________________________
dropout_4 (Dropout)          (None, 17, 256)           0         
_________________________________________________________________
flatten (Flatten)            (None, 4352)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 256)               1114368   
_________________________________________________________________
dropout_5 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 257       
=================================================================
Total params: 5,856,001
Trainable params: 5,856,001
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/5
25/25 [==============================] - ETA: 0s - loss: 0.4687 - accuracy: 0.7726WARNING:tensorflow:Early stopping conditioned on metric `val_acc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
25/25 [==============================] - 2s 75ms/step - loss: 0.4687 - accuracy: 0.7726 - val_loss: 0.2877 - val_accuracy: 0.8769
Epoch 2/5
25/25 [==============================] - ETA: 0s - loss: 0.1936 - accuracy: 0.9195WARNING:tensorflow:Early stopping conditioned on metric `val_acc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
25/25 [==============================] - 2s 62ms/step - loss: 0.1936 - accuracy: 0.9195 - val_loss: 0.2872 - val_accuracy: 0.8864
Epoch 3/5
25/25 [==============================] - ETA: 0s - loss: 0.0832 - accuracy: 0.9705WARNING:tensorflow:Early stopping conditioned on metric `val_acc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
25/25 [==============================] - 2s 63ms/step - loss: 0.0832 - accuracy: 0.9705 - val_loss: 0.3996 - val_accuracy: 0.8611
Epoch 4/5
25/25 [==============================] - ETA: 0s - loss: 0.0354 - accuracy: 0.9893WARNING:tensorflow:Early stopping conditioned on metric `val_acc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
25/25 [==============================] - 2s 61ms/step - loss: 0.0354 - accuracy: 0.9893 - val_loss: 0.5082 - val_accuracy: 0.8674
Epoch 5/5
25/25 [==============================] - ETA: 0s - loss: 0.0139 - accuracy: 0.9946WARNING:tensorflow:Early stopping conditioned on metric `val_acc` which is not available. Available metrics are: loss,accuracy,val_loss,val_accuracy
25/25 [==============================] - 2s 63ms/step - loss: 0.0139 - accuracy: 0.9946 - val_loss: 0.6212 - val_accuracy: 0.8687
valid_acc 0.8686868686868687
valid_precision 0.738498789346247
valid_recall 0.7530864197530864
valid_f1 0.8691054749755264

CPU times: user 13.1 s, sys: 475 ms, total: 13.6 s
Wall time: 11.3 s

	index	id	tweet	tweet_lst_clean	tweet_clean	hashtags_lst	hashtags	total_length	num_words	num_sent	num_unique_words	num_words_title	num_uppercase	num_exclamation_marks	num_punctuation	avg_word_len	avg_uppercase	avg_unique	tweet_lst_clean_emoji	tweet_clean_emoji
0	0	1	#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone	[fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone]	fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone	['#fingerprint', '#Pregnancy', '#android', '#apps', '#beautiful', '#cute', '#health', '#igers', '#iphoneonly', '#iphonesia', '#iphone']	#fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone	128	13	1	13	2	5	0	2	8.923077	0.039062	1.0	[fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone]	fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone
1	1	2	Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/	[finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias]	finaly transparant silicon case thanks uncle yay sony xperia sonyexperias	['#yay', '#Sony', '#Xperia', '#S', '#sonyexperias…']	#yay #Sony #Xperia #S #sonyexperias…	131	17	1	17	5	12	0	3	6.764706	0.091603	1.0	[finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri]	finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri
7918	7918	7919	Finally got my #smart #pocket #wifi stay connected anytime,anywhere! #ipad and #samsung #s3 #gadget # http://instagr.am/p/U-53G_vJU8/	[finaly, got, smart, pocket, wifi, stay, conected, anytimeanywhere, ipad, samsung, gadget]	finaly got smart pocket wifi stay conected anytimeanywhere ipad samsung gadget	['#smart', '#pocket', '#wifi', '#ipad', '#samsung', '#s3', '#gadget', '#']	#smart #pocket #wifi #ipad #samsung #s3 #gadget #	133	16	1	16	1	5	1	3	7.375000	0.037594	1.0	[finaly, got, smart, pocket, wi, fi, stay, conected, anytime, anywhere, ipad, samsung, gadget]	finaly got smart pocket wi fi stay conected anytime anywhere ipad samsung gadget
7919	7919	7920	Apple Barcelona!!! #Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew… http://instagram.com/p/wBApVzpCl3/	[aple, barcelona, aple, store, bcn, barcelona, travel, iphone, selfie, fly, fun, cabincrew]	aple barcelona aple store bcn barcelona travel iphone selfie fly fun cabincrew	['#Apple', '#Store', '#BCN', '#Barcelona', '#travel', '#iphone', '#selfie', '#fly', '#fun', '#cabincrew…']	#Apple #Store #BCN #Barcelona #travel #iphone #selfie #fly #fun #cabincrew…	129	13	1	13	5	12	3	2	9.000000	0.093023	1.0	[aple, barcelona, aple, store, n, barcelona, travel, iphone, self, ie, fly, fun, cabin, crew]	aple barcelona aple store n barcelona travel iphone self ie fly fun cabin crew

	index	id	label	tweet	tweet_lst_clean	tweet_clean	hashtags_lst	hashtags	total_length	num_words	num_sent	num_unique_words	num_words_title	num_uppercase	num_punctuation	avg_word_len	avg_uppercase	avg_unique	tweet_lst_clean_emoji	tweet_clean_emoji
7920	0	7921	NaN	I hate the new #iphone upgrade. Won't let me download apps. #ugh #apple sucks	[hate, new, iphone, upgrade, wil, let, download, aps, ugh, aple, suck]	hate new iphone upgrade wil let download aps ugh aple suck	['#iphone', '#ugh', '#apple']	#iphone #ugh #apple	77	14	1	14	1	2	2	4.571429	0.025974	1.000000	[hate, new, iphone, upgrade, wil, let, download, aps, ugh, aple, suck]	hate new iphone upgrade wil let download aps ugh aple suck
7921	1	7922	NaN	currently shitting my fucking pants. #apple #iMac #cashmoney #raddest #swagswagswag http://instagr.am/p/UUIS0bIBZo/	[curently, shiting, fucking, pant, aple, imac, cashmoney, radest, swagswag]	curently shiting fucking pant aple imac cashmoney radest swagswag	['#apple', '#iMac', '#cashmoney', '#raddest', '#swagswagswag']	#apple #iMac #cashmoney #raddest #swagswagswag	115	11	1	11	0	8	3	9.545455	0.069565	1.000000	[curently, shiting, fucking, pant, aple, imac, cash, money, rad, de, st, swag, wag, wag]	curently shiting fucking pant aple imac cash money rad de st swag wag wag
9871	1951	9872	NaN	@codeofinterest as i said #Adobe big time we may well as include #apple to	[codeofinterest, said, adobe, big, time, may, wel, include, aple]	codeofinterest said adobe big time may wel include aple	['#Adobe', '#apple']	#Adobe #apple	74	14	1	13	1	1	0	4.357143	0.013514	0.928571	[code, interest, said, adobe, big, time, may, wel, include, aple]	code interest said adobe big time may wel include aple
9872	1952	9873	NaN	Finally I got it .. thanx my father .. #Samsung #galaxy #s3 #gift #father #phone #new http://instagr.am/p/NoxkiPE	[finaly, got, thanx, father, samsung, galaxy, gift, father, phone, new]	finaly got thanx father samsung galaxy gift father phone new	['#Samsung', '#galaxy', '#s3', '#gift', '#father', '#phone', '#new']	#Samsung #galaxy #s3 #gift #father #phone #new	113	17	1	16	3	6	6	5.705882	0.053097	0.941176	[finaly, got, x, father, samsung, galaxy, gift, father, phone, new]	finaly got x father samsung galaxy gift father phone new

	index	id	label	tweet	tweet_lst_clean	tweet_clean	hashtags_lst	hashtags	total_length	num_words	num_sent	num_unique_words	num_words_title	num_uppercase	num_exclamation_marks	num_question_marks	num_punctuation	num_symbols	num_digits	avg_word_len	avg_uppercase	avg_unique	tweet_lst_clean_emoji	tweet_clean_emoji
1257	1257	1258	0.0	new iphone case. #guerlain #new #iphone #case #iphone4s #4s #pink #lapetiterobenoir it… http://instagram.com/p/YTHkSuNtTE/	[new, iphone, case, guerlain, new, iphone, case, iphones, pink, lapetiterobenoir]	new iphone case guerlain new iphone case iphones pink lapetiterobenoir	['#guerlain', '#new', '#iphone', '#case', '#iphone4s', '#4s', '#pink', '#lapetiterobenoir']	#guerlain #new #iphone #case #iphone4s #4s #pink #lapetiterobenoir	122	13	1	13	0	7	0	0	3	0	0	8.461538	0.057377	1.000000	[new, iphone, case, gue, rla, new, iphone, case, iphone, pink, la, petite, robe, noir]	new iphone case gue rla new iphone case iphone pink la petite robe noir
1400	1400	1401	0.0	i really want an iPad for the sole reason that I just want one apple	[realy, want, ipad, sole, reason, want, one, aple]	realy want ipad sole reason want one aple	[]	NaN	68	15	1	14	1	2	0	0	0	0	0	3.600000	0.029412	0.933333	[realy, want, ipad, sole, reason, want, one, aple]	realy want ipad sole reason want one aple

Descriptions¶

Google Colab¶

Load the libraries¶

Load the data¶

Train valid split¶

Text Data Preparation¶

Unique words and sentence max length¶

Tokenization¶

Neptune¶

Modelling¶

Modelling: keras GRU¶

Visualize the model¶

Modelling: Bidirectional GRU¶

Modelling: CNN¶

Modelling: CNN + GRU¶

	Text Model	Params	Model	Description	Accuracy	Precision	Recall	F1
0	gru	{'epoch_nr': 5, 'batch_size': 256, 'lr': 0.001, 'dropout': 0.2}			0.860480	0.742105	0.696296	0.858990
1	bgru	{'epoch_nr': 5, 'batch_size': 256, 'lr': 0.001, 'dropout': 0.2}			0.868687	0.754522	0.720988	0.867692