%%capture
# %%capture suppresses this cell's output (the pip install logs) in the notebook
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install bert-for-tf2
    !pip install sentencepiece

    ## print
    print('Environment: Google Colaboratory.')

# NOTE: If we update modules in gcolab, we need to restart runtime.
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf
import tensorflow_hub as hub
import keras
from tensorflow.keras import layers
import bert
from pprint import pprint
pprint([(x.__name__,x.__version__) for x in [tf, hub, keras,bert]])
[('tensorflow', '2.3.0'), ('tensorflow_hub', '0.9.0'), ('keras', '2.4.3'), ('bert', '0.14.6')]
import numpy as np
import pandas as pd
from pprint import pprint
pd.set_option('max_colwidth',200)
pd.set_option('max_columns',200)
SEED = 100
import sys
import re
from tqdm import tqdm
tqdm.pandas()
pprint([(x.__name__,x.__version__) for x in [np,pd]])
[('numpy', '1.18.5'), ('pandas', '1.0.5')]
df = pd.read_csv('https://github.com/bhishanpdl/Datasets/blob/master/janatahack/sentiment_analysis/raw/train.csv?raw=true')
df = df.iloc[:1000] # to prevent OOM
print(f"train : {df.shape}")
display(df.head(2).append(df.tail(2)))
target = 'label'
maincol = 'tweet'
train : (1000, 3)
|     | id   | label | tweet |
|-----|------|-------|-------|
| 0   | 1    | 0     | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone |
| 1   | 2    | 0     | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ |
| 998 | 999  | 1     | Idk if I should download Dead Nation and inFAMOUS and never play either one, or just not download anything #Sony #PSN #Playstation |
| 999 | 1000 | 1     | I've gone thru four iPhone chargers in 3 days #boss #badass |
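Before any modeling, the balance of the binary target is worth a quick look (a hedged sanity check, not part of the original output; the exact proportions depend on the 1000-row slice):

# rough class-balance check for the binary target
print(df[target].value_counts(normalize=True))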
maincol = 'tweet'
target = 'label'
mc = maincol + '_clean'
tmc = 'tokenized_' + mc
'tokenized_tweet_clean'
import re
def process_text(text):
    out = re.sub(r'<[^>]+>', '', text)         # remove html tags
    out = re.sub('[^a-zA-Z]', ' ', out)        # keep only alpha
    out = re.sub(r"\s+[a-zA-Z]\s+", ' ', out)  # remove single letters
    out = re.sub(r'\s+', ' ', out)             # remove multiple spaces
    return out
s = pd.Series(df[maincol][1])
s.progress_apply(process_text)
100%|██████████| 1/1 [00:00<00:00, 600.99it/s]
0    Finally transparant silicon case Thanks to my uncle yay Sony Xperia sonyexperias http instagram com YGEt JC JM
dtype: object
df[mc] = df[maincol].progress_apply(process_text)
100%|██████████| 1000/1000 [00:00<00:00, 26317.70it/s]
df.head(2)
|   | id | label | tweet | tweet_clean |
|---|----|-------|-------|-------------|
| 0 | 1  | 0     | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | fingerprint Pregnancy Test https goo gl MfQV android apps beautiful cute health igers iphoneonly iphonesia iphone |
| 1 | 2  | 0     | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ | Finally transparant silicon case Thanks to my uncle yay Sony Xperia sonyexperias http instagram com YGEt JC JM |
X = df[mc].to_list()
labels = df[target].to_list()
%%time
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)
CPU times: user 9.12 s, sys: 1.71 s, total: 10.8 s
Wall time: 17.4 s
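The tokenizer is built from the vocabulary file bundled with the hub module. A hedged sanity check (the asset path is environment-specific, and 30522 is the documented vocabulary size of bert_en_uncased):

print(vocabulary_file)       # local path of the vocab.txt asset downloaded with the hub module
print(len(tokenizer.vocab))  # expected to be 30522 for bert_en_uncased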
tokenizer.tokenize("nlp is fun.")
['nl', '##p', 'is', 'fun', '.']
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("nlp is fun."))
[17953, 2361, 2003, 4569, 1012]
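The id mapping can also be reversed; the underlying BERT tokenizer exposes convert_ids_to_tokens (a minimal sketch, not part of the original run):

tokenizer.convert_ids_to_tokens([17953, 2361, 2003, 4569, 1012])
# expected: ['nl', '##p', 'is', 'fun', '.']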
def encode_sentence(text):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
df[tmc] = df[mc].progress_apply(encode_sentence)
100%|██████████| 1000/1000 [00:00<00:00, 3229.76it/s]
# shuffle the data
df = df.sample(frac=1,random_state=SEED)
# character length of the cleaned text (a rough proxy for token count)
df['length'] = df[mc].progress_apply(len)
100%|██████████| 1000/1000 [00:00<00:00, 123459.92it/s]
df.head(2)
|     | id  | label | tweet | tweet_clean | tokenized_tweet_clean | length |
|-----|-----|-------|-------|-------------|-----------------------|--------|
| 249 | 250 | 0     | look what mommy got 4 me :-p #igers #ipad #apple #mom #gifts #instahub #instadaily #blessed #fo http://instagr.am/p/Uor2Uoh6C0/ | look what mommy got me igers ipad apple mom gifts instahub instadaily blessed fo http instagr am Uor Uoh | [2298, 2054, 20565, 2288, 2033, 1045, 15776, 25249, 6207, 3566, 9604, 16021, 2696, 6979, 2497, 16021, 17713, 12502, 2100, 10190, 1042, 2080, 8299, 16021, 15900, 2099, 2572, 1057, 2953, 1057, 11631] | 105 |
| 353 | 354 | 0     | Top Photo App Available now >> https://itunes.apple.com/us/app/love360/id809353957?mt=8 … #photography #usa #woman poem #valentine #iphone #kärlek 872 | Top Photo App Available now https itunes apple com us app love id mt photography usa woman poem valentine iphone rlek | [2327, 6302, 10439, 2800, 2085, 16770, 11943, 6207, 4012, 2149, 10439, 2293, 8909, 11047, 5855, 3915, 2450, 5961, 10113, 18059, 1054, 2571, 2243] | 118 |
# sort data by length
df = df.sort_values('length')
df.head(2)
|     | id  | label | tweet | tweet_clean | tokenized_tweet_clean | length |
|-----|-----|-------|-------|-------------|-----------------------|--------|
| 282 | 283 | 1     | 1 hour usage and phones down 13% ... Thanks #iPhone #Apple | hour usage and phones down Thanks iPhone Apple | [3178, 8192, 1998, 11640, 2091, 4283, 18059, 6207] | 47 |
| 575 | 576 | 0     | An awesome phablet ... note 5 duos. #Note7 note 5 ... #Samsung ... | An awesome phablet note duos Note note Samsung | [2019, 12476, 6887, 3085, 2102, 3602, 6829, 2015, 3602, 3602, 19102] | 47 |
# use positional indexing (.iloc) so the length-sorted order is preserved
sorted_data_labels = [(df[tmc].iloc[i], df[target].iloc[i]) for i in range(len(df))]
sorted_data_labels[0][0][:5]
[4344, 16550, 10032, 3231, 16770]
# tf2 compatible dataset
processed_dataset = tf.data.Dataset.from_generator(
    lambda: sorted_data_labels, output_types=(tf.int32, tf.int32))
# pad each batch to its own longest sequence (more memory-efficient than padding the whole dataset)
BATCH_SIZE = 32
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))
# see the first batch
# next(iter(batched_dataset))
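To confirm that the padding is per batch, one batch can be pulled and its shape inspected (a sketch; the second dimension is the length of the longest sequence in that particular batch):

sample_texts, sample_labels = next(iter(batched_dataset))
print(sample_texts.shape, sample_labels.shape)  # e.g. (32, longest_seq_in_batch) and (32,)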
TOTAL_BATCHES = int(np.ceil(len(sorted_data_labels) / BATCH_SIZE))
TEST_BATCHES = TOTAL_BATCHES // 10
# shuffle() returns a new dataset, so assign the result; fix the shuffle order so that
# take()/skip() below give a consistent, non-overlapping train/test split
batched_dataset = batched_dataset.shuffle(TOTAL_BATCHES, seed=SEED,
                                          reshuffle_each_iteration=False)
test_data = batched_dataset.take(TEST_BATCHES)
train_data = batched_dataset.skip(TEST_BATCHES)
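Because the dataset is built from a Python generator, tf.data does not know its cardinality up front; a quick (hedged) way to verify the split sizes is to iterate once:

print(sum(1 for _ in test_data), 'test batches')
print(sum(1 for _ in train_data), 'train batches')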
class TEXT_MODEL(tf.keras.Model):
    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super(TEXT_MODEL, self).__init__(name=name)

        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")

    def call(self, inputs, training):
        l = self.embedding(inputs)
        l_1 = self.cnn_layer1(l)
        l_1 = self.pool(l_1)
        l_2 = self.cnn_layer2(l)
        l_2 = self.pool(l_2)
        l_3 = self.cnn_layer3(l)
        l_3 = self.pool(l_3)

        concatenated = tf.concat([l_1, l_2, l_3], axis=-1)  # (batch_size, 3 * cnn_filters)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training)
        model_output = self.last_dense(concatenated)
        return model_output
# Hyperparameters
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 300
CNN_FILTERS = 100
DNN_UNITS = 256
OUTPUT_CLASSES = 2
DROPOUT_RATE = 0.1
NB_EPOCHS = 10
text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
                        embedding_dimensions=EMB_DIM,
                        cnn_filters=CNN_FILTERS,
                        dnn_units=DNN_UNITS,
                        model_output_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE)

if OUTPUT_CLASSES == 2:
    text_model.compile(loss="binary_crossentropy",
                       optimizer="adam",
                       metrics=["accuracy"])
else:
    text_model.compile(loss="sparse_categorical_crossentropy",
                       optimizer="adam",
                       metrics=["sparse_categorical_accuracy"])
%%time
# Note: use a GPU runtime for faster training
text_model.fit(train_data, epochs=NB_EPOCHS)
Epoch 1/10
29/29 [==============================] - 3s 87ms/step - loss: 0.4916 - accuracy: 0.7400
Epoch 2/10
29/29 [==============================] - 2s 84ms/step - loss: 0.2866 - accuracy: 0.9082
Epoch 3/10
29/29 [==============================] - 2s 86ms/step - loss: 0.1180 - accuracy: 0.9723
Epoch 4/10
29/29 [==============================] - 2s 86ms/step - loss: 0.0163 - accuracy: 0.9989
Epoch 5/10
29/29 [==============================] - 3s 86ms/step - loss: 0.0032 - accuracy: 1.0000
Epoch 6/10
29/29 [==============================] - 2s 86ms/step - loss: 0.0011 - accuracy: 1.0000
Epoch 7/10
29/29 [==============================] - 2s 85ms/step - loss: 6.4176e-04 - accuracy: 1.0000
Epoch 8/10
29/29 [==============================] - 2s 85ms/step - loss: 4.2553e-04 - accuracy: 1.0000
Epoch 9/10
29/29 [==============================] - 3s 86ms/step - loss: 3.1922e-04 - accuracy: 1.0000
Epoch 10/10
29/29 [==============================] - 2s 86ms/step - loss: 2.3658e-04 - accuracy: 1.0000
CPU times: user 45.3 s, sys: 989 ms, total: 46.3 s
Wall time: 26.7 s
<tensorflow.python.keras.callbacks.History at 0x7fa5f6dba4a8>
results = text_model.evaluate(test_data)
print(results)
3/3 [==============================] - 0s 7ms/step - loss: 0.4582 - accuracy: 0.8854
[0.458150714635849, 0.8854166865348816]
y_test = []
for element in test_data.as_numpy_iterator():
    y_test += element[1].tolist()
y_test[:5]
[0, 0, 0, 0, 1]
y_pred = text_model.predict(test_data)
y_pred = (y_pred > 0.5).astype('int').ravel().tolist()
y_pred[:5]
[0, 0, 0, 0, 1]
from sklearn import metrics
df_eval = pd.DataFrame({
    'Text Model': [],
    'Params': [],
    'Model': [],
    'Description': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1': [],
})
text_model_name = "bert"
params = "bert_en_uncased_L-12_H-768_A-12"
model_name = "tf"
desc = "3 layers"
yvd = y_test
vd_preds = y_pred
acc = metrics.accuracy_score(yvd, vd_preds)
pre = metrics.precision_score(yvd, vd_preds)
rec = metrics.recall_score(yvd, vd_preds)
f1 = metrics.f1_score(yvd, vd_preds, average='weighted')  # weighted F1, unlike the binary precision/recall above

row = [text_model_name, params, model_name, desc]
row = row + [acc, pre, rec, f1]
df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates(subset=['Text Model', 'Params', 'Model', 'Description'])
df_eval
|   | Text Model | Params | Model | Description | Accuracy | Precision | Recall | F1 |
|---|------------|--------|-------|-------------|----------|-----------|--------|----|
| 0 | bert | bert_en_uncased_L-12_H-768_A-12 | tf | 3 layers | 0.885417 | 0.807692 | 0.777778 | 0.884748 |
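For a closer look at the errors behind these aggregate numbers, a confusion matrix and per-class report can be printed from the same predictions (a sketch reusing y_test and y_pred from above):

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))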